예제 #1
0
def sanity_check_primers_for_chimera(primer_filename, p_filename):
    """
    Check that primers are in order of F0, R0, F1, R1, ..... and write the
    primer set used for chimera detection to p_filename.

    Records are read from primer_filename with smart_fa_fq_reader; each record
    is expected to expose .id and .seq, where .seq supports
    reverse_complement().  For every complete F<i>/R<i> pair four records are
    written to p_filename: F<i>, R<i>, F<i>_revcomp and R<i>_revcomp.

    When the R primer and the reverse complement of the F primer contain one
    another, 'AAAA' is put in front of the R primer sequence and 'TTTT' is
    appended to its reverse complement so the two can be told apart.

    :param primer_filename: input primer file
    :param p_filename: output primer file (overwritten)
    :return: list of the pair indices that were written, i.e. [0, 1, ..., n-1]

    Calls sys.exit(-1) on the first record whose ID is not the expected one.
    A trailing F record without an R partner is not written; a warning is
    printed to stderr for it.
    """
    pair_index = 0
    forward = None  # F record still waiting for its R partner
    # 'with' guarantees the output is closed even when sys.exit() fires below
    with open(p_filename, 'w') as out:
        for rec in smart_fa_fq_reader(primer_filename):
            expected_id = ('F' if forward is None else 'R') + str(pair_index)
            if rec.id != expected_id:
                sys.stderr.write("expected id {0} but got {1}. Bad ID. Abort!\n".format(expected_id, rec.id))
                sys.exit(-1)
            if forward is None:
                forward = rec
                continue

            # rec is the R partner of `forward`: emit the whole pair
            forward_rc = forward.seq.reverse_complement()
            reverse_rc = rec.seq.reverse_complement()
            rc_text, r_text = str(forward_rc), str(rec.seq)
            if rc_text in r_text or r_text in rc_text:
                # NOTE: the padding is prepended to the R primer as written
                # (and appended to its reverse complement).
                sys.stderr.write("F{0}/R{0} primer pair are reverse-complementarily identical. Adding 'AAAA' in 3' to distinguish\n".format(pair_index))
                pad_front, pad_back = 'AAAA', 'TTTT'
            else:
                pad_front, pad_back = '', ''
            out.write(">{0}\n{1}\n>{2}\n{3}{4}\n".format(forward.id, forward.seq, rec.id, pad_front, rec.seq))
            out.write(">{0}_revcomp\n{1}\n>{2}_revcomp\n{3}{4}\n".format(forward.id, forward_rc, rec.id, reverse_rc, pad_back))
            forward = None
            pair_index += 1

    if forward is not None:
        # Previously this record was dropped without any notice.
        sys.stderr.write("WARNING: primer {0} has no matching R{1} record and was ignored.\n".format(forward.id, pair_index))
    return list(range(pair_index))
예제 #2
0
def remove_chimeras(fasta_filename, suspicious_hits, max_adjacent_hit_distance, output_fq=False):
    """
    Split the reads of fasta_filename into chimeric and non-chimeric reads.

    Output written to <fasta_filename>.non_chimera.fa/fq and <fasta_filename>.is_chimera.fa/fq

    :param fasta_filename: input reads, iterated with smart_fa_fq_reader
    :param suspicious_hits: dict of read ID --> list of hit records; each
        record must expose numeric .sStart and .sEnd attributes
    :param max_adjacent_hit_distance: two non-overlapping hits closer than
        this are considered adjacent
    :param output_fq: write FASTQ (.fq) instead of FASTA (.fa)

    The per-read lists inside suspicious_hits are sorted in place by sStart.
    A summary line with both counts is printed to stderr.
    """
    def is_chimera(dom_records):
        """
        Find at least two records such that they are adjacent but NOT entirely overlapping
        """
        if len(dom_records) <= 1:
            return False
        # in-place sort (kept from the original): the caller's list is reordered
        dom_records.sort(key=lambda x: x.sStart)
        for i in range(len(dom_records) - 1):
            r1 = dom_records[i]
            s1 = r1.sEnd - r1.sStart
            for j in range(i + 1, len(dom_records)):
                r2 = dom_records[j]
                d = r2.sStart - r1.sEnd
                # records are sorted by sStart, so d never decreases with j:
                # once it is too large no later record can be adjacent to r1
                if d > max_adjacent_hit_distance:
                    break
                # NOTE: d == max_adjacent_hit_distance matches no branch and
                # is therefore treated as "not adjacent"
                if 0 <= d < max_adjacent_hit_distance:
                    return True
                if d < 0:  # the two hits overlap
                    overlap = -d
                    s2 = r2.sEnd - r2.sStart
                    # acceptable overlap for considering them as separate hits (hence is chimera)
                    if overlap < .5 * s1 and overlap < .5 * s2:
                        return True
        return False

    if output_fq:
        ext, fmt = '.fq', 'fastq'
    else:
        ext, fmt = '.fa', 'fasta'

    # a set makes the per-read membership test O(1) instead of a list scan
    chimera_ids = set(sid for sid, dom_records in suspicious_hits.items() if is_chimera(dom_records))

    count_chimera, count_nonchimera = 0, 0
    # context managers close both outputs even if reading/writing fails
    with open(fasta_filename + '.is_chimera' + ext, 'w') as f_chimera:
        with open(fasta_filename + '.non_chimera' + ext, 'w') as f_nonchimera:
            for r in smart_fa_fq_reader(fasta_filename):
                if r.id in chimera_ids:
                    SeqIO.write(r, f_chimera, fmt)
                    count_chimera += 1
                else:
                    SeqIO.write(r, f_nonchimera, fmt)
                    count_nonchimera += 1
    sys.stderr.write("Number of chimera-to-non-chimera: {0}/{1}\n".format(count_chimera, count_nonchimera))
예제 #3
0
def chimera_finder_main(output_dir, primer_filename, fasta_filename, hmmer_out_filename='hmmer_for_chimera.out', min_dist_from_end=100, max_adjacent_hit_distance=50, cpus=8, min_score=10, output_fq=False):
    """
    (1) run HMMER on the split input
    (2) parse HMMER output
    (3) split input into chimeric and non-chimeric

    :param output_dir: working directory.  If it does not exist it is created
        and the full pipeline runs.  If it exists and already contains
        hmmer_out_filename, steps (2) and (3) are re-run on that output.  If it
        exists without that file the program aborts.
    :param primer_filename: primer file; a checked copy is written into
        output_dir under its base name
    :param fasta_filename: input reads
    :param hmmer_out_filename: name of the merged HMMER output inside output_dir
    :param min_dist_from_end: passed to parse_hmmer_dom_for_chimera
    :param max_adjacent_hit_distance: passed to remove_chimeras
    :param cpus: used to size the input chunks; one worker process is started
        per chunk
    :param min_score: passed to parse_hmmer_dom_for_chimera
    :param output_fq: passed to remove_chimeras

    Calls sys.exit(-1) when PBMATRIX.txt is missing next to this module or
    when output_dir exists without a HMMER output file.
    """
    # local imports: only this function needs them
    import glob
    import shutil

    def log(message):
        sys.stderr.write(message + "\n")

    # find the matrix file PBMATRIX.txt
    matrix_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'PBMATRIX.txt')
    if not os.path.exists(matrix_filename):
        log("Expected matrix file {0} but not found. Abort!".format(matrix_filename))
        sys.exit(-1)

    out_filename_hmmer = os.path.join(output_dir, hmmer_out_filename)
    # The primer copy always lives under its base name inside output_dir; the
    # re-run branch used to join the raw primer_filename, which points at the
    # wrong place whenever primer_filename has a directory component.
    p_filename = os.path.join(output_dir, os.path.basename(primer_filename))

    if os.path.exists(output_dir):
        if not os.path.exists(out_filename_hmmer):
            log("output directory {0} already exists. Abort.".format(output_dir))
            sys.exit(-1)
        log("output directory {0} already exists. Running just the primer trimming part.".format(output_dir))
        # NOTE(review): p_indices is not used further down in this function
        p_indices = [r.id[1:] for r in smart_fa_fq_reader(p_filename) if r.id[0] == 'F']
    else:
        log("checking for phmmer existence...")
        sanity_check_phmmer()
        log("creating output directory {0}....".format(output_dir))
        os.makedirs(output_dir)

        log("checking and copying primer file {0}".format(primer_filename))
        p_indices = sanity_check_primers_for_chimera(primer_filename, p_filename)

        log("splitting into chunks {0}".format(fasta_filename))
        # Count records through the reader instead of `grep -c ">"`: grep
        # miscounts FASTQ input (it counts quality lines containing '>') and
        # needed the file name spliced into a shell command.
        # Assumes smart_fa_fq_reader can be iterated more than once on a path.
        n_records = sum(1 for _ in smart_fa_fq_reader(fasta_filename))
        size = n_records // cpus + 1

        jobs = []

        def launch(chunk_index, chunk_path):
            # start one worker on a finished chunk and remember its output file
            out_path = out_filename_hmmer + '_split' + str(chunk_index)
            proc = multiprocessing.Process(target=worker, args=(out_path, p_filename, chunk_path, matrix_filename))
            jobs.append((proc, out_path))
            proc.start()

        i = 0
        count = 0
        f_in = open(os.path.join(output_dir, 'in.fa_split' + str(i)), 'w')
        try:
            for r in smart_fa_fq_reader(fasta_filename):
                f_in.write(">{0}\n{1}\n".format(r.id, r.seq))
                count += 1
                if count > size:
                    # chunk is full: hand it to a worker and open the next one
                    f_in.close()
                    launch(i, f_in.name)
                    i += 1
                    count = 0
                    f_in = open(os.path.join(output_dir, 'in.fa_split' + str(i)), 'w')
        finally:
            f_in.close()
        if count > 0:
            launch(i, f_in.name)

        # wait for the workers in start order and append their outputs
        for proc, out_path in jobs:
            proc.join()
            with open(out_path, 'rb') as part:
                with open(out_filename_hmmer, 'ab') as merged:
                    shutil.copyfileobj(part, merged)

    suspicious_hits = parse_hmmer_dom_for_chimera(out_filename_hmmer, min_score, min_dist_from_end)
    remove_chimeras(fasta_filename, suspicious_hits, max_adjacent_hit_distance, output_fq)

    log("Cleaning split files")
    for path in glob.glob(os.path.join(output_dir, '*split*')):
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path)
        else:
            os.remove(path)