Exemplo n.º 1
0
def calculate_distance_distribution(interleave_fhand, index_fpath,
                                    max_clipping, max_distance=None,
                                    tempdir=None, threads=None):
    '''It counts the distances between mates that map to the same reference.

    The interleaved reads are mapped with bwa mem against index_fpath and
    the alignments are written to a temporary BAM sorted by query name.
    Every combination of an alignment of the first mate with an alignment
    of the second mate is then inspected.

    interleave_fhand -- file object with the interleaved reads; only its
                        .name is used.
    index_fpath -- path handed to map_with_bwamem as the reference index.
    max_clipping -- forwarded to _get_totally_mapped_alignments to select
                    the alignments taken into account.
    max_distance -- when given, only distances strictly lower than it are
                    counted.
    tempdir -- directory handed to the BAM sorting step.
    threads -- forwarded to map_with_bwamem.

    It returns a dict with the keys 'outies', 'innies' and 'others', each
    one holding an IntCounter indexed by distance.
    '''
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    extra_params = ['-a', '-M']
    # threads used to be accepted and then dropped; it is now forwarded so
    # the caller's request reaches the mapper.
    bwa = map_with_bwamem(index_fpath, interleave_fpath=interleave_fhand.name,
                          extra_params=extra_params, threads=threads)
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                             tempdir=tempdir)
    stats = {'outies': IntCounter(), 'innies': IntCounter(),
             'others': IntCounter()}
    bamfile = Samfile(bam_fhand.name)
    try:
        for grouped_mates in _group_alignments_reads_by_qname(bamfile):
            mates = _split_mates(grouped_mates)
            for aligned_read1 in _get_totally_mapped_alignments(mates[0],
                                                                max_clipping):
                for aligned_read2 in _get_totally_mapped_alignments(
                        mates[1], max_clipping):
                    # Only mates placed on the same reference have a distance
                    if aligned_read1.rname != aligned_read2.rname:
                        continue
                    aligned_reads = [aligned_read1, aligned_read2]
                    distance = _find_distance(aligned_reads)
                    if _mates_are_outies(aligned_reads):
                        kind = 'outies'
                    elif _mates_are_innies(aligned_reads):
                        kind = 'innies'
                    else:
                        kind = 'others'
                    if max_distance is None or max_distance > distance:
                        stats[kind][distance] += 1
    finally:
        # Release the BAM handle even if one of the helpers raises
        bamfile.close()
    return stats
Exemplo n.º 2
0
    def test_classify_paired_reads(self):
        '''Each classified pair must carry the read name expected for its
        kind: seq1 for NON_CHIMERIC, seq3 for UNKNOWN and seq2 for CHIMERA.'''
        reference = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
        # Non chimeric
        non_chimeric = ('>seq1 1:N:0:GATCAG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT\n',
                        '>seq1 2:N:0:GATCAG\nAGGAGGGATCGGGCACCCACGGCGCGGTAGACTGAGGCCTTCTCGAACT\n')
        # Chimeric
        chimeric = ('>seq2 1:N:0:GATCAG\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC\n',
                    '>seq2 2:N:0:GATCAG\nACGTGGATGCGGCGACGGCCCTACGGCACATACTGTTATTAGGGTCACT\n')
        # unknown
        unknown = ('>seq3 1:N:0:GATCAG\nAGTGACCCTAATAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGT\n',
                   '>seq3 2:N:0:GATCAG\nGTCGTGCGCAGCCATTGAGACCTTCCTAGGGTTTTCCCCATGGAATCGG\n')

        # The reads are written as: non chimeric, unknown and chimeric
        reads_fhand = NamedTemporaryFile()
        reads_fhand.write(''.join(non_chimeric + unknown + chimeric))
        reads_fhand.flush()

        sorted_bam = NamedTemporaryFile(suffix='.bam')
        mapping = map_with_bwamem(index_fpath=reference,
                                  interleave_fpath=reads_fhand.name,
                                  extra_params=['-a', '-M']) \
            if False else map_with_bwamem(reference,
                                          interleave_fpath=reads_fhand.name,
                                          extra_params=['-a', '-M'])
        map_process_to_sortedbam(mapping, sorted_bam.name, key='queryname')

        # Checked in the same order as the original if/elif chain
        expectations = ((NON_CHIMERIC, 'seq1'),
                        (UNKNOWN, 'seq3'),
                        (CHIMERA, 'seq2'))
        classified = classify_mapped_reads(sorted_bam, mate_distance=2000)
        for pair, kind in classified:
            for expected_kind, seq_name in expectations:
                if kind == expected_kind:
                    assert seq_name in get_name(pair[0])
                    break
            else:
                # A kind outside the three known ones is an error
                self.fail()
Exemplo n.º 3
0
def classify_chimeras(in_fhand,
                      index_fpath,
                      mate_distance,
                      out_fhand,
                      chimeras_fhand=None,
                      unknown_fhand=None,
                      tempdir=None,
                      threads=None,
                      settings=get_setting('CHIMERAS_SETTINGS')):
    '''It maps the input reads, sorts them and writes every pair to the
    output file that matches its classification.

    in_fhand -- file object with the interleaved reads; only its .name is
                used.
    index_fpath -- path handed to map_with_bwamem as the reference index.
    mate_distance -- forwarded to classify_mapped_reads.
    out_fhand -- it receives the NON_CHIMERIC pairs.
    chimeras_fhand -- it receives the CHIMERA pairs; when None they are
                      dropped.
    unknown_fhand -- it receives the UNKNOWN pairs; when None they are
                     dropped.
    tempdir -- directory handed to the BAM sorting step.
    threads -- forwarded to map_with_bwamem.
    settings -- forwarded to classify_mapped_reads; the default is read
                once, when the function is defined.
    '''
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    extra_params = ['-a', '-M']
    # threads used to be accepted and then dropped; it is now forwarded so
    # the caller's request reaches the mapper.
    bwa = map_with_bwamem(index_fpath,
                          interleave_fpath=in_fhand.name,
                          extra_params=extra_params,
                          threads=threads)
    map_process_to_sortedbam(bwa,
                             bam_fhand.name,
                             key='queryname',
                             tempdir=tempdir)

    for pair, kind in classify_mapped_reads(bam_fhand,
                                            settings=settings,
                                            mate_distance=mate_distance):
        # The kinds are compared by value: identity checks on constants
        # depend on object interning and can fail silently.
        if kind == NON_CHIMERIC:
            write_seqs(pair, out_fhand)
        elif kind == CHIMERA and chimeras_fhand is not None:
            write_seqs(pair, chimeras_fhand)
        elif kind == UNKNOWN and unknown_fhand is not None:
            write_seqs(pair, unknown_fhand)
Exemplo n.º 4
0
    def test_classify_paired_reads(self):
        '''Each classified pair must carry the read name expected for its
        kind: seq1 for NON_CHIMERIC, seq3 for UNKNOWN and seq2 for CHIMERA.'''
        reference = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
        # Non chimeric
        non_chimeric = ('>seq1 1:N:0:GATCAG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT\n',
                        '>seq1 2:N:0:GATCAG\nAGGAGGGATCGGGCACCCACGGCGCGGTAGACTGAGGCCTTCTCGAACT\n')
        # Chimeric
        chimeric = ('>seq2 1:N:0:GATCAG\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC\n',
                    '>seq2 2:N:0:GATCAG\nACGTGGATGCGGCGACGGCCCTACGGCACATACTGTTATTAGGGTCACT\n')
        # unknown
        unknown = ('>seq3 1:N:0:GATCAG\nAGTGACCCTAATAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGT\n',
                   '>seq3 2:N:0:GATCAG\nGTCGTGCGCAGCCATTGAGACCTTCCTAGGGTTTTCCCCATGGAATCGG\n')

        # The reads are written as: non chimeric, unknown and chimeric
        reads_fhand = NamedTemporaryFile()
        reads_fhand.write(''.join(non_chimeric + unknown + chimeric))
        reads_fhand.flush()

        sorted_bam = NamedTemporaryFile(suffix='.bam')
        mapping = map_with_bwamem(reference,
                                  interleave_fpath=reads_fhand.name,
                                  extra_params=['-a', '-M'])
        map_process_to_sortedbam(mapping, sorted_bam.name, key='queryname')

        # Checked in the same order as the original if/elif chain
        expectations = ((NON_CHIMERIC, 'seq1'),
                        (UNKNOWN, 'seq3'),
                        (CHIMERA, 'seq2'))
        classified = classify_mapped_reads(sorted_bam, mate_distance=2000)
        for pair, kind in classified:
            for expected_kind, seq_name in expectations:
                if kind == expected_kind:
                    assert seq_name in get_name(pair[0])
                    break
            else:
                # A kind outside the three known ones is an error
                self.fail()
Exemplo n.º 5
0
    def _pre_trim(self, trim_packet):
        '''It maps the passed seqs of the packet and keeps the sorted BAM.

        The seqs found under SEQS_PASSED are flattened, written to a
        temporary file and mapped with bwa mem against self._index_fpath.
        The resulting temporary file, sorted by query name, is stored in
        self._bam_fhand.
        '''
        flat_seqs = []
        for seq_group in trim_packet[SEQS_PASSED]:
            flat_seqs.extend(seq_group)

        tempdir = self._tempdir
        reads_fhand = NamedTemporaryFile(dir=tempdir, suffix='.trimming')
        write_seqs(flat_seqs, reads_fhand)
        # The mapper reads the file by path, so the buffer has to be on disk
        reads_fhand.flush()

        mapping = map_with_bwamem(self._index_fpath,
                                  interleave_fpath=reads_fhand.name)
        sorted_bam = NamedTemporaryFile(dir=tempdir)
        map_process_to_sortedbam(mapping, sorted_bam.name, key='queryname',
                                 tempdir=tempdir)

        # Stored only once the sorted file has been written
        self._bam_fhand = sorted_bam
        reads_fhand.close()
Exemplo n.º 6
0
    def _pre_trim(self, trim_packet):
        '''It maps the passed seqs of the packet and keeps the sorted BAM.

        The seqs found under SEQS_PASSED are flattened, written to a
        temporary file and mapped with bwa mem against self._index_fpath.
        The resulting temporary file, sorted by query name, is stored in
        self._bam_fhand.
        '''
        flat_seqs = []
        for seq_group in trim_packet[SEQS_PASSED]:
            flat_seqs.extend(seq_group)

        tempdir = self._tempdir
        reads_fhand = NamedTemporaryFile(dir=tempdir, suffix='.trimming')
        write_seqs(flat_seqs, reads_fhand)
        # The mapper reads the file by path, so the buffer has to be on disk
        reads_fhand.flush()

        mapping = map_with_bwamem(self._index_fpath,
                                  interleave_fpath=reads_fhand.name)
        sorted_bam = NamedTemporaryFile(dir=tempdir)
        map_process_to_sortedbam(mapping, sorted_bam.name, key='queryname',
                                 tempdir=tempdir)

        # Stored only once the sorted file has been written
        self._bam_fhand = sorted_bam
        reads_fhand.close()
Exemplo n.º 7
0
def calculate_distance_distribution(interleave_fhand, index_fpath,
                                    max_clipping, max_distance=None,
                                    tempdir=None, threads=None):
    '''It maps the interleaved reads and returns their distance distribution.

    The reads are mapped with bwa mem against index_fpath, the alignments
    are written to a temporary BAM sorted by query name and that BAM is
    handed to calculate_distance_distribution_in_bam together with
    max_clipping and max_distance. Whatever that function returns is
    returned unchanged.
    '''
    sorted_bam = NamedTemporaryFile(suffix='.bam')
    mapping = map_with_bwamem(index_fpath,
                              interleave_fpath=interleave_fhand.name,
                              extra_params=['-a', '-M'],
                              threads=threads)
    map_process_to_sortedbam(mapping, sorted_bam.name, key='queryname',
                             tempdir=tempdir)
    distribution = calculate_distance_distribution_in_bam(
        sorted_bam, max_clipping=max_clipping, max_distance=max_distance)
    return distribution
Exemplo n.º 8
0
def classify_chimeras(in_fhand, index_fpath, mate_distance, out_fhand,
                      chimeras_fhand=None, unknown_fhand=None, tempdir=None,
                      threads=None, settings=get_setting('CHIMERAS_SETTINGS')):
    '''It maps the input reads, sorts them and writes every pair to the
    output file that matches its classification.

    in_fhand -- file object with the interleaved reads; only its .name is
                used.
    index_fpath -- path handed to map_with_bwamem as the reference index.
    mate_distance -- forwarded to classify_mapped_reads.
    out_fhand -- it receives the NON_CHIMERIC pairs.
    chimeras_fhand -- it receives the CHIMERA pairs; when None they are
                      dropped.
    unknown_fhand -- it receives the UNKNOWN pairs; when None they are
                     dropped.
    tempdir -- directory handed to the BAM sorting step.
    threads -- forwarded to map_with_bwamem.
    settings -- forwarded to classify_mapped_reads; the default is read
                once, when the function is defined.
    '''
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    extra_params = ['-a', '-M']
    # threads used to be accepted and then dropped; it is now forwarded so
    # the caller's request reaches the mapper.
    bwa = map_with_bwamem(index_fpath, interleave_fpath=in_fhand.name,
                          extra_params=extra_params, threads=threads)
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                             tempdir=tempdir)

    for pair, kind in classify_mapped_reads(bam_fhand, settings=settings,
                                            mate_distance=mate_distance):
        # The kinds are compared by value: identity checks on constants
        # depend on object interning and can fail silently.
        if kind == NON_CHIMERIC:
            write_seqs(pair, out_fhand)
        elif kind == CHIMERA and chimeras_fhand is not None:
            write_seqs(pair, chimeras_fhand)
        elif kind == UNKNOWN and unknown_fhand is not None:
            write_seqs(pair, unknown_fhand)
Exemplo n.º 9
0
def calculate_distance_distribution(interleave_fhand,
                                    index_fpath,
                                    max_clipping,
                                    max_distance=None,
                                    tempdir=None,
                                    threads=None):
    '''It counts the distances between mates that map to the same reference.

    The interleaved reads are mapped with bwa mem against index_fpath and
    the alignments are written to a temporary BAM sorted by query name.
    Every combination of an alignment of the first mate with an alignment
    of the second mate is then inspected.

    interleave_fhand -- file object with the interleaved reads; only its
                        .name is used.
    index_fpath -- path handed to map_with_bwamem as the reference index.
    max_clipping -- forwarded to _get_totally_mapped_alignments to select
                    the alignments taken into account.
    max_distance -- when given, only distances strictly lower than it are
                    counted.
    tempdir -- directory handed to the BAM sorting step.
    threads -- forwarded to map_with_bwamem.

    It returns a dict with the keys 'outies', 'innies' and 'others', each
    one holding an IntCounter indexed by distance.
    '''
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    extra_params = ['-a', '-M']
    bwa = map_with_bwamem(index_fpath,
                          interleave_fpath=interleave_fhand.name,
                          extra_params=extra_params,
                          threads=threads)
    map_process_to_sortedbam(bwa,
                             bam_fhand.name,
                             key='queryname',
                             tempdir=tempdir)
    stats = {
        'outies': IntCounter(),
        'innies': IntCounter(),
        'others': IntCounter()
    }
    bamfile = AlignmentFile(bam_fhand.name)
    try:
        for grouped_mates in _group_alignments_reads_by_qname(bamfile):
            mates = _split_mates(grouped_mates)
            for aligned_read1 in _get_totally_mapped_alignments(
                    mates[0], max_clipping):
                for aligned_read2 in _get_totally_mapped_alignments(
                        mates[1], max_clipping):
                    # Only mates placed on the same reference have a distance
                    if aligned_read1.rname != aligned_read2.rname:
                        continue
                    aligned_reads = [aligned_read1, aligned_read2]
                    distance = _find_distance(aligned_reads)
                    if _mates_are_outies(aligned_reads):
                        kind = 'outies'
                    elif _mates_are_innies(aligned_reads):
                        kind = 'innies'
                    else:
                        kind = 'others'
                    if max_distance is None or max_distance > distance:
                        stats[kind][distance] += 1
    finally:
        # Release the BAM handle even if one of the helpers raises
        bamfile.close()
    return stats