Пример #1
0
def calculate_distance_distribution(interleave_fhand, index_fpath,
                                    max_clipping, max_distance=None,
                                    tempdir=None, threads=None):
    '''It maps interleaved mates and counts the distances between them.

    The reads in interleave_fhand are mapped with bwa mem against
    index_fpath and the alignments are written to a temporary BAM sorted by
    query name. For every read name each alignment of the first mate is
    combined with each alignment of the second mate (only the alignments
    returned by _get_totally_mapped_alignments for the given max_clipping)
    and, when both share the same rname, their distance is counted.

    It returns a dict with the keys 'outies', 'innies' and 'others'; every
    value is an IntCounter indexed by distance. When max_distance is given
    only the distances lower than it are counted. tempdir is handed to the
    sorting step and threads to the mapper.
    '''
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    extra_params = ['-a', '-M']
    # threads used to be accepted and silently ignored, now it reaches bwa
    bwa = map_with_bwamem(index_fpath, interleave_fpath=interleave_fhand.name,
                          extra_params=extra_params, threads=threads)
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                             tempdir=tempdir)
    bamfile = Samfile(bam_fhand.name)
    stats = {'outies': IntCounter(), 'innies': IntCounter(),
             'others': IntCounter()}
    for grouped_mates in _group_alignments_reads_by_qname(bamfile):
        mates = _split_mates(grouped_mates)
        for aligned_read1 in _get_totally_mapped_alignments(mates[0],
                                                            max_clipping):
            for aligned_read2 in _get_totally_mapped_alignments(mates[1],
                                                                max_clipping):
                # Mates aligned to different rnames are not counted
                if aligned_read1.rname == aligned_read2.rname:
                    aligned_reads = [aligned_read1, aligned_read2]
                    distance = _find_distance(aligned_reads)
                    if _mates_are_outies(aligned_reads):
                        kind = 'outies'
                    elif _mates_are_innies(aligned_reads):
                        kind = 'innies'
                    else:
                        kind = 'others'
                    if max_distance is None or max_distance > distance:
                        stats[kind][distance] += 1
    return stats
Пример #2
0
    def test_add_rg_to_bam(self):
        # The read group handed to map_with_bwamem has to show up in the
        # header of the resulting BAM (samtools view -h).
        reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
        reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
        directory = TemporaryDir()
        try:
            index_fpath = get_or_create_bwa_index(reference_fpath,
                                                  directory.name)
            bam_fhand = NamedTemporaryFile(suffix='.bam')
            lib_name = 'aa'
            log_fhand = NamedTemporaryFile()
            readgroup = {
                'ID': lib_name,
                'PL': 'illumina',
                'LB': lib_name,
                'SM': '{0}_illumina_pe'.format(lib_name),
                'PU': '0'
            }
            bwa = map_with_bwamem(index_fpath,
                                  unpaired_fpath=reads_fpath,
                                  readgroup=readgroup,
                                  log_fpath=log_fhand.name)
            map_process_to_bam(bwa, bam_fhand.name)
            out = subprocess.check_output(
                [get_binary_path('samtools'), 'view', '-h', bam_fhand.name],
                stderr=log_fhand)
            assert '@RG\tID:aa' in out
            assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
        finally:
            # The directory is closed even when a step or an assertion fails
            directory.close()
Пример #3
0
    def test_classify_paired_reads(self):
        # Three interleaved pairs are mapped against ref_example.fasta and
        # every classified pair has to carry the name expected for its kind.
        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
        # Non chimeric
        seq1_mate1 = '>seq1 1:N:0:GATCAG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT\n'
        seq1_mate2 = '>seq1 2:N:0:GATCAG\nAGGAGGGATCGGGCACCCACGGCGCGGTAGACTGAGGCCTTCTCGAACT\n'
        # Chimeric
        seq2_mate1 = '>seq2 1:N:0:GATCAG\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC\n'
        seq2_mate2 = '>seq2 2:N:0:GATCAG\nACGTGGATGCGGCGACGGCCCTACGGCACATACTGTTATTAGGGTCACT\n'
        # unknown
        seq3_mate1 = '>seq3 1:N:0:GATCAG\nAGTGACCCTAATAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGT\n'
        seq3_mate2 = '>seq3 2:N:0:GATCAG\nGTCGTGCGCAGCCATTGAGACCTTCCTAGGGTTTTCCCCATGGAATCGG\n'

        # The pairs are written in the order seq1, seq3, seq2
        interleaved = [seq1_mate1, seq1_mate2, seq3_mate1, seq3_mate2,
                       seq2_mate1, seq2_mate2]
        in_fhand = NamedTemporaryFile()
        in_fhand.write(''.join(interleaved))
        in_fhand.flush()

        bam_fhand = NamedTemporaryFile(suffix='.bam')
        bwa = map_with_bwamem(index_fpath,
                              interleave_fpath=in_fhand.name,
                              extra_params=['-a', '-M'])
        map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname')

        # Checked in this order, the first kind that matches decides the name
        expected_names = ((NON_CHIMERIC, 'seq1'),
                          (UNKNOWN, 'seq3'),
                          (CHIMERA, 'seq2'))
        for pair, kind in classify_mapped_reads(bam_fhand,
                                                mate_distance=2000):
            for expected_kind, seq_name in expected_names:
                if kind == expected_kind:
                    assert seq_name in get_name(pair[0])
                    break
            else:
                # The kind is none of the three known ones
                self.fail()
Пример #4
0
def classify_chimeras(in_fhand,
                      index_fpath,
                      mate_distance,
                      out_fhand,
                      chimeras_fhand=None,
                      unknown_fhand=None,
                      tempdir=None,
                      threads=None,
                      settings=get_setting('CHIMERAS_SETTINGS')):
    '''It maps the interleaved pairs and writes them according to their kind.

    The reads in in_fhand are mapped with bwa mem against index_fpath and
    sorted by query name into a temporary BAM (tempdir is handed to the
    sorting step and threads to the mapper). Every pair yielded by
    classify_mapped_reads is written to out_fhand when it is NON_CHIMERIC,
    to chimeras_fhand when it is a CHIMERA and to unknown_fhand when it is
    UNKNOWN. The pairs whose optional output file is None are dropped.
    '''
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    extra_params = ['-a', '-M']
    # threads used to be accepted and silently ignored, now it reaches bwa
    bwa = map_with_bwamem(index_fpath,
                          interleave_fpath=in_fhand.name,
                          extra_params=extra_params,
                          threads=threads)
    map_process_to_sortedbam(bwa,
                             bam_fhand.name,
                             key='queryname',
                             tempdir=tempdir)

    for pair, kind in classify_mapped_reads(bam_fhand,
                                            settings=settings,
                                            mate_distance=mate_distance):
        # Kinds are compared by value, an identity check only works as long
        # as the very same constant objects are yielded
        if kind == NON_CHIMERIC:
            write_seqs(pair, out_fhand)
        elif kind == CHIMERA and chimeras_fhand is not None:
            write_seqs(pair, chimeras_fhand)
        elif kind == UNKNOWN and unknown_fhand is not None:
            write_seqs(pair, unknown_fhand)
Пример #5
0
    def test_classify_paired_reads(self):
        # Maps three interleaved pairs and checks that the name of the first
        # read of every classified pair agrees with the kind it was given.
        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
        reads = [
            # Non chimeric
            '>seq1 1:N:0:GATCAG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT\n',
            '>seq1 2:N:0:GATCAG\nAGGAGGGATCGGGCACCCACGGCGCGGTAGACTGAGGCCTTCTCGAACT\n',
            # unknown
            '>seq3 1:N:0:GATCAG\nAGTGACCCTAATAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGT\n',
            '>seq3 2:N:0:GATCAG\nGTCGTGCGCAGCCATTGAGACCTTCCTAGGGTTTTCCCCATGGAATCGG\n',
            # Chimeric
            '>seq2 1:N:0:GATCAG\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC\n',
            '>seq2 2:N:0:GATCAG\nACGTGGATGCGGCGACGGCCCTACGGCACATACTGTTATTAGGGTCACT\n',
        ]

        in_fhand = NamedTemporaryFile()
        in_fhand.write(''.join(reads))
        in_fhand.flush()

        bam_fhand = NamedTemporaryFile(suffix='.bam')
        mapper = map_with_bwamem(index_fpath, extra_params=['-a', '-M'],
                                 interleave_fpath=in_fhand.name)
        map_process_to_sortedbam(mapper, bam_fhand.name, key='queryname')

        classified = classify_mapped_reads(bam_fhand, mate_distance=2000)
        for pair, kind in classified:
            if kind == NON_CHIMERIC:
                expected_name = 'seq1'
            elif kind == UNKNOWN:
                expected_name = 'seq3'
            elif kind == CHIMERA:
                expected_name = 'seq2'
            else:
                # fail() raises, so expected_name is always set below
                self.fail()
            assert expected_name in get_name(pair[0])
Пример #6
0
    def test_map_with_bwa(self):
        # Maps arabidopsis_reads.fastq as unpaired reads and looks for a
        # known sequence in the samtools view output of the resulting BAM.
        reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
        reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
        directory = TemporaryDir()
        try:
            index_fpath = get_or_create_bwa_index(reference_fpath,
                                                  directory.name)
            bam_fhand = NamedTemporaryFile(suffix='.bam')
            bwa = map_with_bwamem(index_fpath, unpaired_fpath=reads_fpath)
            map_process_to_bam(bwa, bam_fhand.name)
            out = subprocess.check_output([get_binary_path('samtools'),
                                           'view', bam_fhand.name])
            assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
        finally:
            # The directory is closed even when a step or the assertion fails
            directory.close()
Пример #7
0
    def test_map_with_bwa(self):
        # An index is built in a temporary directory, the reads are mapped
        # unpaired and one expected sequence must appear in the BAM content.
        reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
        reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
        directory = TemporaryDir()
        try:
            index_fpath = get_or_create_bwa_index(reference_fpath,
                                                  directory.name)
            bam_fhand = NamedTemporaryFile(suffix='.bam')
            bwa = map_with_bwamem(index_fpath, unpaired_fpath=reads_fpath)
            map_process_to_bam(bwa, bam_fhand.name)
            out = subprocess.check_output(
                [get_binary_path('samtools'), 'view', bam_fhand.name])
            assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
        finally:
            # Without the finally a failure above left the directory behind
            directory.close()
Пример #8
0
    def _pre_trim(self, trim_packet):
        '''It maps the passed reads and keeps the sorted BAM in the instance.

        All the sequences found in trim_packet[SEQS_PASSED] are flattened
        into one list, written to a temporary file inside self._tempdir and
        mapped as interleaved reads against self._index_fpath. The result,
        sorted by query name, is stored in self._bam_fhand.
        '''
        seqs = [seq for group in trim_packet[SEQS_PASSED] for seq in group]
        reads_fhand = NamedTemporaryFile(dir=self._tempdir, suffix='.trimming')

        # The with block closes the reads file even if the mapping fails
        with reads_fhand:
            write_seqs(seqs, reads_fhand)
            # bwa reads the file by name, so the content has to be on disk
            reads_fhand.flush()
            bwa = map_with_bwamem(self._index_fpath,
                                  interleave_fpath=reads_fhand.name)
            bam_fhand = NamedTemporaryFile(dir=self._tempdir)
            map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                                     tempdir=self._tempdir)

        self._bam_fhand = bam_fhand
Пример #9
0
    def _pre_trim(self, trim_packet):
        '''It prepares the sorted BAM with the mapping of the passed reads.

        The groups of sequences in trim_packet[SEQS_PASSED] are flattened,
        written to a temporary file in self._tempdir and mapped with bwa mem
        (interleaved) against self._index_fpath. The temporary BAM sorted by
        query name is left in self._bam_fhand. Nothing is returned.
        '''
        # Distinct names for the loop variables, the original reused seqs
        seqs = [seq for paired_seqs in trim_packet[SEQS_PASSED]
                for seq in paired_seqs]
        reads_fhand = NamedTemporaryFile(dir=self._tempdir, suffix='.trimming')
        try:
            write_seqs(seqs, reads_fhand)
            # The mapper opens the file by its name, flush before mapping
            reads_fhand.flush()
            bwa = map_with_bwamem(self._index_fpath,
                                  interleave_fpath=reads_fhand.name)
            bam_fhand = NamedTemporaryFile(dir=self._tempdir)
            map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                                     tempdir=self._tempdir)
        finally:
            # The reads file is closed also when the writing or mapping fails
            reads_fhand.close()

        self._bam_fhand = bam_fhand
Пример #10
0
    def test_add_rg_to_bam(self):
        # Maps the reads with a read group and checks that the @RG line and
        # a known sequence are found in the samtools view -h output.
        reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
        reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
        directory = TemporaryDir()
        try:
            index_fpath = get_or_create_bwa_index(reference_fpath,
                                                  directory.name)
            bam_fhand = NamedTemporaryFile(suffix='.bam')
            lib_name = 'aa'
            log_fhand = NamedTemporaryFile()
            readgroup = {'ID': lib_name, 'PL': 'illumina', 'LB': lib_name,
                         'SM': '{0}_illumina_pe'.format(lib_name), 'PU': '0'}
            bwa = map_with_bwamem(index_fpath, unpaired_fpath=reads_fpath,
                                  readgroup=readgroup,
                                  log_fpath=log_fhand.name)
            map_process_to_bam(bwa, bam_fhand.name)
            out = subprocess.check_output([get_binary_path('samtools'),
                                           'view', '-h', bam_fhand.name],
                                          stderr=log_fhand)
            assert '@RG\tID:aa' in out
            assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
        finally:
            # Close the directory no matter how the checks above ended
            directory.close()
Пример #11
0
def _sorted_mapped_reads(ref_fpath, paired_fpaths=None,
                         unpaired_fpaths=None, directory=None,
                         file_format=None, min_seed_len=None):
    '''It maps the reads with bwa mem and returns them sorted by query name.

    The bwa index for ref_fpath is obtained with get_or_create_bwa_index
    using directory. The reads are mapped with the flags -a and -M, plus
    -k min_seed_len when a minimum seed length is given. The alignments are
    sorted by query name into a temporary BAM that is returned opened as a
    pysam.Samfile.

    The first paired file, or the first unpaired one when there are no
    paired files, is opened only to set its format (when file_format is
    given) or to guess it.
    '''
    reads_fpath = paired_fpaths[0] if paired_fpaths else unpaired_fpaths[0]
    fhand = open(reads_fpath)
    try:
        if file_format is not None:
            set_format(fhand, file_format)
        else:
            # NOTE(review): the guessed format is not used afterwards
            file_format = get_format(fhand)
    finally:
        # The handle was never closed before
        fhand.close()
    index_fpath = get_or_create_bwa_index(ref_fpath, directory)
    extra_params = ['-a', '-M']
    if min_seed_len is not None:
        # Command line arguments have to be strings, an int could be given
        extra_params.extend(['-k', str(min_seed_len)])
    # NOTE(review): the whole unpaired_fpaths value goes to the singular
    # unpaired_fpath argument, confirm that map_with_bwamem expects that
    bwa = map_with_bwamem(index_fpath, paired_fpaths=paired_fpaths,
                          unpaired_fpath=unpaired_fpaths,
                          extra_params=extra_params)
    # The default temporary directory replaces a path that only existed in
    # the developer's home
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    sort_mapped_reads(bwa, bam_fhand.name, key='queryname')
    # NOTE(review): bam_fhand is dropped on return, presumably the opened
    # Samfile keeps the data readable - confirm
    bamfile = pysam.Samfile(bam_fhand.name)
    return bamfile
Пример #12
0
def calculate_distance_distribution(interleave_fhand,
                                    index_fpath,
                                    max_clipping,
                                    max_distance=None,
                                    tempdir=None,
                                    threads=None):
    '''It maps the interleaved reads and returns their distance distribution.

    The reads are mapped with bwa mem against index_fpath using the given
    threads, the output is sorted by query name into a temporary BAM
    (tempdir goes to the sorting step) and the result of
    calculate_distance_distribution_in_bam for that BAM, max_clipping and
    max_distance is returned.
    '''
    sorted_bam_fhand = NamedTemporaryFile(suffix='.bam')
    mapper = map_with_bwamem(index_fpath,
                             threads=threads,
                             extra_params=['-a', '-M'],
                             interleave_fpath=interleave_fhand.name)
    map_process_to_sortedbam(mapper, sorted_bam_fhand.name,
                             tempdir=tempdir, key='queryname')
    distribution = calculate_distance_distribution_in_bam(
        sorted_bam_fhand,
        max_distance=max_distance,
        max_clipping=max_clipping)
    return distribution
Пример #13
0
def classify_chimeras(in_fhand, index_fpath, mate_distance, out_fhand,
                      chimeras_fhand=None, unknown_fhand=None, tempdir=None,
                      threads=None, settings=get_setting('CHIMERAS_SETTINGS')):
    '''It maps sequences from input files, sorts them and writes to output
    files according to its classification

    The NON_CHIMERIC pairs go to out_fhand, the CHIMERA ones to
    chimeras_fhand and the UNKNOWN ones to unknown_fhand. The pairs whose
    optional output file is None are not written. tempdir is used by the
    sorting step and threads by the mapper.
    '''
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    extra_params = ['-a', '-M']
    # threads was part of the signature but never reached the mapper
    bwa = map_with_bwamem(index_fpath, interleave_fpath=in_fhand.name,
                          extra_params=extra_params, threads=threads)
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                             tempdir=tempdir)

    for pair, kind in classify_mapped_reads(bam_fhand, settings=settings,
                                            mate_distance=mate_distance):
        # Compared by value, as the tests do, not by object identity
        if kind == NON_CHIMERIC:
            write_seqs(pair, out_fhand)
        elif kind == CHIMERA and chimeras_fhand is not None:
            write_seqs(pair, chimeras_fhand)
        elif kind == UNKNOWN and unknown_fhand is not None:
            write_seqs(pair, unknown_fhand)
Пример #14
0
    def test_rev_compl_fragmented_reads(self):
        # Forward reads, reverse reads and the GENOME reference are written
        # to temporary files, the pairs are mapped with bwa mem and the
        # resulting BAM is opened with pysam.

        # with paired_reads.
        # f is reversed r is direct
        seq10_f = ('>seq10 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
                   '\n')
        seq10_r = ('>seq10 r\nATGTAATACGGGCTAGCCGGGGATGCCGACGATTAAACACGCTGTCATA'
                   'GTAGCGTCTTCCTAGGGTTTTCCCCATGGAATCGGTTATCGTGATACGTTAAATTT\n')
        # f is direct, r is reversed
        seq11_f = ('>seq11 f\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC'
                   '\n')
        seq11_r = ('>seq11 r\nAAATTTAACGTATCACGATAACCGATTCCATGGGGAAAACCCTAGGAAG'
                   'ACGCTACTATGACAGCGTGTTTAATCGTCGGCATCCCCGGCTAGCCCGTATTACAT\n')

        # f is fragmented in two reference sequences. r maps completely
        seq4_f = ('>seq4 f\nCAAATCATCACCAGACCATGTCCGATCCCGGGAGTCTTTTCCAAGGTGTGC'
                  'TCTTTATCCGGCCCTTGCTCAAGGGTATGTTAAAACGGCAAGAGCTGCCTGAGCGCG\n')
        seq4_r = ('>seq4 r\nTGTTCTGCAATCGATACAACGATCGAATTTAATCTGAGTAACTGCCAATTC'
                  'TGAGTAATATTATAGAAAGT\n')

        forward_reads = ''.join([seq10_f, seq11_f, seq4_f])
        reverse_reads = ''.join([seq10_r, seq11_r, seq4_r])

        # Same order as before: forward file, reverse file, reference file
        fhands = []
        for content in (forward_reads, reverse_reads, GENOME):
            fhand = NamedTemporaryFile()
            fhand.write(content)
            fhand.flush()
            fhands.append(fhand)
        f_fhand, r_fhand, ref_fhand = fhands
        paired_fpaths = [f_fhand.name, r_fhand.name]

        index_fpath = get_or_create_bwa_index(ref_fhand.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        bwa = map_with_bwamem(index_fpath, paired_fpaths=paired_fpaths)
        map_process_to_bam(bwa, bam_fhand.name)
        samfile = pysam.Samfile(bam_fhand.name)
Пример #15
0
def calculate_distance_distribution(interleave_fhand,
                                    index_fpath,
                                    max_clipping,
                                    max_distance=None,
                                    tempdir=None,
                                    threads=None):
    '''It maps interleaved mates and counts the distances between them.

    The reads are mapped with bwa mem against index_fpath (with the given
    threads) and sorted by query name into a temporary BAM (tempdir goes to
    the sorting step). For every read name each alignment of the first mate
    is paired with each alignment of the second one, taking only the
    alignments returned by _get_totally_mapped_alignments for max_clipping.
    The pairs that share the same rname are counted by distance.

    It returns a dict with the keys 'outies', 'innies' and 'others', each
    one an IntCounter indexed by distance. When max_distance is given only
    the distances lower than it are counted.
    '''
    sorted_bam_fhand = NamedTemporaryFile(suffix='.bam')
    mapper = map_with_bwamem(index_fpath,
                             interleave_fpath=interleave_fhand.name,
                             extra_params=['-a', '-M'],
                             threads=threads)
    map_process_to_sortedbam(mapper, sorted_bam_fhand.name, key='queryname',
                             tempdir=tempdir)
    alignments = AlignmentFile(sorted_bam_fhand.name)

    stats = dict((kind, IntCounter())
                 for kind in ('outies', 'innies', 'others'))

    for grouped_mates in _group_alignments_reads_by_qname(alignments):
        mates = _split_mates(grouped_mates)
        for read1 in _get_totally_mapped_alignments(mates[0], max_clipping):
            for read2 in _get_totally_mapped_alignments(mates[1],
                                                        max_clipping):
                # Only the mates with the same rname are measured
                if read1.rname != read2.rname:
                    continue
                read_pair = [read1, read2]
                distance = _find_distance(read_pair)
                kind = ('outies' if _mates_are_outies(read_pair) else
                        'innies' if _mates_are_innies(read_pair) else
                        'others')
                # Skip the distances that do not stay below max_distance
                if max_distance is not None and not max_distance > distance:
                    continue
                stats[kind][distance] += 1
    return stats
Пример #16
0
    def test_rev_compl_fragmented_reads(self):
        # Forward and reverse reads are written to two temporary files,
        # mapped as a pair with bwa mem against ref_example.fasta and the
        # resulting BAM is opened with pysam.
        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')

        # with paired_reads.
        # f is reversed r is direct
        query1 = '>seq10 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        query1 += '\n'
        query2 = '>seq10 r\nATGTAATACGGGCTAGCCGGGGATGCCGACGATTAAACACGCTGTCATA'
        query2 += 'GTAGCGTCTTCCTAGGGTTTTCCCCATGGAATCGGTTATCGTGATACGTTAAATTT\n'
        # f is direct, r is reversed
        query3 = '>seq11 f\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC'
        query3 += '\n'
        query4 = '>seq11 r\nAAATTTAACGTATCACGATAACCGATTCCATGGGGAAAACCCTAGGAAG'
        query4 += 'ACGCTACTATGACAGCGTGTTTAATCGTCGGCATCCCCGGCTAGCCCGTATTACAT\n'

        # f is fragmented in two reference sequences. r maps completely
        query7 = '>seq4 f\nCAAATCATCACCAGACCATGTCCGATCCCGGGAGTCTTTTCCAAGGTGTGC'
        query7 += 'TCTTTATCCGGCCCTTGCTCAAGGGTATGTTAAAACGGCAAGAGCTGCCTGAGCGCG\n'
        query8 = '>seq4 r\nTGTTCTGCAATCGATACAACGATCGAATTTAATCTGAGTAACTGCCAATTC'
        query8 += 'TGAGTAATATTATAGAAAGT\n'

        # All the f reads go to one file and all the r reads to another one,
        # in the same order
        query_f = query1 + query3 + query7
        query_r = query2 + query4 + query8

        f_fhand = NamedTemporaryFile()
        f_fhand.write(query_f)
        # Flushed because the mapper is given the file name, not the handle
        f_fhand.flush()
        r_fhand = NamedTemporaryFile()
        r_fhand.write(query_r)
        r_fhand.flush()
        paired_fpaths = (f_fhand.name, r_fhand.name)

        bam_fhand = NamedTemporaryFile(suffix='.bam')
        bwa = map_with_bwamem(index_fpath, paired_fpaths=paired_fpaths)
        map_process_to_bam(bwa, bam_fhand.name)
        # NOTE(review): samfile is not used in the lines visible here, the
        # checks presumably follow - confirm
        samfile = pysam.Samfile(bam_fhand.name)