예제 #1
0
    def test_get_read_group_info():
        'Tests get_read_group_info'
        sam_sample = '''@SQ\tSN:SGN-U576692\tLN:1714
@SQ\tSN:SGN-U572743\tLN:833
@RG\tID:g1\tLB:g1\tSM:g1\tPL:sanger
@RG\tID:g3\tLB:g3\tSM:g3\tPL:sanger
SGN-E200000\t0\tSGN-U572743\t317\t226\t14M\t*\t0\t0\tGGATGATKTTAGAG\t*\tAS:i:250\tXS:i:0\tXF:i:0\tXE:i:7\tXN:i:0\tRG:Z:g1
SGN-E40000\t0\tSGN-U576692\t1416\t207\t10M\t*\t0\t0\tAGCCTGATAA\t,,09377777\tAS:i:160\tXS:i:0\tXF:i:3\tXE:i:4\tXN:i:0\tRG:Z:g3
SGN-E40000\t20\tSGN-U576692\t1416\t207\t10M\t*\t0\t0\tAGCCTGATAA\t,,09377777\tAS:i:160\tXS:i:0\tXF:i:3\tXE:i:4\tXN:i:0\tRG:Z:g3
'''
        sam_fhand = NamedTemporaryFile(suffix='.sam')
        sam_fhand.write(sam_sample)
        sam_fhand.flush()
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        sam2bam(sam_fhand.name, bam_fhand.name)
        bam_fhand.flush()
        bam = pysam.Samfile(bam_fhand.name, 'rb')
        read_gro_i = get_read_group_info(bam)
        assert read_gro_i == {'g3': {'LB': 'g3', 'SM': 'g3', 'PL': 'sanger'},
                              'g1': {'LB': 'g1', 'SM': 'g1', 'PL': 'sanger'}}
예제 #2
0
def _snvs_in_bam(bam, reference, min_quality, default_sanger_quality,
                 min_mapq, min_num_alleles, max_maf, min_num_reads_for_allele,
                 read_edge_conf=None, default_bam_platform=None):
    '''It yields the snv information for every snv in the given reference.

    The bam is walked column by column (pileup) over the given reference.
    For every column the alleles found in the reads are collected, filtered
    and, if they pass the filters, a dict describing the variation is
    yielded.

    Parameters:
      bam -- an opened bam, it should provide pileup() and getrname().
      reference -- the reference sequence. Its name is obtained with
          get_seq_name() and its sequence from the seq attribute.
      min_quality -- threshold passed to _remove_bad_quality_alleles.
      default_sanger_quality -- quality passed to
          _add_default_sanger_quality.
      min_mapq -- reads with a mapping quality lower than this are ignored.
      min_num_alleles -- minimum number of alleles required to yield a
          column. It is converted to int.
      max_maf -- threshold passed to check_maf_ok; the columns that do not
          pass this check are skipped.
      min_num_reads_for_allele -- threshold passed to
          _remove_alleles_by_read_number.
      read_edge_conf -- optional mapping with platforms as keys and
          (edge_left, edge_right) pairs as values. Any of the two items can
          be None to disable the filter on that side.
      default_bam_platform -- platform used when the bam header has no read
          group information and for the reads whose read group is not
          found in the header.

    Yields dicts with the keys: 'ref_name', 'ref_position',
    'reference_allele', 'alleles' and 'read_groups'.

    Raises ValueError if there is no read group information in the bam and
    no default_bam_platform is given. As this is a generator, the error is
    raised when the iteration starts, not when the function is called.
    '''

    min_num_alleles = int(min_num_alleles)

    read_groups_info = get_read_group_info(bam)
    if not read_groups_info:
        if default_bam_platform is None:
            msg = 'Platform is not present either in header or in '
            msg += 'configuration'
            raise ValueError(msg)
        read_groups_info = {UNKNOWN_RG: {'PL': default_bam_platform}}

    reference_id = get_seq_name(reference)
    reference_seq = reference.seq
    reference_len = len(reference_seq)
    #we can clean the cache of segments because we're in a new molecule
    global SEGMENTS_CACHE
    SEGMENTS_CACHE = {}
    for column in bam.pileup(reference=reference_id):
        alleles = {}
        ref_pos = column.pos
        #the columns located beyond the end of the reference are ignored
        if ref_pos >= reference_len:
            continue
        ref_id = bam.getrname(column.tid)
        ref_allele = reference_seq[ref_pos].upper()
        for pileup_read in column.pileups:
            #for each read in the column we add its allele to the alleles dict
            aligned_read = pileup_read.alignment

            read_mapping_qual = aligned_read.mapq
            #We ignore the reads that are likely to be misaligned
            if read_mapping_qual < min_mapq:
                continue

            #the reads without a RG tag are assigned to the unknown group
            try:
                read_group = aligned_read.opt('RG')
            except KeyError:
                read_group = UNKNOWN_RG

            read_name = aligned_read.qname
            #read_groups_info can not be empty at this point, so it is
            #enough to check if the read group is in it
            if read_group in read_groups_info:
                platform = read_groups_info[read_group]['PL']
            else:
                platform = default_bam_platform

            read_pos = pileup_read.qpos

            alleles_here, read_limits = _get_alleles_from_read(ref_allele,
                                                               ref_pos,
                                                               pileup_read)

            if read_edge_conf and platform in read_edge_conf:
                edge_left, edge_right = read_edge_conf[platform]

                #if we're in the edge region to be ignored we continue to
                #the next read, because there's no allele to add for this one.
                #read_limits presumably holds the first and last positions
                #of the read to take into account -- TODO confirm in
                #_get_alleles_from_read

                if (edge_left is not None and
                    read_limits[0] + edge_left > read_pos):
                    continue
                if (edge_right is not None and
                    read_pos > read_limits[1] - edge_right):
                    continue

            for allele in alleles_here:
                allele, kind, qual, is_reverse = allele
                _add_allele(alleles, allele, kind, read_name, read_group,
                    is_reverse, qual, read_mapping_qual,
                    read_groups_info)

        #remove N
        _remove_alleles_n(alleles)

        #add default sanger qualities to the sanger reads with no quality
        _add_default_sanger_quality(alleles, default_sanger_quality,
                                    read_groups_info)

        #remove bad quality alleles
        _remove_bad_quality_alleles(alleles, min_quality)

        #check maf
        if not check_maf_ok(alleles, max_maf):
            continue

        # min_num_reads_for_allele
        _remove_alleles_by_read_number(alleles, min_num_reads_for_allele)

        #if there are a min_num number of alleles requested and there are more
        #alleles than that
        #OR
        #there is some allele different than invariant
        #a variation is yielded
        if not alleles:
            continue
        #When min_num_alleles is 1 the second condition is only reached with
        #exactly one allele in the dict, so looking at the first key is
        #enough. The second item of the key is the kind of the allele.
        #next(iter()) is used instead of keys()[0] because the dict views of
        #Python 3 can not be indexed.
        if (len(alleles) > min_num_alleles or
            (min_num_alleles == 1 and
             next(iter(alleles))[1] != INVARIANT) or
            (min_num_alleles > 1 and len(alleles) >= min_num_alleles)):
            yield {'ref_name':ref_id,
                   'ref_position':ref_pos,
                   'reference_allele':ref_allele,
                   'alleles':alleles,
                   'read_groups':read_groups_info}