Exemplo n.º 1
0
def _md_segment_at(chromosome, position, subsegments):
    """
    Make sure ``tmp_variants`` has an entry for ``position`` and return the first subsegment spanning it.

    Subsegments that end before ``position`` and are met before a spanning one are removed from
    ``subsegments`` in place.
    :param chromosome: chromosome name handed to ``v.Variant`` when a new entry is created
    :param position: reference position used as key in ``tmp_variants``
    :param subsegments: mutable list of objects exposing ``pos``, ``end`` and ``id``
    :return: the spanning subsegment, or None when no subsegment spans ``position``
    """
    if position not in tmp_variants:
        tmp_variants[position] = v.Variant(chromosome, position)
    # Walk a snapshot: removing from the list being iterated would make the loop skip the
    # element that follows every pruned segment.
    for segment in list(subsegments):
        if segment.pos <= position <= segment.end:
            return segment
        if segment.end < position:
            subsegments.remove(segment)
    return None


def find_variations_md(cigartuples, chromosome, pos, qual, seq, md, subsegments):
    """
    Loops through read to find variant base. This function is used with BWA-MEM, Minimap2 and NGMLR bam files

    Mismatches are located with the MD tag, deletions with the CIGAR. Every variant position is
    registered in the module-level ``tmp_variants`` dict under the key ``pos + 1 + <reference offset>``.
    Only CIGAR operations 0 (match), 1 (insertion), 2 (deletion) and 4 (soft clip) are interpreted;
    any other operation is ignored.
    :param cigartuples: iterable of (operation, length) pairs with numeric operation codes
    :param chromosome: chromosome name handed to ``v.Variant`` for new entries
    :param pos: numeric start of the alignment on the reference (presumably 0-based, as the code adds 1 - confirm with caller)
    :param qual: per-base qualities, indexed like ``seq``
    :param seq: read sequence, indexed by read position (soft clips included)
    :param md: MD tag string of the alignment
    :param subsegments: mutable list of objects exposing ``pos``, ``end`` and ``id``; segments already passed are pruned in place
    """
    # One MD chunk per stretch between deletions; the deleted bases themselves are dropped.
    md_tag = re.split(r"\^\D+", md)
    read_cursor = 0
    ref_cursor = 0
    md_cursor = 0
    # Expanded form of the current MD chunk ('=' per matching base, 'X' per mismatch);
    # None means "not expanded yet for this chunk".
    md_string = None
    for operation, length in cigartuples:
        if operation in (1, 4):
            # Soft clips and insertions only consume read bases.
            read_cursor += length
        elif operation == 2:
            # A deletion closes the current MD chunk.
            md_cursor += 1
            md_string = None
            for _ in range(length):
                position = pos + 1 + ref_cursor
                segment = _md_segment_at(chromosome, position, subsegments)
                if segment is not None:
                    tmp_variants[position].add_segment(segment.id, ['-'])
                ref_cursor += 1
        elif operation == 0:
            md_chunk = md_tag[md_cursor]
            if re.search(r"\D", md_chunk):
                if md_string is None:
                    # Joining with 'X' keeps a mismatch that follows a zero-length run,
                    # e.g. "0A10" or the first base after a deletion.
                    md_string = 'X'.join("=" * int(run) for run in re.split(r"\D", md_chunk))
                for seq_mismatch in re.finditer('X', md_string[:length]):
                    offset = seq_mismatch.start()
                    # The mismatch lies `offset` bases into this match block on the
                    # reference as well as on the read.
                    position = pos + 1 + ref_cursor + offset
                    segment = _md_segment_at(chromosome, position, subsegments)
                    if segment is not None:
                        tmp_variants[position].add_segment(
                            segment.id, [seq[read_cursor + offset], qual[read_cursor + offset]])
                # Keep the unconsumed remainder for the next match block of this chunk
                # (match blocks split by an insertion share one MD chunk).
                md_string = md_string[length:]
            ref_cursor += length
            read_cursor += length
Exemplo n.º 2
0
def find_SNPs(chromosome, snp_position):
    """
    Looks for SNPs on given position in the genome using pileup.

    Every kept read covering the position is added to a fresh ``v.Variant``. The variant is stored
    in the module-level ``variants`` structure only when the position passes all of these checks:
    the number of deletions is below ``NanoSV.opts_max_deletions`` times the number of counted reads,
    the two most frequent alleles each have a high-quality fraction above
    ``NanoSV.opts_min_occurences_of_highq_var``, and the less frequent of the two makes up more than
    ``NanoSV.opts_min_occurences_of_var`` of both together.
    :param chromosome: chromosome name passed to the pileup and to ``v.Variant``
    :param snp_position: position to inspect; converted with ``int()``, the pileup covers [position - 1, position)
    """
    position = int(snp_position)
    # Per allele: [reads at or above the base quality threshold, reads below it]
    base_ratios = {'A': [0, 0], 'C': [0, 0], 'G': [0, 0], 'T': [0, 0], '=': [0, 0]}
    deletions = 0
    total_n = 0
    variant = v.Variant(chromosome, position)
    for pileupcolumn in F.pileup(chromosome, position - 1, position, truncate=True):
        for pileupread in pileupcolumn.pileups:
            alignment = pileupread.alignment
            if not keep_segment(alignment, alignment.query_alignment_length):
                continue
            clip, _ = calculate_clip(alignment)
            segment_id = [alignment.reference_name, alignment.reference_start,
                          str(alignment.query_name) + ";" + str(clip)]
            if pileupread.is_del:
                variant.add_segment(segment_id, '-')
                deletions += 1
                total_n += 1
            elif not pileupread.is_refskip:
                base = alignment.query_sequence[pileupread.query_position]
                base_quality = alignment.query_qualities[pileupread.query_position]
                if base_quality >= NanoSV.opts_min_base_qual_ph:
                    base_ratios[base][0] += 1
                else:
                    base_ratios[base][1] += 1
                variant.add_segment(segment_id, [base, base_quality])
                total_n += 1

    if deletions >= NanoSV.opts_max_deletions * total_n:
        return

    # Two most frequent alleles, least frequent of the two first.
    minor, major = sorted(base_ratios.values(), key=sum)[-2:]
    minor_total = sum(minor)
    major_total = sum(major)
    if minor_total == 0:
        # Fewer than two alleles were observed, so there is nothing to call. Checking this
        # explicitly replaces the former catch-all ZeroDivisionError handler, which also hid
        # a zero NanoSV.opts_variant_bin_size.
        return

    min_highq = NanoSV.opts_min_occurences_of_highq_var
    if minor[0] / minor_total > min_highq and major[0] / major_total > min_highq:
        if minor_total / (major_total + minor_total) > NanoSV.opts_min_occurences_of_var:
            variant_bin = int(position / NanoSV.opts_variant_bin_size)
            variants[chromosome][variant_bin][position] = variant
Exemplo n.º 3
0
def _cigar_segment_at(chromosome, position, subsegments):
    """
    Make sure ``tmp_variants`` has an entry for ``position`` and return the first subsegment spanning it.

    Subsegments that end before ``position`` and are met before a spanning one are removed from
    ``subsegments`` in place.
    :param chromosome: chromosome name handed to ``v.Variant`` when a new entry is created
    :param position: reference position used as key in ``tmp_variants``
    :param subsegments: mutable list of objects exposing ``pos``, ``end`` and ``id``
    :return: the spanning subsegment, or None when no subsegment spans ``position``
    """
    if position not in tmp_variants:
        tmp_variants[position] = v.Variant(chromosome, position)
    # Walk a snapshot: removing from the list being iterated would make the loop skip the
    # element that follows every pruned segment, which could be the spanning one.
    for segment in list(subsegments):
        if segment.pos <= position <= segment.end:
            return segment
        if segment.end < position:
            subsegments.remove(segment)
    return None


def find_variations_cigar(cigartuples, chromosome, pos, qual, seq, subsegments):
    """
    Loops through read to find variant base. This function is used with LAST bam files

    Mismatches and deletions are read directly from the CIGAR, so only the operations
    1 (insertion), 2 (deletion), 4 (soft clip), 7 (sequence match) and 8 (sequence mismatch)
    are interpreted; any other operation is ignored. Every variant position is registered in the
    module-level ``tmp_variants`` dict.
    :param cigartuples: iterable of (operation, length) pairs with numeric operation codes
    :param chromosome: chromosome name handed to ``v.Variant`` for new entries
    :param pos: start of the alignment on the reference, converted with ``int()``; the first aligned base is keyed as ``pos + 1``
    :param qual: per-base qualities, indexed like ``seq``
    :param seq: read sequence, indexed by read position (soft clips included)
    :param subsegments: mutable list of objects exposing ``pos``, ``end`` and ``id``; segments already passed are pruned in place
    """
    ref_cursor = int(pos)
    read_cursor = 0
    for operation, length in cigartuples:
        if operation in (1, 4):
            # Soft clips and insertions only consume read bases.
            read_cursor += length
        elif operation == 7:
            ref_cursor += length
            read_cursor += length
        elif operation == 8:
            for _ in range(length):
                ref_cursor += 1
                segment = _cigar_segment_at(chromosome, ref_cursor, subsegments)
                if segment is not None:
                    tmp_variants[ref_cursor].add_segment(segment.id, [seq[read_cursor], qual[read_cursor]])
                read_cursor += 1
        elif operation == 2:
            for _ in range(length):
                ref_cursor += 1
                segment = _cigar_segment_at(chromosome, ref_cursor, subsegments)
                if segment is not None:
                    tmp_variants[ref_cursor].add_segment(segment.id, ['-'])