示例#1
0
def split_aligment(x, split_length=CIGAR_OPS_LIMIT):
    """Split an aligned read into consecutive pieces of bounded CIGAR size.

    The decompressed CIGAR of ``x`` (as returned by
    ``butil.decompress_cigar_pairs``) is cut into windows of at most
    ``split_length`` entries. Every window becomes a deep copy of ``x`` whose
    reference start, CIGAR, ``MD`` tag and query sequence are rewritten so the
    copy covers only that window.

    Args:
        x: Aligned read exposing ``query_length``, ``query_sequence``,
            ``cigar``, ``reference_start``, ``get_reference_sequence()`` and
            ``set_tag()`` (presumably a ``pysam.AlignedSegment`` -- confirm
            against callers).
        split_length: Window size applied to the decompressed CIGAR.

    Returns:
        ``[x]`` (the very same object) when ``x.query_length`` is smaller
        than ``split_length``; otherwise a list of modified copies, in
        alignment order.
    """
    # Short reads are handed back untouched, so decide that before paying for
    # the reference reconstruction and CIGAR decompression below (both are
    # unused on this path and the former may fail on reads it cannot handle).
    # NOTE(review): the guard compares the *query* length while the loop
    # windows over the decompressed CIGAR length -- confirm this is intended.
    if x.query_length < split_length:
        return [x]

    ref = x.get_reference_sequence()
    cigar = butil.decompress_cigar_pairs(x.cigar)

    ret = []

    query_pos = 0  # how much of x.query_sequence earlier pieces consumed
    ref_pos = 0  # how much of ``ref`` earlier pieces consumed

    for i in range(0, len(cigar), split_length):
        piece_cigar = cigar[i:i + split_length]
        piece_query_len = butil.get_read_len_from_cigar(piece_cigar)

        piece = deepcopy(x)
        piece.reference_start += ref_pos
        piece.cigar = butil.compress_cigar(piece_cigar, 'ints')

        # The MD tag is regenerated from the not-yet-consumed part of the
        # reference and the CIGAR that was just stored on the copy.
        piece.set_tag('MD', butil.generate_md_tag(ref[ref_pos:],
                                                  piece.cigar))
        # NOTE(review): if ``piece`` is a pysam.AlignedSegment, assigning
        # query_sequence resets its base qualities -- confirm whether the
        # qualities should be re-sliced here as well.
        piece.query_sequence = x.query_sequence[
            query_pos:query_pos + piece_query_len]

        query_pos += piece_query_len
        ref_pos += butil.get_ref_len_from_cigar(piece_cigar)

        ret.append(piece)

    return ret
示例#2
0
def extend_cigar(read_seq, ref_seq, cigar_pairs, mode='ints'):
    """Return a CIGAR string in which every ``M`` is resolved to ``=``/``X``.

    The read and the reference are first expanded with
    ``butil.query_align_string`` / ``butil.reference_align_string`` so that
    both line up, position by position, with the decompressed CIGAR. Each
    ``M`` entry is then replaced by ``=`` when the two aligned characters
    agree (case-insensitively) and by ``X`` otherwise; every other entry is
    only upper-cased.

    Args:
        read_seq: Query sequence of the alignment.
        ref_seq: Reference sequence of the alignment.
        cigar_pairs: CIGAR pairs describing the alignment.
        mode: Forwarded to ``butil.decompress_cigar_pairs``.

    Returns:
        The result of ``butil.cigar_pairs_to_str(..., 'chars')`` applied to
        the re-compressed, resolved CIGAR.

    Raises:
        AssertionError: If the aligned strings and the decompressed CIGAR
            differ in length.
    """
    ops = butil.decompress_cigar_pairs(cigar_pairs, mode)

    aligned_ref = butil.reference_align_string(ref_seq, cigar_pairs)
    aligned_read = butil.query_align_string(read_seq, cigar_pairs)

    assert len(aligned_ref) == len(ops) and len(aligned_read) == len(ops)

    resolved = []
    for pos, op in enumerate(ops):
        if op.upper() != 'M':
            resolved.append(op.upper())
            continue
        # An alignment match: decide between sequence match and mismatch.
        if aligned_ref[pos].upper() == aligned_read[pos].upper():
            resolved.append('=')
        else:
            resolved.append('X')

    compressed = butil.compress_cigar(''.join(resolved))
    return butil.cigar_pairs_to_str(compressed, 'chars')
示例#3
0
def get_target_sequences(sam_path):
    """Collect the reference ("target") sequence for every mapped read.

    Iterates over all records of the alignment file at ``sam_path``. Unmapped
    reads, reads whose CIGAR-derived lengths disagree with the record's
    ``query_length`` / ``reference_length``, and reads whose reference
    sequence cannot be obtained are skipped and counted. For reverse-strand
    reads the target is reverse-complemented and the CIGAR pair list is
    reversed. When a query name occurs more than once, the new record is
    combined with the stored one through ``_merge_circular_aligment``; if
    that returns a falsy value the new record is dropped and the stored
    entry is kept.

    Args:
        sam_path: Path of the alignment file, opened with mode ``"r"``.

    Returns:
        dict mapping query name to the list
        ``[target, ref_name, start_pos, length, cigar_str]``.

    Side effects:
        Logs an error for every rejected mapped read and, at the end, a
        warning-level summary of the per-category counters.
    """
    result_dict = {}

    cnt = defaultdict(int)
    with pysam.AlignmentFile(sam_path, "r") as samfile:
        for x in tqdm(samfile.fetch(), desc='Building ref'):
            name = x.query_name
            cnt['total'] += 1

            if x.is_unmapped:
                cnt['unmapped'] += 1
                continue
            try:
                # Hack to bypass a segfault: validate the CIGAR-derived
                # lengths before asking for the reference sequence
                # (presumably the crash happens on inconsistent records --
                # confirm).
                full_cigar = butil.decompress_cigar_pairs(x.cigartuples)
                r_len = butil.get_read_len_from_cigar(full_cigar)
                ref_len = butil.get_ref_len_from_cigar(full_cigar)

                if r_len != x.query_length or ref_len != x.reference_length:
                    logging.error(
                        "%s cigar operations do not match alignment info in md",
                        name)
                    cnt['invalid_md_cigar'] += 1
                    continue

                target = x.get_reference_sequence()
            except (ValueError, AssertionError):
                cnt['missing_ref'] += 1
                logging.error(
                    "%s Mapped but reference len equals 0, md tag: %s", name,
                    x.has_tag('MD'))
                continue

            ref_name = x.reference_name
            length = x.reference_length
            start_pos = x.reference_start
            cigar_pairs = x.cigartuples

            if x.is_reverse:
                target = butil.reverse_complement(target)
                cigar_pairs = list(reversed(cigar_pairs))

            cigar_str = butil.decompress_cigar_pairs(cigar_pairs, mode='ints')

            if name in result_dict:
                # Same query name seen before: try to merge both alignments
                # into one entry instead of overwriting the earlier one.
                prev_target, _, prev_start_pos, _, prev_cigar_str = result_dict[
                    name]
                merged = _merge_circular_aligment(prev_target, prev_start_pos,
                                                  prev_cigar_str, target,
                                                  start_pos, cigar_str,
                                                  x.is_reverse, x.query_name)
                if not merged:
                    continue

                target, start_pos, cigar_str = merged
                # After a merge the stored length is the merged target's.
                length = len(target)
            result_dict[name] = [
                target, ref_name, start_pos, length, cigar_str
            ]

    logging.warning("Results: %s", cnt.items())
    return result_dict