Exemplo n.º 1
0
def _merge_circular_aligment(target_1, start_pos_1, cigar_str_1, target_2,
                             start_pos_2, cigar_str_2, is_reversed, qname):

    if is_reversed:
        # reverse back both
        cigar_str_1 = ''.join(reversed(cigar_str_1))
        target_1 = butil.reverse_complement(target_1)

        cigar_str_2 = ''.join(reversed(cigar_str_2))
        target_2 = butil.reverse_complement(target_2)

    if start_pos_1 == 0:
        start = start_pos_2
        cigar = butil.rtrim_cigar(cigar_str_2) + butil.ltrim_cigar(cigar_str_1)
        target = target_2 + target_1

    elif start_pos_2 == 0:
        start = start_pos_1
        cigar = butil.rtrim_cigar(cigar_str_1) + butil.ltrim_cigar(cigar_str_2)
        target = target_1 + target_2

    else:
        # not circular, duplicate
        logging.error("Duplicate read with name %s", qname)
        return None

    if is_reversed:
        cigar = ''.join(reversed(cigar))
        target = butil.reverse_complement(target)

    return [target, start, cigar]
Exemplo n.º 2
0
def extend_cigars_in_sam(sam_in, ref_path, fastx_path, sam_out=None):
    """Rewrite the CIGAR string of the mapped alignments of a SAM file.

    Every alignment read from ``sam_in`` is handled as follows:

    * if its query name is not present in ``fastx_path`` it is skipped with
      a warning (this check comes first, so it also applies to unmapped
      alignments);
    * if it is unmapped it is copied to the output unchanged;
    * otherwise its ``cigarstring`` is replaced by the result of
      ``extend_cigar(read_seq, ref_seq, cigar_pairs)`` and it is written out.

    Args:
        sam_in: Path of the SAM file to read.
        ref_path: Path of the reference, loaded with ``butil.read_fasta``.
        fastx_path: Path of the FASTA/FASTQ file with the read sequences.
        sam_out: Path of the SAM file to write.  When ``None`` the output
            goes to a temporary file which then replaces ``sam_in``.

    The temporary directory used for in-place mode is removed even when
    processing fails; in that case ``sam_in`` is left untouched.
    """
    inplace = sam_out is None
    tmp_dir = None
    tmp_sam_out = sam_out

    if inplace:
        # inplace change using tmp file
        tmp_dir = tempfile.mkdtemp()
        tmp_sam_out = os.path.join(tmp_dir, 'tmp.sam')

    try:
        ref = butil.read_fasta(ref_path)
        reads = {}

        # Index the reads by name so alignments can be matched to them.
        with pysam.FastxFile(fastx_path, 'r') as fh:
            for r in fh:
                reads[r.name] = r

        with pysam.AlignmentFile(sam_in, "r") as in_sam, \
                pysam.AlignmentFile(tmp_sam_out, "w",
                                    template=in_sam) as out_sam:

            for x in tqdm(in_sam.fetch(), unit='reads'):
                if x.query_name not in reads:
                    logging.warning("read %s in sam not found in .fastx",
                                    x.query_name)
                    continue

                if x.is_unmapped:
                    logging.warning(
                        "read %s is unmapped, copy to out sam as is",
                        x.query_name)
                    out_sam.write(x)
                    continue

                read_seq = reads[x.query_name].sequence
                # NOTE(review): the reference is sliced by position only;
                # presumably a single reference sequence is expected here.
                ref_seq = ref[x.reference_start:x.reference_end]
                cigar_pairs = x.cigartuples

                if x.is_reverse:
                    # Reverse-strand alignment: the read sequence is
                    # reverse-complemented before extending the CIGAR.
                    read_seq = butil.reverse_complement(read_seq)

                x.cigarstring = extend_cigar(read_seq, ref_seq, cigar_pairs)
                out_sam.write(x)

        if inplace:
            # Both files are closed at this point; replace the input.
            shutil.move(tmp_sam_out, sam_in)
    finally:
        if tmp_dir is not None:
            # clear tmp files, also when an error interrupted the rewrite
            shutil.rmtree(tmp_dir)
Exemplo n.º 3
0
def get_target_sequences(sam_path):
    """Collect the reference (target) sequence of every mapped read.

    Iterates over the alignments of ``sam_path`` and builds, per query name,
    a record ``[target, ref_name, start_pos, length, cigar_str]``.  For
    reverse-strand alignments the target is reverse-complemented and the
    CIGAR tuples are reversed before being expanded.  When a query name
    shows up again, the new alignment is combined with the stored one via
    ``_merge_circular_aligment``; if that merge is rejected the stored
    record is kept.

    Unmapped alignments, alignments whose CIGAR-derived lengths disagree
    with the alignment's query/reference length, and alignments for which
    the reference sequence cannot be obtained are skipped and counted.  The
    counters are logged once at the end.

    Args:
        sam_path: Path of the SAM file to read.

    Returns:
        Dict mapping query name to
        ``[target, ref_name, start_pos, length, cigar_str]``.
    """
    stats = defaultdict(int)
    targets = {}

    with pysam.AlignmentFile(sam_path, "r") as sam_fh:
        for aln in tqdm(sam_fh.fetch(), desc='Building ref'):
            qname = aln.query_name
            stats['total'] += 1

            if aln.is_unmapped:
                stats['unmapped'] += 1
                continue

            try:
                # hack to bypass segfault: validate the CIGAR against the
                # alignment lengths before asking for the reference sequence
                expanded = butil.decompress_cigar_pairs(aln.cigartuples)
                read_span = butil.get_read_len_from_cigar(expanded)
                ref_span = butil.get_ref_len_from_cigar(expanded)

                mismatch = (read_span != aln.query_length
                            or ref_span != aln.reference_length)
                if mismatch:
                    logging.error(
                        "%s cigar operations do not match alignment info in md",
                        qname)
                    stats['invalid_md_cigar'] += 1
                    continue

                ref_seq = aln.get_reference_sequence()
            except (ValueError, AssertionError):
                stats['missign_ref'] += 1
                logging.error(
                    "%s Mapped but reference len equals 0, md tag: %s", qname,
                    aln.has_tag('MD'))
                continue

            contig = aln.reference_name
            span = aln.reference_length
            begin = aln.reference_start
            ops = aln.cigartuples
            on_reverse = aln.is_reverse

            if on_reverse:
                ref_seq = butil.reverse_complement(ref_seq)
                ops = list(reversed(ops))

            cigar_seq = butil.decompress_cigar_pairs(ops, mode='ints')

            previous = targets.get(qname)
            if previous is not None:
                # Query name seen before: try to join the two pieces.
                prev_seq, _, prev_begin, _, prev_cigar = previous
                joined = _merge_circular_aligment(prev_seq, prev_begin,
                                                  prev_cigar, ref_seq,
                                                  begin, cigar_seq,
                                                  on_reverse, aln.query_name)
                if not joined:
                    # Merge rejected: keep the record stored earlier.
                    continue

                ref_seq, begin, cigar_seq = joined
                span = len(ref_seq)

            targets[qname] = [ref_seq, contig, begin, span, cigar_seq]

    logging.warning("Results: %s", str(stats.items()))
    return targets