예제 #1
0
def extract_seq(contig, strand, ref_clv, ref_fa, ctg_clv, window=50):
    """
    Extract the upstream sequence used for the PAS hexamer search.

    The actual extraction is delegated to ``xseq_plus.extract`` or
    ``xseq_minus.extract`` depending on ``strand``.

    :param contig: aligned contig; its ``reference_name``, ``cigartuples``
                   and query sequence are read here.
    :param strand: '+' or '-'.
    :param ref_clv: cleavage site position on the reference; it is needed
                    when the sequence has to be combined from both the
                    contig and the reference genome.
    :param ref_fa: reference fasta object, passed through to the extractor
                   unchanged.
    :param ctg_clv: the position of clv in contig coordinate.
    :param window: passed through to the extractor; presumably the length of
                   the upstream region to extract -- TODO confirm.
    :return: whatever the strand-specific ``extract`` returns.
    :raises ValueError: if ``strand`` is neither '+' nor '-'.
    """
    # Fail fast: reject a bad strand before spending any work on inferring
    # the query sequence of a hard-clipped contig.
    if strand not in ('+', '-'):
        raise ValueError('unknown strand: "{0}"'.format(strand))

    # TODO: this can be done in one step, but needs update a lot of tests
    if apautils.is_hardclipped(contig):
        # query_sequence is not used for hard-clipped contigs; the sequence
        # is inferred by the helper instead.
        ctg_seq = apautils.infer_query_sequence(contig, always=True)
    else:
        ctg_seq = contig.query_sequence

    extractor = xseq_plus if strand == '+' else xseq_minus
    return extractor.extract(
        contig.cigartuples,
        ctg_seq,
        contig.reference_name,
        strand,
        ctg_clv,
        ref_clv,
        ref_fa,
        window,
    )
예제 #2
0
def gen_clv_record(contig, r2c_bam, tail_side, ref_fa):
    """
    Assemble the ClvRecord for a cleavage site backed by a suffix contig.

    :param contig: suffix contig
    :param r2c_bam: pysam instance of read2genome alignment BAM
                    (NOTE(review): the name suggests read-to-contig --
                    confirm which alignment is expected)
    :param tail_side (TODO, rename to tail_direction): 'left' or 'right'
    :param ref_fa: pysam instance of reference genome fasta. if provided,
                   will also search PAS hexamer on reference genome.
    :return: a ClvRecord with evidence_type 'suffix'; all bridge, link and
             blank counters are zero.
    """
    # Where the cleavage site sits, on the reference and on the contig.
    strand = calc_strand(tail_side)
    ref_clv = calc_ref_clv(contig, tail_side)
    tail_len = apautils.calc_tail_length(contig, tail_side)
    ctg_clv = calc_ctg_clv(
        strand, contig.infer_query_length(always=True), tail_len)

    # Read-level support for the site.
    read_count, longest_read_tail = analyze_suffix_reads(
        r2c_bam, contig, ctg_clv)

    # PAS hexamer looked up on the contig, then on the reference.
    c_hex, c_hex_id, c_hex_pos = gen_contig_hexamer_tuple(
        contig, strand, ref_clv, ref_fa, ctg_clv)
    seqname = contig.reference_name
    r_hex, r_hex_id, r_hex_pos = gen_reference_hexamer_tuple(
        ref_fa, seqname, strand, ref_clv)

    return ClvRecord(
        seqname, strand, ref_clv,
        c_hex, c_hex_id, c_hex_pos,
        r_hex, r_hex_id, r_hex_pos,

        evidence_type='suffix',
        contig_id_at_pos='{0}@{1}'.format(contig.query_name, ctg_clv),
        contig_len=contig.query_length,
        contig_mapq=contig.mapq,
        contig_is_hardclipped=apautils.is_hardclipped(contig),

        # suffix evidence: the only kind this record carries
        num_suffix_reads=read_count,
        max_suffix_read_tail_len=longest_read_tail,
        suffix_contig_tail_len=tail_len,
        num_suffix_contigs=1,

        num_bridge_reads=0,
        max_bridge_read_tail_len=0,
        num_bridge_contigs=0,

        num_link_reads=0,
        num_link_contigs=0,

        num_blank_contigs=0,
    )
예제 #3
0
파일: link.py 프로젝트: zyxue/kleat
def gen_clv_record(contig, clv_key_tuple, num_link_reads, ref_fa):
    """
    Assemble the ClvRecord for a cleavage site backed by a link contig.

    :param contig: link contig
    :param clv_key_tuple: a tuple of (seqname, strand, cleavage_site_position)
    :param num_link_reads: stored as-is in the record's num_link_reads field
    :param ref_fa: if provided, search PAS hexamer on reference genome, too
    :return: a ClvRecord with evidence_type 'link'; all suffix, bridge and
             blank counters are zero.
    """
    seqname, strand, ref_clv = clv_key_tuple
    ctg_clv = calc_ctg_clv(strand, contig.infer_query_length(always=True))

    # PAS hexamer looked up on the contig, then on the reference. The
    # reference lookup uses the contig's own reference_name rather than the
    # seqname taken from the key tuple.
    c_hex, c_hex_id, c_hex_pos = gen_contig_hexamer_tuple(
        contig, strand, ref_clv, ref_fa, ctg_clv)
    r_hex, r_hex_id, r_hex_pos = gen_reference_hexamer_tuple(
        ref_fa, contig.reference_name, strand, ref_clv)

    return ClvRecord(
        seqname, strand, ref_clv,
        c_hex, c_hex_id, c_hex_pos,
        r_hex, r_hex_id, r_hex_pos,

        evidence_type='link',
        contig_id_at_pos='{0}@{1}'.format(contig.query_name, ctg_clv),
        contig_len=contig.query_length,
        contig_mapq=contig.mapq,
        contig_is_hardclipped=apautils.is_hardclipped(contig),

        num_suffix_reads=0,
        max_suffix_read_tail_len=0,
        suffix_contig_tail_len=0,
        num_suffix_contigs=0,

        num_bridge_reads=0,
        max_bridge_read_tail_len=0,
        num_bridge_contigs=0,

        # link evidence: the only kind this record carries
        num_link_reads=num_link_reads,
        num_link_contigs=1,

        num_blank_contigs=0,
    )
예제 #4
0
def gen_two_clv_records(contig, ref_fa, already_supported_clv_keys):
    """
    Yield "blank" ClvRecords for the two ends of a contig.

    Assume there is still a clv at the 3' end of the contig even without any
    polya evidence. In this case there is no direction, so either end of the
    contig could be a clv -- hence the "two" in the function name.

    The '+' candidate sits at ``contig.reference_end - 1`` and the '-'
    candidate at ``contig.reference_start``. A candidate whose clv key is
    already in ``already_supported_clv_keys`` is skipped, so zero, one or two
    records are yielded.

    :param contig: aligned contig
    :param ref_fa: reference fasta object, passed to the hexamer helpers
    :param already_supported_clv_keys: container of clv key tuples to skip
    """
    seqname = contig.reference_name
    candidates = (
        ('+', contig.reference_end - 1),
        ('-', contig.reference_start),
    )
    hardclipped = apautils.is_hardclipped(contig)

    for strand, ref_clv in candidates:
        key = apautils.gen_clv_key_tuple(seqname, strand, ref_clv)
        if key in already_supported_clv_keys:
            # this site already has support from other evidence
            continue

        ctg_clv = calc_ctg_clv(
            strand, contig.infer_query_length(always=True))

        # PAS hexamer looked up on the contig, then on the reference.
        c_hex, c_hex_id, c_hex_pos = gen_contig_hexamer_tuple(
            contig, strand, ref_clv, ref_fa, ctg_clv)
        r_hex, r_hex_id, r_hex_pos = gen_reference_hexamer_tuple(
            ref_fa, seqname, strand, ref_clv)

        yield ClvRecord(
            seqname, strand, ref_clv,
            c_hex, c_hex_id, c_hex_pos,
            r_hex, r_hex_id, r_hex_pos,
            evidence_type='blank',
            contig_id_at_pos='{0}@{1}'.format(contig.query_name, ctg_clv),
            contig_len=contig.query_length,
            contig_mapq=contig.mapq,
            contig_is_hardclipped=hardclipped,
            num_suffix_reads=0,
            max_suffix_read_tail_len=0,
            suffix_contig_tail_len=0,
            num_suffix_contigs=0,
            num_bridge_reads=0,
            max_bridge_read_tail_len=0,
            num_bridge_contigs=0,
            num_link_reads=0,
            num_link_contigs=0,
            # blank evidence: the only kind this record carries
            num_blank_contigs=1,
        )