def extract_seq(contig, strand, ref_clv, ref_fa, ctg_clv, window=50):
    """
    Extract the upstream sequence for the PAS hexamer search.

    The actual extraction is delegated to ``xseq_plus.extract`` or
    ``xseq_minus.extract`` depending on ``strand``.

    :param contig: aligned contig; its ``reference_name``, ``cigartuples``
        and query sequence are read here.
    :param strand: '+' or '-'; selects the strand-specific extractor.
    :param ref_clv: position of the clv in reference coordinates; needed
        when the sequence has to be combined from both the contig and the
        reference genome.
    :param ref_fa: reference genome fasta handle, forwarded unchanged to
        the extractor.
    :param ctg_clv: the position of the clv in contig coordinates.
    :param window: forwarded unchanged to the extractor.
        NOTE(review): presumably the length of the upstream window to
        extract -- confirm against xseq_plus/xseq_minus.
    :return: whatever the selected ``extract`` function returns.
    :raises ValueError: if ``strand`` is neither '+' nor '-'.
    """
    # Fail fast: reject an unsupported strand before dereferencing the
    # contig or inferring its query sequence, so a bad argument never costs
    # any of that work.
    if strand not in ('+', '-'):
        raise ValueError('unknown strand: "{0}"'.format(strand))

    seqname = contig.reference_name

    # TODO: this can be done in one step, but needs update a lot of tests
    if apautils.is_hardclipped(contig):
        # For hardclipped contigs the sequence is obtained through
        # apautils.infer_query_sequence instead of contig.query_sequence.
        ctg_seq = apautils.infer_query_sequence(contig, always=True)
    else:
        ctg_seq = contig.query_sequence

    extractor = xseq_plus if strand == '+' else xseq_minus
    return extractor.extract(
        contig.cigartuples, ctg_seq, seqname, strand,
        ctg_clv, ref_clv, ref_fa, window)
def gen_clv_record(contig, r2c_bam, tail_side, ref_fa):
    """
    Build the ClvRecord for a cleavage site supported by a suffix contig.

    :param contig: suffix contig
    :param r2c_bam: pysam alignment BAM instance, forwarded to
        ``analyze_suffix_reads`` together with the contig.
        NOTE(review): previously described as a read2genome BAM; the name
        suggests read-to-contig -- confirm.
    :param tail_side: 'left' or 'right'
        (TODO, rename to tail_direction)
    :param ref_fa: pysam instance of reference genome fasta, forwarded to
        both hexamer helpers.
    :return: a ClvRecord with evidence_type 'suffix'; all bridge, link and
        blank counters are zero and num_suffix_contigs is 1.
    """
    strand = calc_strand(tail_side)
    ref_clv = calc_ref_clv(contig, tail_side)

    # Locate the clv in contig coordinates from the tail length and the
    # full contig length.
    tail_len = apautils.calc_tail_length(contig, tail_side)
    ctg_clv = calc_ctg_clv(
        strand, contig.infer_query_length(always=True), tail_len)

    suffix_read_count, longest_read_tail = analyze_suffix_reads(
        r2c_bam, contig, ctg_clv)

    # Hexamer tuples: one from the contig helper, one from the reference
    # helper.
    c_hex, c_hex_id, c_hex_pos = gen_contig_hexamer_tuple(
        contig, strand, ref_clv, ref_fa, ctg_clv)
    r_hex, r_hex_id, r_hex_pos = gen_reference_hexamer_tuple(
        ref_fa, contig.reference_name, strand, ref_clv)

    return ClvRecord(
        contig.reference_name,
        strand,
        ref_clv,

        c_hex, c_hex_id, c_hex_pos,
        r_hex, r_hex_id, r_hex_pos,

        evidence_type='suffix',
        contig_id_at_pos='{0}@{1}'.format(contig.query_name, ctg_clv),
        contig_len=contig.query_length,
        contig_mapq=contig.mapq,
        contig_is_hardclipped=apautils.is_hardclipped(contig),

        # suffix evidence
        num_suffix_reads=suffix_read_count,
        max_suffix_read_tail_len=longest_read_tail,
        suffix_contig_tail_len=tail_len,
        num_suffix_contigs=1,

        # no bridge / link / blank evidence for a suffix record
        num_bridge_reads=0,
        max_bridge_read_tail_len=0,
        num_bridge_contigs=0,
        num_link_reads=0,
        num_link_contigs=0,
        num_blank_contigs=0,
    )
def gen_clv_record(contig, clv_key_tuple, num_link_reads, ref_fa):
    """
    Build the ClvRecord for a cleavage site supported by a link contig.

    :param contig: link contig
    :param clv_key_tuple: a tuple of
        (seqname, strand, cleavage_site_position)
    :param num_link_reads: stored as-is in the record's
        ``num_link_reads`` field
    :param ref_fa: reference genome fasta, forwarded to both hexamer
        helpers
    :return: a ClvRecord with evidence_type 'link'; all suffix, bridge and
        blank counters are zero and num_link_contigs is 1.
    """
    seqname, strand, ref_clv = clv_key_tuple

    # clv position in contig coordinates, derived from strand and the full
    # contig length.
    ctg_clv = calc_ctg_clv(strand, contig.infer_query_length(always=True))

    c_hex, c_hex_id, c_hex_pos = gen_contig_hexamer_tuple(
        contig, strand, ref_clv, ref_fa, ctg_clv)
    # The reference lookup uses the contig's own reference_name rather
    # than the seqname unpacked from the key tuple.
    r_hex, r_hex_id, r_hex_pos = gen_reference_hexamer_tuple(
        ref_fa, contig.reference_name, strand, ref_clv)

    return ClvRecord(
        seqname,
        strand,
        ref_clv,

        c_hex, c_hex_id, c_hex_pos,
        r_hex, r_hex_id, r_hex_pos,

        evidence_type='link',
        contig_id_at_pos='{0}@{1}'.format(contig.query_name, ctg_clv),
        contig_len=contig.query_length,
        contig_mapq=contig.mapq,
        contig_is_hardclipped=apautils.is_hardclipped(contig),

        # no suffix / bridge evidence for a link record
        num_suffix_reads=0,
        max_suffix_read_tail_len=0,
        suffix_contig_tail_len=0,
        num_suffix_contigs=0,
        num_bridge_reads=0,
        max_bridge_read_tail_len=0,
        num_bridge_contigs=0,

        # link evidence
        num_link_reads=num_link_reads,
        num_link_contigs=1,

        num_blank_contigs=0,
    )
def gen_two_clv_records(contig, ref_fa, already_supported_clv_keys):
    """
    Yield up to two 'blank' ClvRecords, one per end of the contig.

    Assume there is still a clv at the 3' end of the contig even without
    any polyA evidence. In this case there is no direction, so either end
    of the contig could be a clv; hence "two" appears explicitly in the
    function name.

    :param contig: contig without polyA evidence
    :param ref_fa: reference genome fasta, forwarded to both hexamer
        helpers
    :param already_supported_clv_keys: container of clv key tuples; a
        candidate whose key is found in it is skipped
    :return: generator of ClvRecord with evidence_type 'blank'
    """
    seqname = contig.reference_name

    # '+' pairs with the last aligned reference position, '-' with the
    # first one.
    candidates = (
        ('+', contig.reference_end - 1),
        ('-', contig.reference_start),
    )
    hardclipped = apautils.is_hardclipped(contig)

    for strand, ref_clv in candidates:
        key = apautils.gen_clv_key_tuple(seqname, strand, ref_clv)
        if key in already_supported_clv_keys:
            # Some other evidence already supports this site.
            continue

        ctg_clv = calc_ctg_clv(
            strand, contig.infer_query_length(always=True))

        c_hex, c_hex_id, c_hex_pos = gen_contig_hexamer_tuple(
            contig, strand, ref_clv, ref_fa, ctg_clv)
        r_hex, r_hex_id, r_hex_pos = gen_reference_hexamer_tuple(
            ref_fa, contig.reference_name, strand, ref_clv)

        yield ClvRecord(
            contig.reference_name,
            strand,
            ref_clv,

            c_hex, c_hex_id, c_hex_pos,
            r_hex, r_hex_id, r_hex_pos,

            evidence_type='blank',
            contig_id_at_pos='{0}@{1}'.format(contig.query_name, ctg_clv),
            contig_len=contig.query_length,
            contig_mapq=contig.mapq,
            contig_is_hardclipped=hardclipped,

            # no suffix / bridge / link evidence for a blank record
            num_suffix_reads=0,
            max_suffix_read_tail_len=0,
            suffix_contig_tail_len=0,
            num_suffix_contigs=0,
            num_bridge_reads=0,
            max_bridge_read_tail_len=0,
            num_bridge_contigs=0,
            num_link_reads=0,
            num_link_contigs=0,

            num_blank_contigs=1,
        )