def get_genomic_cate1(ele, order, cov_rat=0.75):
    """Return the first category in ``order`` that this isoform record matches.

    ``ele`` is one split output record; its columns are addressed through the
    module-level ``isoform_output_header_idx`` mapping.

    Categories and their conditions:
      * 'Exon'        - blockType contains 'E' but not 'I'
      * 'Exon&Intron' - blockType contains both 'E' and 'I'
      * 'Intron'      - blockType contains 'I' but not 'E'
      * 'Intergenic'  - blockType contains neither 'E' nor 'I'
      * 'Alu' / 'rRNA' / 'OtherRepeat' - the 'Alu' / 'rRNA' / 'allRepeat'
        column (with 'NA' read as 0) is strictly greater than
        ``cov_rat * refMapLen``

    ``order`` is scanned front to back, so it defines the priority between
    categories.  If nothing matches, ``ut.fatal_format_time`` is called.
    """

    def _column_as_len(column):
        # 'NA' marks a missing value and counts as zero.
        raw = ele[isoform_output_header_idx[column]]
        return 0 if raw == 'NA' else int(raw)

    block_type = ele[isoform_output_header_idx['blockType']].split(',')
    has_exon = 'E' in block_type
    has_intron = 'I' in block_type

    alu_len = _column_as_len('Alu')
    rRNA_len = _column_as_len('rRNA')
    all_TE_len = _column_as_len('allRepeat')
    thres_len = cov_rat * int(ele[isoform_output_header_idx['refMapLen']])

    checks = (
        ('Exon', has_exon and not has_intron),
        ('Exon&Intron', has_exon and has_intron),
        ('Intron', has_intron and not has_exon),
        ('Intergenic', not (has_exon or has_intron)),
        ('Alu', alu_len > thres_len),
        ('rRNA', rRNA_len > thres_len),
        ('OtherRepeat', all_TE_len > thres_len),
    )
    satisfied = [label for label, holds in checks if holds]

    for cate in order:
        if cate in satisfied:
            return cate
    ut.fatal_format_time('get_genomic_cate1', 'No category is assigned.')
def get_sorted_exon_block_from_bed12(in_bed, bam_fn=''):
    """Load exon blocks from a BED12 file, sorted and indexed by reference id.

    Args:
        in_bed: path to a tab-separated BED12 file.  Empty lines and lines
            starting with '#' are skipped.
        bam_fn: path to a BAM file; it is only used to translate chromosome
            names into reference ids via ``get_tid``.  An empty value is
            reported through ``ut.fatal_format_time``.

    Returns:
        (exon_block, block_index)

        exon_block: list with one entry per BED record.  Each entry is a list
            whose first item is ``(name, name)`` and whose remaining items are
            exon tuples ``(tid, is_reverse, start, end)`` with
            ``start = chromStart + exonStart + 1`` and
            ``end = chromStart + exonStart + blockSize``.  Entries are sorted
            by (tid of first exon, start of first exon, end of last exon).
        block_index: dict mapping tid -> ``(first, end)`` positions in
            ``exon_block``; ``end`` is exclusive here.
    """
    if not bam_fn:
        ut.fatal_format_time("get_sorted_exon_block_from_bed12",
                             'No BAM header provided.')
    # chromStart: 0-base, exonStarts: 0-base
    header_ele = [
        'chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand',
        'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes',
        'exonStarts'
    ]
    bed_header = {name: idx for idx, name in enumerate(header_ele)}
    ut.err_format_time("get_sorted_exon_block_from_bed12",
                       "Loading exon block from {} ...".format(in_bed))
    exon_block = []
    # Both handles are managed by ``with`` so the BAM file is closed even when
    # a malformed BED line raises while parsing.
    with ps.AlignmentFile(bam_fn) as bam, open(in_bed, 'r') as bed:
        for line in bed:
            line = line.rstrip()
            if not line or line.startswith('#'):
                continue
            ele = line.split('\t')
            chrom = ele[bed_header['chrom']]
            strand = ele[bed_header['strand']]
            start = int(ele[bed_header['chromStart']])
            start_array = ele[bed_header['exonStarts']].split(',')
            len_array = ele[bed_header['blockSizes']].split(',')
            # BED12 lists usually end with a trailing comma, which leaves one
            # empty item after the split.
            if '' in start_array:
                start_array.remove('')
            if '' in len_array:
                len_array.remove('')
            exon_start = [int(i) for i in start_array]
            exon_len = [int(i) for i in len_array]

            # tid and strand are per-record, so resolve them once per line.
            tid = bam.get_tid(chrom)
            is_rev = strand == '-'
            name = ele[bed_header['name']]
            record = [(name, name)]
            for s, l in zip(exon_start, exon_len):
                record.append(
                    (tid, is_rev, int(start + s + 1), int(start + s + l)))
            exon_block.append(record)

    exon_block = sorted(exon_block,
                        key=lambda x: (x[1][0], x[1][2], x[-1][3]))

    # index: tid -> (first position, position one past the last)
    block_index = {}
    last_tid, i_start = -1, 0
    for i, block in enumerate(exon_block):
        tid = block[1][0]
        if tid != last_tid:
            if last_tid != -1:
                block_index[last_tid] = (i_start, i)
            i_start = i
            last_tid = tid
    if last_tid != -1:
        block_index[last_tid] = (i_start, len(exon_block))
    ut.err_format_time("get_sorted_exon_block_from_bed12",
                       "Loading exon block from {} done!".format(in_bed))
    return exon_block, block_index
def get_input(in_fn, in_type, detailed):
    """Load ``in_fn`` with the reader that matches ``in_type``.

    Args:
        in_fn: path of the input file, passed straight to the reader.
        in_type: one of 'whole', 'isoform', 'ciri' or 'bed'.
        detailed: when true, use the "detailed" reader variant.  Only
            'whole' and 'isoform' have one; requesting it for 'ciri' or
            'bed' is reported through ``ut.fatal_format_time``.

    Returns:
        Whatever the selected reader returns.

    An unrecognised ``in_type`` is reported through ``ut.fatal_format_time``
    rather than silently yielding ``None``.
    """
    if detailed and in_type in ('ciri', 'bed'):
        # Name the type that was actually requested (the message used to say
        # 'ciri' even for 'bed' input).
        ut.fatal_format_time(
            'get_input',
            '{} does not support detailed alignment information.'.format(
                in_type))
    if in_type == 'whole':
        return get_detailed_long_whole_input(
            in_fn) if detailed else get_long_whole_input(in_fn)
    if in_type == 'isoform':
        return get_detailed_long_isoform_input(
            in_fn) if detailed else get_long_isoform_input(in_fn)
    if in_type == 'ciri':
        return get_short_ciri_input(in_fn)
    if in_type == 'bed':
        return get_short_bed_input(in_fn)
    ut.fatal_format_time('get_input',
                         'Unknown input type: {}.'.format(in_type))
def stats_core(args):
    """Run the statistics routine selected by ``args.type``.

    ``args`` must provide ``type``, ``samp_name``, ``isoform_res`` and
    ``stats_out``; ``read_type`` is additionally read when ``type`` is 'all'.

      * 'all'            -> ``nano_get_all_stats`` if ``read_type`` is 'nano',
                            otherwise ``pb_get_all_stats``
      * 'regionArea'     -> ``classify_read_by_mapped_region``
      * 'sharedKnownBSJ' -> ``get_shared_read_with_known_BSJ``

    Any other ``type`` is reported through ``ut.fatal_format_time``.
    """
    if args.type == 'all':
        if args.read_type == 'nano':
            nano_get_all_stats(args.samp_name, args.isoform_res,
                               args.stats_out)
        else:
            pb_get_all_stats(args.samp_name, args.isoform_res,
                             args.stats_out)
    elif args.type == 'regionArea':
        classify_read_by_mapped_region(args.samp_name, args.isoform_res,
                                       args.stats_out)
    elif args.type == 'sharedKnownBSJ':
        get_shared_read_with_known_BSJ(args.samp_name, args.isoform_res,
                                       args.stats_out)
    else:
        # Report the requested value; the bare name ``type`` is the builtin
        # and would print "<class 'type'>" instead.
        ut.fatal_format_time('stats_core',
                             'Unknown stats type: {}.'.format(args.type))
def high_qual_record(r_array,
                     high_max_ratio=high_max_ratio,
                     high_min_ratio=high_min_ratio,
                     high_iden_ratio=high_iden_ratio):
    """Pick the high-quality alignment record of one read, or None.

    ``r_array`` holds the alignment records of a single read with the primary
    record first; a secondary, supplementary or unmapped first record is
    reported through ``ut.fatal_format_time``.

    A record qualifies when its aligned read length divided by half of the
    primary record's read length lies in
    ``[high_min_ratio, high_max_ratio]`` and ``get_iden_ratio`` is at least
    ``high_iden_ratio``.  Among qualifying records:

      * if any of them sits on a reference whose name is 6 or more characters
        long, or starts with 'chrM' or 'chrUn', the read is rejected (None);
      * a record replaces the current best only with a strictly higher AS
        tag, and a non-primary record must additionally be on another
        reference than the primary or not overlap it.

    Returns the primary record if it qualified while holding the best score
    at that point, otherwise the best qualifying record, otherwise None.

    NOTE(review): the nesting of the score comparison inside the quality
    filter was reconstructed from a collapsed source line - confirm.
    """
    if not r_array:
        return None

    primary = r_array[0]
    if (primary.is_secondary or primary.is_supplementary
            or primary.is_unmapped):
        ut.fatal_format_time('high_qual_record',
                             'Error: input SAM file is sorted or modified.')

    # 1-based inclusive reference span of the primary record.
    primary_first = primary.reference_start + 1
    primary_last = primary_first + pb.get_ref_op_length(primary) - 1
    half_read_len = (0.0 + pb.get_read_op_length(primary)) / 2

    chosen = None
    top_score = -1
    primary_qualifies = False
    for idx, rec in enumerate(r_array):
        coverage = pb.get_aligned_read_length(rec) / half_read_len
        identity = get_iden_ratio(rec)
        score = int(rec.get_tag('AS'))

        passes = (high_min_ratio <= coverage <= high_max_ratio
                  and identity >= high_iden_ratio)
        if not passes:
            continue

        ref_name = rec.reference_name
        if (len(ref_name) >= 6 or ref_name.startswith('chrM')
                or ref_name.startswith('chrUn')):
            return None

        if score <= top_score:
            continue

        if idx == 0:
            primary_qualifies = True
            chosen, top_score = rec, score
        # if rec is not the primary record, it has to NOT overlap with it
        elif ref_name != primary.reference_name or (
                rec.reference_start + 1 > primary_last
                or rec.reference_start + pb.get_ref_op_length(rec) <
                primary_first):
            chosen, top_score = rec, score

    if chosen is None:
        return None
    return primary if primary_qualifies else chosen
def get_sorted_splice_junction_from_gtf(in_gtf,
                                        is_db,
                                        include_end,
                                        bam_fn=""):
    """Collect the splice junctions of all transcripts, sorted and indexed.

    Args:
        in_gtf: an annotation database object when ``is_db`` is true,
            otherwise a value handed to ``restore_gff_db`` to obtain one.
        is_db: whether ``in_gtf`` already is the database object.
        include_end: when true, additionally record for every transcript the
            pair ``(transcript end + 1, transcript start - 1)``.
        bam_fn: path to a BAM file, used only to translate chromosome names
            into reference ids via ``get_tid``.  An empty value is reported
            through ``ut.fatal_format_time``.

    Returns:
        (sj_list, sj_index)

        sj_list: ``dict2list`` of the junction dict, whose keys are
            ``(tid, is_reverse, start, end)`` with ``start`` = previous exon
            end + 1 and ``end`` = next exon start - 1, sorted by items 0, 2
            and 3 of each entry.
        sj_index: dict mapping tid -> ``(first, last)`` positions in
            ``sj_list``, both inclusive.
    """
    if not bam_fn:
        ut.fatal_format_time("get_sorted_splice_junction_from_gtf",
                             'No BAM header provided.')
    gtf_db = in_gtf if is_db else restore_gff_db(in_gtf)
    ut.err_format_time("get_sorted_splice_junction_from_gtf",
                       "Loading splice junction from {} ... ".format(in_gtf))
    sj = dict()
    # ``with`` closes the BAM handle even if the annotation walk raises.
    with ps.AlignmentFile(bam_fn) as bam:
        for trans in gtf_db.features_of_type('transcript', order_by='start'):
            sj_start, sj_end = -1, -1
            tid = bam.get_tid(trans.chrom)
            is_rev = trans.strand == '-'
            for exon in gtf_db.children(trans,
                                        featuretype='exon',
                                        order_by='start'):
                sj_end = exon.start - 1
                # sj_start is still -1 at the first exon, so no junction is
                # emitted until a previous exon has been seen.
                if sj_start > 0 and sj_end > 0:
                    sj[(tid, is_rev, sj_start, sj_end)] = 1
                sj_start = exon.end + 1
            if include_end:
                sj[(tid, is_rev, trans.end + 1, trans.start - 1)] = 1

    sj_list = sorted(dict2list(sj), key=lambda x: (x[0], x[2], x[3]))

    # index: tid -> (first position, last position), both inclusive
    sj_index = {}
    last_tid, i_start = -1, 0
    for i, sj1 in enumerate(sj_list):
        tid = sj1[0]
        if tid != last_tid:
            if last_tid != -1:
                sj_index[last_tid] = (i_start, i - 1)
            i_start = i
            last_tid = tid
    if last_tid != -1:
        # The final group ends at the last list position.  ``i - 1`` here
        # would drop the final junction, because ``i`` is already the last
        # position once the loop has finished.
        sj_index[last_tid] = (i_start, len(sj_list) - 1)
    ut.err_format_time("get_sorted_splice_junction_from_gtf",
                       "Loading splice junction from {} done!".format(in_gtf))
    return sj_list, sj_index
def get_iden_ratio(r):
    """Return the identity ratio of alignment record ``r``, or -1.0.

    The CIGAR is scanned first: insertion and deletion lengths are summed
    separately, both sums restart at every reference skip, and -1.0 is
    returned as soon as a sum exceeds the module-level ``max_ins_len`` /
    ``max_del_len``.

    Otherwise the result is ``(map_len - NM + total_del_len) / map_len``,
    where ``map_len`` comes from ``pb.get_aligned_read_length`` and
    ``total_del_len`` from ``pb.get_cigar_len(r, pb.BAM_CDEL)``.  A missing
    NM tag is reported through ``ut.fatal_format_time``.
    """
    run_ins, run_del = 0, 0
    for op, op_len in r.cigartuples:
        if op == pb.BAM_CINS:
            run_ins += op_len
            if run_ins > max_ins_len:
                return -1.0
        elif op == pb.BAM_CDEL:
            run_del += op_len
            if run_del > max_del_len:
                return -1.0
        elif op == pb.BAM_CREF_SKIP:
            # A reference skip starts a new block: restart both sums.
            run_ins, run_del = 0, 0

    map_len = pb.get_aligned_read_length(r)
    if not r.has_tag('NM'):
        ut.fatal_format_time('bam_classify', 'No NM tag found.\n')
    edit_dist = int(r.get_tag('NM'))
    total_del = int(pb.get_cigar_len(r, pb.BAM_CDEL))
    # Deleted bases are added back after subtracting NM from the aligned
    # read length.
    identical = map_len - edit_dist + total_del
    return identical / (map_len + 0.0)
def get_sorted_exon_block_from_gtf(in_gtf, is_db, bam_fn=''):
    """Load per-transcript exon blocks from an annotation, sorted and indexed.

    Args:
        in_gtf: an annotation database object when ``is_db`` is true,
            otherwise a value handed to ``restore_gff_db`` to obtain one.
        is_db: whether ``in_gtf`` already is the database object.
        bam_fn: path to a BAM file, used only to translate chromosome names
            into reference ids via ``get_tid``.  An empty value is reported
            through ``ut.fatal_format_time``.

    Returns:
        (exon_block_list, block_index)

        exon_block_list: ``dict2list`` of a dict keyed by
            ``(transcript_id, gene_id)`` whose values are lists of exon
            tuples ``(tid, is_reverse, start, end)``; sorted by (tid of item
            1, start of item 1, end of the last item) of each entry.
        block_index: dict mapping tid -> ``(first, last)`` positions in
            ``exon_block_list``, both inclusive.
    """
    if not bam_fn:
        ut.fatal_format_time("get_sorted_exon_block_from_gtf",
                             'No BAM header provided.')
    gtf_db = in_gtf if is_db else restore_gff_db(in_gtf)
    ut.err_format_time("get_sorted_exon_block_from_gtf",
                       "Loading exon block from {} ... ".format(in_gtf))
    exon_block_dict = dd(lambda: [])
    with ps.AlignmentFile(bam_fn) as bam:
        for exon in gtf_db.features_of_type('exon', order_by='start'):
            ID = (exon.attributes['transcript_id'][0],
                  exon.attributes['gene_id'][0])
            exon_block_dict[ID].append(
                (bam.get_tid(exon.chrom), exon.strand == '-',
                 int(exon.start), int(exon.end)))

    exon_block_list = sorted(dict2list(exon_block_dict),
                             key=lambda x: (x[1][0], x[1][2], x[-1][3]))

    # index: tid -> (first position, last position), both inclusive
    block_index = {}
    last_tid, i_start = -1, 0
    for i, block in enumerate(exon_block_list):
        tid = block[1][0]
        if tid != last_tid:
            if last_tid != -1:
                block_index[last_tid] = (i_start, i - 1)
            i_start = i
            last_tid = tid
    if last_tid != -1:
        # The final group ends at the last list position.  ``i - 1`` here
        # would drop the final block, because ``i`` is already the last
        # position once the loop has finished.
        block_index[last_tid] = (i_start, len(exon_block_list) - 1)
    ut.err_format_time("get_sorted_exon_block_from_gtf",
                       "Loading exon block from {} done!".format(in_gtf))
    return exon_block_list, block_index
def get_error_from_MD(cigartuples=[], mdstr='', full_query_seq='', ref_start=0):
    """Walk a CIGAR and its MD string together and list the alignment errors.

    Args:
        cigartuples: sequence of (operation, length) pairs; operations are
            compared against the BAM_C* constants.  The default list is never
            mutated here.
        mdstr: the MD string that belongs to the same alignment.
        full_query_seq: query sequence indexed by ``query_pos``, which also
            advances over soft- and hard-clipped lengths.
        ref_start: reference position assigned to the first CIGAR operation.

    Returns:
        (mis, ins, dele), three lists of lists:
          mis:  [query_pos, ref_pos, length, query_bases, ref_bases];
                a mismatch that directly follows the previous one on the
                query is merged into that entry.
          ins:  [query_pos, ref_pos - 1, length, inserted_query_bases]
          dele: [query_pos - 1, ref_pos, length, deleted_bases_from_MD]

    Exits the process (``sys.exit(1)`` or ``ut.fatal_format_time``) when the
    MD string and the CIGAR disagree or an unexpected CIGAR operation is met.

    NOTE(review): the block nesting was reconstructed from a collapsed source
    line - confirm against the upstream file.
    """
    mis, ins, dele = [], [], []
    last_error = ''
    # md_i: current MD segment; m_pos: offset already consumed inside it.
    md_i, m_pos = 0, 0
    # Put spaces around every deletion run ('^' + bases, swallowing trailing
    # zeros) so that a whitespace split separates deletion segments from
    # match/mismatch segments.
    mdSub = re.sub(r'([\\^][ACGTNacgtn]+)[0]*', ' \\1 ', mdstr)
    mdSplit = mdSub.rsplit()
    ref_pos, query_pos = ref_start, 0
    for tuples in cigartuples:
        if tuples[0] == BAM_CMATCH:
            m = mdSplit[md_i]
            # A match operation must not line up with a deletion segment.
            if m.startswith('^'):
                ut.format_time(sys.stderr, 'get_error_from_MD', 'Unexpected MD string: {}\n'.format(mdstr))
                sys.exit(1)
            # Length covered by this MD segment: every base letter counts as
            # 1 and every number as itself.
            mSub = re.sub(r'([ACGTNacgtn])', ' \\1 ', m)
            m_len = sum(
                map(int, (re.sub(r'([ACGTNacgtn])', '1', mSub)).rsplit()))
            # from m_pos to m_pos + tuples[1]
            # get_spec_MD is defined elsewhere; presumably it yields the MD
            # pieces (numbers / single bases) inside that window - TODO confirm.
            sub_ms = get_spec_MD(m, m_pos, m_pos + tuples[1])
            for ms in sub_ms:
                if ms.isalpha():  # MISMATCH
                    if full_query_seq[query_pos] != ms:
                        # Start a new entry unless this mismatch directly
                        # continues the previous mismatch entry on the query.
                        if last_error != 'MIS' or mis[-1][0] + mis[-1][
                                2] != query_pos:
                            mis_error = [
                                query_pos, ref_pos, 1,
                                full_query_seq[query_pos], ms
                            ]
                            mis.append(mis_error)
                        else:  # extend the previous entry: length, query bases, ref bases
                            mis[-1][-3] += 1
                            mis[-1][-2] += full_query_seq[query_pos]
                            mis[-1][-1] += ms
                        last_error = 'MIS'
                    else:
                        # MD names a mismatch but the query base equals it.
                        ut.fatal_format_time(
                            'get_error_from_MD',
                            'MIS error: {} v.s {}.'.format(
                                full_query_seq[query_pos], ms))
                    query_pos += 1
                    ref_pos += 1
                elif ms.isdigit():  # MATCH
                    query_pos += int(ms)
                    ref_pos += int(ms)
            # Move to the next MD segment once this one is fully consumed,
            # otherwise remember how far into it we are.
            if m_pos + tuples[1] == m_len:
                md_i += 1
                m_pos = 0
            elif m_pos + tuples[1] < m_len:
                m_pos += tuples[1]
            else:  # the CIGAR match runs past the end of the MD segment
                ut.format_time(sys.stderr, 'get_error_from_MD', 'Unexpected MD string: {}\n'.format(mdstr))
                sys.exit(1)
        elif tuples[0] == BAM_CDEL:
            m = mdSplit[md_i]
            # A deletion must line up with a '^' segment of the MD string.
            if not m.startswith('^'):
                ut.format_time(sys.stderr, 'get_error_from_MD', 'Unexpected MD string: {}\n'.format(mdstr))
                sys.exit(1)
            del_error = [query_pos - 1, ref_pos, tuples[1], m[1:]]
            dele.append(del_error)
            ref_pos += tuples[1]
            last_error = 'DEL'
            md_i += 1
        elif tuples[0] == BAM_CINS:
            # Insertions do not appear in MD; the bases come from the query.
            ins_error = [
                query_pos, ref_pos - 1, tuples[1],
                full_query_seq[query_pos:query_pos + tuples[1]]
            ]
            ins.append(ins_error)
            query_pos += tuples[1]
            last_error = 'INS'
        elif tuples[0] == BAM_CSOFT_CLIP or tuples[0] == BAM_CHARD_CLIP:
            # Clipped bases only advance the query position.
            query_pos += tuples[1]
        elif tuples[0] == BAM_CREF_SKIP:
            # A reference skip only advances the reference position.
            ref_pos += tuples[1]
        else:
            ut.format_time(sys.stderr, 'get_error_from_MD', 'Unexpected cigar: {}\n'.format(cigartuples))
            sys.exit(1)
    return mis, ins, dele