Пример #1
0
def get_genomic_cate1(ele, order, cov_rat=0.75):
    """Return the first category in ``order`` that applies to one record.

    Args:
        ele: one split row of the isoform output table; columns are looked
            up through the module-level ``isoform_output_header_idx``.
        order: iterable of category names, tried in the given priority.
            Recognised names are 'Exon', 'Exon&Intron', 'Intron',
            'Intergenic', 'Alu', 'rRNA' and 'OtherRepeat'.
        cov_rat: fraction of ``refMapLen`` a repeat length has to exceed
            for the repeat categories to apply.

    Returns:
        The first matching element of ``order``. If nothing matches,
        ``ut.fatal_format_time`` is called.
    """
    col = isoform_output_header_idx

    def _len_or_zero(field):
        # 'NA' marks a missing length and is treated as 0.
        raw = ele[col[field]]
        return 0 if raw == 'NA' else int(raw)

    block_types = ele[col['blockType']].split(',')
    has_exon = 'E' in block_types
    has_intron = 'I' in block_types

    alu_len = _len_or_zero('Alu')
    rrna_len = _len_or_zero('rRNA')
    repeat_len = _len_or_zero('allRepeat')
    min_len = cov_rat * int(ele[col['refMapLen']])

    # The four region categories are mutually exclusive; the three repeat
    # categories each depend only on their own length.
    applies = {
        'Exon': has_exon and not has_intron,
        'Exon&Intron': has_exon and has_intron,
        'Intron': has_intron and not has_exon,
        'Intergenic': not (has_exon or has_intron),
        'Alu': alu_len > min_len,
        'rRNA': rrna_len > min_len,
        'OtherRepeat': repeat_len > min_len,
    }
    for cate in order:
        if applies.get(cate, False):
            return cate
    ut.fatal_format_time('get_genomic_cate1', 'No category is assigned.')
Пример #2
0
def get_sorted_exon_block_from_bed12(in_bed, bam_fn=''):
    """Load exon blocks from a BED12 file, sorted and indexed by reference id.

    Args:
        in_bed: path of the BED12 file. Empty lines and lines starting
            with '#' are skipped.
        bam_fn: path of an alignment file; it is opened only so that
            ``get_tid`` can translate chromosome names into numeric ids.

    Returns:
        A tuple ``(exon_block, block_index)``.

        ``exon_block`` holds one list per BED record:
        ``[(name, name), (tid, is_rev, start, end), ...]`` with one tuple
        per block. ``start``/``end`` are ``chromStart + exonStart + 1`` and
        ``chromStart + exonStart + blockSize``. Records are sorted by the
        tid and start of their first block, then the end of their last
        block.

        ``block_index`` maps tid -> ``(i_start, i_end)`` where ``i_end`` is
        one past the last record of that tid in ``exon_block``.
    """
    if not bam_fn:
        ut.fatal_format_time("get_sorted_exon_block_from_bed12",
                             'No BAM header provided.')
    # chromStart: 0-base, exonStarts: 0-base
    header_ele = [
        'chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand',
        'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes',
        'exonStarts'
    ]
    bed_header = {name: i for i, name in enumerate(header_ele)}
    ut.err_format_time("get_sorted_exon_block_from_bed12",
                       "Loading exon block from {} ...".format(in_bed))
    exon_block = []
    # Both handles are context-managed so neither leaks if a malformed
    # line raises while parsing.
    with ps.AlignmentFile(bam_fn) as bam, open(in_bed, 'r') as bed:
        for line in bed:
            line = line.rstrip()
            if not line or line.startswith('#'):
                continue
            ele = line.split('\t')
            name = ele[bed_header['name']]
            tid = bam.get_tid(ele[bed_header['chrom']])
            is_rev = ele[bed_header['strand']] == '-'
            start = int(ele[bed_header['chromStart']])
            # BED12 lists usually end with a trailing comma; drop every
            # empty token rather than only the first one.
            exon_start = [
                int(s) for s in ele[bed_header['exonStarts']].split(',') if s
            ]
            exon_len = [
                int(l) for l in ele[bed_header['blockSizes']].split(',') if l
            ]
            record = [(name, name)]
            for s, l in zip(exon_start, exon_len):
                record.append((tid, is_rev, start + s + 1, start + s + l))
            exon_block.append(record)

    exon_block.sort(key=lambda x: (x[1][0], x[1][2], x[-1][3]))

    # index: tid -> (first position, one past last position)
    block_index = {}
    last_tid, i_start = -1, 0
    for i, block in enumerate(exon_block):
        tid = block[1][0]
        if tid != last_tid:
            if last_tid != -1:
                block_index[last_tid] = (i_start, i)
            i_start = i
            last_tid = tid
    if last_tid != -1:
        block_index[last_tid] = (i_start, len(exon_block))
    ut.err_format_time("get_sorted_exon_block_from_bed12",
                       "Loading exon block from {} done!".format(in_bed))
    return exon_block, block_index
Пример #3
0
def get_input(in_fn, in_type, detailed):
    """Load an input file with the reader that matches ``in_type``.

    Args:
        in_fn: path of the input file, passed unchanged to the reader.
        in_type: one of 'whole', 'isoform', 'ciri' or 'bed'.
        detailed: when true, use the detailed reader. Only the 'whole' and
            'isoform' types have one; requesting it for 'ciri' or 'bed' is
            reported through ``ut.fatal_format_time``.

    Returns:
        Whatever the selected reader returns.
    """
    if detailed and in_type in ('ciri', 'bed'):
        # Name the offending type; the message used to say "ciri" even
        # when the input was 'bed'.
        ut.fatal_format_time(
            'get_input',
            '{} does not support detailed alignment information.'.format(
                in_type))
    if in_type == 'whole':
        return get_detailed_long_whole_input(
            in_fn) if detailed else get_long_whole_input(in_fn)
    elif in_type == 'isoform':
        return get_detailed_long_isoform_input(
            in_fn) if detailed else get_long_isoform_input(in_fn)
    elif in_type == 'ciri':
        return get_short_ciri_input(in_fn)
    elif in_type == 'bed':
        return get_short_bed_input(in_fn)
    else:
        # Report an unsupported type instead of silently returning None.
        ut.fatal_format_time('get_input',
                             'Unknown input type: {}.'.format(in_type))
Пример #4
0
def stats_core(args):
    """Run the statistics routine selected by ``args.type``.

    Args:
        args: parsed options. Reads ``type`` ('all', 'regionArea' or
            'sharedKnownBSJ'), ``read_type`` (only checked for 'nano' when
            ``type`` is 'all'), ``samp_name``, ``isoform_res`` and
            ``stats_out``.

    An unrecognised ``args.type`` is reported through
    ``ut.fatal_format_time``.
    """
    if args.type == 'all':
        if args.read_type == 'nano':
            nano_get_all_stats(args.samp_name, args.isoform_res,
                               args.stats_out)
        else:
            pb_get_all_stats(args.samp_name, args.isoform_res, args.stats_out)
    elif args.type == 'regionArea':
        classify_read_by_mapped_region(args.samp_name, args.isoform_res,
                                       args.stats_out)
    elif args.type == 'sharedKnownBSJ':
        get_shared_read_with_known_BSJ(args.samp_name, args.isoform_res,
                                       args.stats_out)
    else:
        # Format the requested value, not the builtin `type`, which would
        # print "<class 'type'>".
        ut.fatal_format_time('stats_core',
                             'Unknown stats type: {}.'.format(args.type))
Пример #5
0
def high_qual_record(r_array,
                     high_max_ratio=high_max_ratio,
                     high_min_ratio=high_min_ratio,
                     high_iden_ratio=high_iden_ratio):
    """Pick the high-quality alignment record of one read, if there is one.

    A record qualifies when its aligned read length divided by half of the
    primary record's read length lies in
    ``[high_min_ratio, high_max_ratio]`` and its identity ratio is at
    least ``high_iden_ratio``.

    Args:
        r_array: alignment records of one read; the first one must be the
            primary record (not secondary, supplementary or unmapped).
        high_max_ratio: upper bound of the coverage ratio.
        high_min_ratio: lower bound of the coverage ratio.
        high_iden_ratio: minimum identity ratio.

    Returns:
        The primary record if it qualifies. Otherwise the qualifying
        record with the highest AS tag that is on another reference or
        does not overlap the primary alignment. ``None`` if ``r_array`` is
        empty, if no record is selected, or as soon as a qualifying record
        has a reference name of six or more characters or one starting
        with 'chrM' / 'chrUn'.
    """
    if not r_array:
        return None

    primary = r_array[0]
    if primary.is_secondary or primary.is_supplementary or primary.is_unmapped:
        ut.fatal_format_time('high_qual_record',
                             'Error: input SAM file is sorted or modified.')

    # 1-based inclusive reference span of the primary alignment.
    prim_first = primary.reference_start + 1
    prim_last = prim_first + pb.get_ref_op_length(primary) - 1
    half_read_len = (0.0 + pb.get_read_op_length(primary)) / 2

    top_rec, top_score = None, -1
    primary_passes = False
    for rank, rec in enumerate(r_array):
        cov = pb.get_aligned_read_length(rec) / half_read_len
        identity = get_iden_ratio(rec)
        score = int(rec.get_tag('AS'))
        if not (high_min_ratio <= cov <= high_max_ratio
                and identity >= high_iden_ratio):
            continue

        ref = rec.reference_name
        if len(ref) >= 6 or ref.startswith('chrM') or ref.startswith('chrUn'):
            return None
        if score <= top_score:
            continue

        if rank == 0:
            primary_passes = True
            top_rec, top_score = rec, score
        # a non-primary record must NOT overlap the primary alignment
        elif ref != primary.reference_name or (
                rec.reference_start + 1 > prim_last
                or rec.reference_start + pb.get_ref_op_length(rec) <
                prim_first):
            top_rec, top_score = rec, score

    if top_rec is None:
        return None
    return primary if primary_passes else top_rec
Пример #6
0
def get_sorted_splice_junction_from_gtf(in_gtf, is_db, include_end, bam_fn=""):
    """Collect the splice junctions of every transcript, sorted and indexed.

    Args:
        in_gtf: an already opened annotation database when ``is_db`` is
            true, otherwise the value handed to ``restore_gff_db``.
        is_db: whether ``in_gtf`` is already a database object.
        include_end: when true, also record
            ``(trans.end + 1, trans.start - 1)`` for every transcript.
        bam_fn: path of an alignment file; it is opened only so that
            ``get_tid`` can translate chromosome names into numeric ids.

    Returns:
        A tuple ``(sj_list, sj_index)``.

        ``sj_list`` is ``dict2list`` applied to the unique
        ``(tid, is_rev, sj_start, sj_end)`` keys, sorted by
        ``(tid, sj_start, sj_end)``. ``sj_start`` is the previous exon's
        end + 1 and ``sj_end`` is the next exon's start - 1.

        ``sj_index`` maps tid -> ``(i_start, i_end)``, the first and last
        (inclusive) positions of that tid in ``sj_list``.
    """
    if not bam_fn:
        ut.fatal_format_time("get_sorted_splice_junction_from_gtf",
                             'No BAM header provided.')
    gtf_db = in_gtf if is_db else restore_gff_db(in_gtf)
    ut.err_format_time("get_sorted_splice_junction_from_gtf",
                       "Loading splice junction from {} ... ".format(in_gtf))
    sj = dict()
    # Context-managed so the handle is closed even if the database
    # iteration raises.
    with ps.AlignmentFile(bam_fn) as bam:
        for trans in gtf_db.features_of_type('transcript', order_by='start'):
            sj_start, sj_end = -1, -1
            tid = bam.get_tid(trans.chrom)
            is_rev = trans.strand == '-'
            for exon in gtf_db.children(trans,
                                        featuretype='exon',
                                        order_by='start'):
                sj_end = exon.start - 1
                # sj_start stays -1 until the first exon has been seen
                if sj_start > 0 and sj_end > 0:
                    sj[(tid, is_rev, sj_start, sj_end)] = 1
                sj_start = exon.end + 1
            if include_end:
                sj[(tid, is_rev, trans.end + 1, trans.start - 1)] = 1

    sj_list = sorted(dict2list(sj), key=lambda x: (x[0], x[2], x[3]))

    # index: tid -> (first position, last position), both inclusive
    sj_index = {}
    last_tid, i_start = -1, 0
    for i, sj1 in enumerate(sj_list):
        tid = sj1[0]
        if tid != last_tid:
            if last_tid != -1:
                sj_index[last_tid] = (i_start, i - 1)
            i_start = i
            last_tid = tid
    if last_tid != -1:
        # The last group ends at the final list position. The previous
        # `i - 1` dropped the last junction and gave (0, -1) for a
        # single-element list.
        sj_index[last_tid] = (i_start, len(sj_list) - 1)
    ut.err_format_time("get_sorted_splice_junction_from_gtf",
                       "Loading splice junction from {} done!".format(in_gtf))
    return sj_list, sj_index
Пример #7
0
def get_iden_ratio(r):
    """Return the identity ratio of one alignment record.

    The running insertion and deletion lengths are summed between
    reference skips; if either sum exceeds the module-level
    ``max_ins_len`` / ``max_del_len`` the record is rejected with -1.0.
    Otherwise the result is
    ``(aligned_len - NM + deleted_len) / aligned_len``.

    Args:
        r: alignment record providing ``cigartuples`` and the NM tag.

    Returns:
        The identity ratio as a float, or -1.0 for a rejected record.
        A missing NM tag is reported through ``ut.fatal_format_time``.
    """
    block_ins, block_del = 0, 0
    for op, op_len in r.cigartuples:
        if op == pb.BAM_CINS:
            block_ins += op_len
            if block_ins > max_ins_len:
                return -1.0
        elif op == pb.BAM_CDEL:
            block_del += op_len
            if block_del > max_del_len:
                return -1.0
        elif op == pb.BAM_CREF_SKIP:
            # a reference skip starts a new block: restart both sums
            block_ins = block_del = 0

    aligned_len = pb.get_aligned_read_length(r)
    if not r.has_tag('NM'):
        ut.fatal_format_time('bam_classify', 'No NM tag found.\n')
    edit_dist = int(r.get_tag('NM'))
    total_del = int(pb.get_cigar_len(r, pb.BAM_CDEL))
    matched = aligned_len - edit_dist + total_del
    return matched / (aligned_len + 0.0)
Пример #8
0
def get_sorted_exon_block_from_gtf(in_gtf, is_db, bam_fn=''):
    """Collect the exon blocks of every transcript, sorted and indexed.

    Args:
        in_gtf: an already opened annotation database when ``is_db`` is
            true, otherwise the value handed to ``restore_gff_db``.
        is_db: whether ``in_gtf`` is already a database object.
        bam_fn: path of an alignment file; it is opened only so that
            ``get_tid`` can translate chromosome names into numeric ids.

    Returns:
        A tuple ``(exon_block_list, block_index)``.

        ``exon_block_list`` is ``dict2list`` applied to a mapping of
        ``(transcript_id, gene_id)`` -> list of
        ``(tid, is_rev, start, end)`` exon tuples. It is sorted by the tid
        and start of element 1, then the end of the last element.
        NOTE(review): the sort key assumes ``dict2list`` yields
        ``[key, exon, exon, ...]`` entries; confirm against that helper.

        ``block_index`` maps tid -> ``(i_start, i_end)``, the first and
        last (inclusive) positions of that tid in ``exon_block_list``.
    """
    if not bam_fn:
        ut.fatal_format_time("get_sorted_exon_block_from_gtf",
                             'No BAM header provided.')
    gtf_db = in_gtf if is_db else restore_gff_db(in_gtf)
    ut.err_format_time("get_sorted_exon_block_from_gtf",
                       "Loading exon block from {} ... ".format(in_gtf))
    exon_block_dict = dd(list)

    with ps.AlignmentFile(bam_fn) as bam:
        for exon in gtf_db.features_of_type('exon', order_by='start'):
            ID = (exon.attributes['transcript_id'][0],
                  exon.attributes['gene_id'][0])
            exon_block_dict[ID].append(
                (bam.get_tid(exon.chrom), exon.strand == '-', int(exon.start),
                 int(exon.end)))
    exon_block_list = sorted(dict2list(exon_block_dict),
                             key=lambda x: (x[1][0], x[1][2], x[-1][3]))

    # index: tid -> (first position, last position), both inclusive
    block_index = {}
    last_tid, i_start = -1, 0
    for i, block in enumerate(exon_block_list):
        tid = block[1][0]
        if tid != last_tid:
            if last_tid != -1:
                block_index[last_tid] = (i_start, i - 1)
            i_start = i
            last_tid = tid
    if last_tid != -1:
        # The last group ends at the final list position. The previous
        # `i - 1` dropped the last transcript and gave (0, -1) for a
        # single-element list.
        block_index[last_tid] = (i_start, len(exon_block_list) - 1)
    ut.err_format_time("get_sorted_exon_block_from_gtf",
                       "Loading exon block from {} done!".format(in_gtf))
    return exon_block_list, block_index
Пример #9
0
def get_error_from_MD(cigartuples=[],
                      mdstr='',
                      full_query_seq='',
                      ref_start=0):
    """Walk a CIGAR and its MD string in parallel and list every alignment error.

    Args:
        cigartuples: sequence of (operation, length) pairs.
            NOTE(review): the default is a mutable list; it is only
            iterated here, never modified.
        mdstr: the MD string that belongs to the same alignment.
        full_query_seq: query sequence indexed by ``query_pos``. Both soft
            and hard clips advance ``query_pos``, so presumably the
            sequence still contains hard-clipped bases -- confirm with the
            caller.
        ref_start: reference position of the first aligned base; reported
            reference positions use the same base as this value.

    Returns:
        A tuple ``(mis, ins, dele)`` of lists:
            mis:  [query_pos, ref_pos, length, query_bases, ref_bases]
            ins:  [query_pos, ref_pos - 1, length, inserted_bases]
            dele: [query_pos - 1, ref_pos, length, deleted_ref_bases]

    An MD string or CIGAR that cannot be walked consistently is reported
    and the process exits with ``sys.exit(1)``. A "mismatch" whose query
    base equals the MD base is reported through ``ut.fatal_format_time``.
    """
    mis, ins, dele = [], [], []
    last_error = ''
    # md_i: index of the current MD segment; m_pos: offset already consumed
    # inside that segment by earlier CIGAR match operations.
    md_i, m_pos = 0, 0
    # Put spaces around every deletion token ("^" + bases) and drop zeros
    # that directly follow it, so that splitting yields match segments and
    # deletion tokens as separate items.
    mdSub = re.sub(r'([\\^][ACGTNacgtn]+)[0]*', ' \\1 ', mdstr)
    mdSplit = mdSub.rsplit()
    ref_pos, query_pos = ref_start, 0

    for tuples in cigartuples:
        if tuples[0] == BAM_CMATCH:
            m = mdSplit[md_i]

            # a match operation must not be paired with a deletion token
            if m.startswith('^'):
                ut.format_time(sys.stderr, 'get_error_from_MD',
                               'Unexpected MD string: {}\n'.format(mdstr))
                sys.exit(1)
            # Length covered by this MD segment: every number is summed
            # and every mismatch letter counts as 1.
            mSub = re.sub(r'([ACGTNacgtn])', ' \\1 ', m)
            m_len = sum(
                map(int, (re.sub(r'([ACGTNacgtn])', '1', mSub)).rsplit()))

            # Part of the MD segment from m_pos to m_pos + tuples[1].
            # NOTE(review): get_spec_MD is defined elsewhere; the loop
            # below expects it to yield letters and digit strings.
            sub_ms = get_spec_MD(m, m_pos, m_pos + tuples[1])

            for ms in sub_ms:
                if ms.isalpha():  # MISMATCH
                    if full_query_seq[query_pos] != ms:
                        # Start a new record unless the previous error is
                        # a mismatch that ends right before query_pos.
                        if last_error != 'MIS' or mis[-1][0] + mis[-1][
                                2] != query_pos:
                            mis_error = [
                                query_pos, ref_pos, 1,
                                full_query_seq[query_pos], ms
                            ]
                            mis.append(mis_error)
                        else:  # adjacent to the previous mismatch: extend it
                            mis[-1][-3] += 1
                            mis[-1][-2] += full_query_seq[query_pos]
                            mis[-1][-1] += ms
                        last_error = 'MIS'
                    else:
                        # MD reports a mismatch but the bases are equal
                        ut.fatal_format_time(
                            'get_error_from_MD',
                            'MIS error: {} v.s {}.'.format(
                                full_query_seq[query_pos], ms))
                    query_pos += 1
                    ref_pos += 1
                elif ms.isdigit():  # MATCH
                    query_pos += int(ms)
                    ref_pos += int(ms)

            if m_pos + tuples[1] == m_len:
                # MD segment fully consumed: move on to the next one
                md_i += 1
                m_pos = 0
            elif m_pos + tuples[1] < m_len:
                # MD segment continues after an insertion / skip / clip
                m_pos += tuples[1]
            else:  # match operation runs past the end of the MD segment
                ut.format_time(sys.stderr, 'get_error_from_MD',
                               'Unexpected MD string: {}\n'.format(mdstr))
                sys.exit(1)
        elif tuples[0] == BAM_CDEL:
            m = mdSplit[md_i]
            # a deletion must be paired with a "^" token
            if not m.startswith('^'):
                ut.format_time(sys.stderr, 'get_error_from_MD',
                               'Unexpected MD string: {}\n'.format(mdstr))
                sys.exit(1)
            # m[1:] is the deleted reference sequence without the "^"
            del_error = [query_pos - 1, ref_pos, tuples[1], m[1:]]
            dele.append(del_error)
            ref_pos += tuples[1]
            last_error = 'DEL'
            md_i += 1
        elif tuples[0] == BAM_CINS:
            # insertions do not appear in the MD string
            ins_error = [
                query_pos, ref_pos - 1, tuples[1],
                full_query_seq[query_pos:query_pos + tuples[1]]
            ]
            ins.append(ins_error)
            query_pos += tuples[1]
            last_error = 'INS'
        elif tuples[0] == BAM_CSOFT_CLIP or tuples[0] == BAM_CHARD_CLIP:
            # clipped bases only advance the query position
            query_pos += tuples[1]
        elif tuples[0] == BAM_CREF_SKIP:
            # skipped reference region only advances the reference position
            ref_pos += tuples[1]
        else:
            # any other CIGAR operation is not handled
            ut.format_time(sys.stderr, 'get_error_from_MD',
                           'Unexpected cigar: {}\n'.format(cigartuples))
            sys.exit(1)

    return mis, ins, dele