Exemplo n.º 1
0
def parse_ref(ref_file, flag):
    """
    Parse a gene annotation file into per-isoform exon coordinates.

    Every non-blank line is split on whitespace. Columns 0-3 are read as
    gene id, isoform id, chromosome and strand; columns 9 and 10 are read as
    comma-separated exon starts and exon ends. The element after the last
    comma is dropped, so each list is expected to end with a comma.
    Blank lines are skipped.

    flag == 1:
        return (genes, novel_genes, gene_info, chrom_info) where
        genes / novel_genes map chromosome -> Interval built from
        [start, end, total_id] records (isoform ids starting with 'CUFF' go
        to novel_genes), gene_info maps total_id -> [starts, ends] and
        chrom_info is the set of chromosomes present in either mapping.
        total_id is 'iso', gene id, isoform id, chromosome and strand joined
        by tabs.
    any other flag:
        return a dict mapping the tab-joined gene id, isoform id, chromosome
        and strand -> [starts, ends].
    """
    indexed = flag == 1
    if indexed:
        genes = defaultdict(list)
        novel_genes = defaultdict(list)
        gene_info = {}
        chrom_info = set()
    else:
        genes = {}
    with open(ref_file, 'r') as f:
        for line in f:
            fields = line.split()  # split once, reuse for every column
            if not fields:
                # tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            gene_id, iso_id, chrom, strand = fields[:4]
            starts = [int(x) for x in fields[9].split(',')[:-1]]
            ends = [int(x) for x in fields[10].split(',')[:-1]]
            # transcript span; also rejects records with no exons
            start = starts[0]
            end = ends[-1]
            if indexed:
                total_id = '\t'.join(['iso', gene_id, iso_id, chrom, strand])
                record = [start, end, total_id]
                if iso_id.startswith('CUFF'):
                    novel_genes[chrom].append(record)
                else:
                    genes[chrom].append(record)
                gene_info[total_id] = [starts, ends]
            else:
                key = '\t'.join([gene_id, iso_id, chrom, strand])
                genes[key] = [starts, ends]
    if not indexed:
        return genes
    for chrom in genes:
        genes[chrom] = Interval(genes[chrom])
        chrom_info.add(chrom)
    for chrom in novel_genes:
        novel_genes[chrom] = Interval(novel_genes[chrom])
        chrom_info.add(chrom)
    return (genes, novel_genes, gene_info, chrom_info)
Exemplo n.º 2
0
def annotate_fusion(ref_f, input_f, output_f):
    """
    Annotate fusion junctions with the gene isoforms they map to.

    ref_f: gene annotation file, loaded with parse_ref1
    input_f: fusion junction file, loaded with parse_bed
    output_f: path of the tab-separated annotation file to write

    For every junction, matches that map_fusion_to_iso flags as edge
    (first or last exon) are buffered and written only after all isoforms
    of that junction have been examined; other matches are written
    immediately.
    """
    print('Start to annotate fusion junctions...')
    genes, gene_info = parse_ref1(ref_f)  # gene annotations
    fusions, fusion_index = parse_bed(input_f)  # fusion junctions
    annotated = set()
    with open(output_f, 'w') as outf:
        for chrom in genes:
            # overlap gene annotations with fusion junctions
            overlaps = Interval.overlapwith(genes[chrom].interval,
                                            fusions[chrom])
            for itl in overlaps:
                # entries tagged 'iso' are gene annotations
                iso = [x for x in itl[2:] if x.startswith('iso')]
                # everything after the annotations is a fusion junction
                for fus in itl[2 + len(iso):]:
                    reads = fus.split()[1]
                    fus_start, fus_end = fusion_index[fus]
                    left = str(fus_start)
                    right = str(fus_end)
                    deferred = []  # first or last exon matches
                    for iso_id in iso:
                        g, i, c, s = iso_id.split()[1:]
                        exons = gene_info[iso_id]
                        start = exons[0][0]
                        end = exons[-1][-1]
                        # junction exceeds the isoform boundaries
                        if fus_start < start - 10 or fus_end > end + 10:
                            continue
                        fusion_info, index, edge = map_fusion_to_iso(
                            fus_start, fus_end, s, exons)
                        if not fusion_info:
                            continue
                        bed = '\t'.join([chrom, left, right,
                                         'FUSIONJUNC/%s' % reads,
                                         '0', s, left, left, '0,0,0',
                                         fusion_info, g, i, index])
                        if edge:  # first or last exon
                            deferred.append(bed)
                        else:  # not first or last exon
                            outf.write(bed + '\n')
                            annotated.add(fus)
                    if deferred:  # first or last exon
                        for bed in deferred:
                            outf.write(bed + '\n')
                        annotated.add(fus)
    print('Annotated %d fusion junctions!' % len(annotated))
Exemplo n.º 3
0
def annotate_fusion(ref_f, input_f, output_f):
    """
    Align fusion junctions to gene annotations.

    Gene annotations come from parse_ref1(ref_f) and fusion junctions from
    parse_bed(input_f). One tab-separated line is written to output_f for
    every (junction, isoform) pair that map_fusion_to_iso can place; pairs
    flagged as edge (first or last exon) are emitted after the other pairs
    of the same junction.
    """
    print('Start to annotate fusion junctions...')
    genes, gene_info = parse_ref1(ref_f)  # gene annotations
    fusions, fusion_index = parse_bed(input_f)  # fusion junctions
    total = set()
    with open(output_f, 'w') as outf:
        for chrom in genes:
            # overlap gene annotations with fusion junctions
            for itl in Interval.overlapwith(genes[chrom].interval,
                                            fusions[chrom]):
                # extract gene annotations
                iso = [name for name in itl[2:] if name.startswith('iso')]
                # for each overlapped fusion junction
                for fus in itl[2 + len(iso):]:
                    reads = fus.split()[1]
                    fus_start, fus_end = fusion_index[fus]
                    edge_lines = []  # first or last exon matches
                    for iso_id in iso:
                        g, i, c, s = iso_id.split()[1:]
                        iso_start = gene_info[iso_id][0][0]
                        iso_end = gene_info[iso_id][-1][-1]
                        # keep only junctions lying within the isoform,
                        # allowing 10 positions of slack on either side
                        within = (iso_start - 10 <= fus_start and
                                  fus_end <= iso_end + 10)
                        if not within:
                            continue
                        mapped = map_fusion_to_iso(fus_start, fus_end, s,
                                                   gene_info[iso_id])
                        fusion_info, index, edge = mapped
                        if not fusion_info:
                            continue
                        start_col = str(fus_start)
                        fields = [chrom, start_col, str(fus_end),
                                  'FUSIONJUNC/%s' % reads, '0', s,
                                  start_col, start_col, '0,0,0',
                                  fusion_info, g, i, index]
                        line = '\t'.join(fields) + '\n'
                        if edge:  # first or last exon
                            edge_lines.append(line)
                            continue
                        # not first or last exon
                        outf.write(line)
                        total.add(fus)
                    if edge_lines:  # first or last exon
                        outf.writelines(edge_lines)
                        total.add(fus)
    print('Annotated %d fusion junctions!' % len(total))
Exemplo n.º 4
0
def parse_ref1(ref_file):
    """
    Parse a gene annotation file into interval records and exon lists.

    Every non-blank line is split on whitespace. Columns 0-3 are read as
    gene id, isoform id, chromosome and strand; columns 9 and 10 are read as
    comma-separated exon starts and exon ends. The element after the last
    comma is dropped, so each list is expected to end with a comma.
    Blank lines are skipped.

    Returns (genes, gene_info):
        genes: chromosome -> Interval built from [start, end, total_id]
        gene_info: total_id -> [starts, ends]
    where total_id is 'iso', gene id, isoform id, chromosome and strand
    joined by tabs.
    """
    genes = defaultdict(list)
    gene_info = {}
    with open(ref_file, 'r') as f:
        for line in f:
            fields = line.split()  # split once, reuse for every column
            if not fields:
                # tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            gene_id, iso_id, chrom, strand = fields[:4]
            total_id = '\t'.join(['iso', gene_id, iso_id, chrom, strand])
            starts = [int(x) for x in fields[9].split(',')[:-1]]
            ends = [int(x) for x in fields[10].split(',')[:-1]]
            genes[chrom].append([starts[0], ends[-1], total_id])
            gene_info[total_id] = [starts, ends]
    for chrom in genes:
        genes[chrom] = Interval(genes[chrom])
    return (genes, gene_info)
Exemplo n.º 5
0
def annotate_fusion(ref_f, junc_bed, secondary_flag=0, denovo_flag=0):
    """
    Align fusion junctions to gene annotations.

    ref_f: gene annotation file, loaded with parse_ref(ref_f, 1)
    junc_bed: fusion junction file, loaded with parse_bed
    secondary_flag: when true, junctions outside an isoform's boundaries are
        still passed to map_fusion_to_iso, and junctions with no regular
        annotation are reported through their secondary exon pairs
    denovo_flag: when true, novel genes are overlapped as well

    Results are written to 'annotated_fusion.txt.tmp' in the current
    working directory.
    """
    print('Start to annotate fusion junctions...')
    # gene annotations
    genes, novel_genes, gene_info, chrom_info = parse_ref(ref_f, 1)
    fusions, fusion_index = parse_bed(junc_bed)  # fusion junctions
    total = set()
    with open('annotated_fusion.txt.tmp', 'w') as outf:
        for chrom in chrom_info:
            # overlap gene annotations with fusion junctions
            result = []
            if chrom in genes:  # annotated genes
                result.extend(Interval.overlapwith(genes[chrom].interval,
                                                   fusions[chrom]))
            if denovo_flag and chrom in novel_genes:  # denovo mode only
                result.extend(
                    Interval.overlapwith(novel_genes[chrom].interval,
                                         fusions[chrom]))
            for itl in result:
                # entries tagged 'iso' are gene annotations
                iso = [x for x in itl[2:] if x.startswith('iso')]
                # the remaining entries are the overlapped junctions
                for fus in itl[2 + len(iso):]:
                    reads = fus.split()[1]
                    fus_start, fus_end = fusion_index[fus]
                    fus_loc = '%s\t%d\t%d\tFUSIONJUNC/%s' % (chrom, fus_start,
                                                             fus_end, reads)
                    edge_annotations = []  # first or last exon matches
                    secondary_exon = defaultdict(dict)  # secondary exons
                    annotated = False
                    for iso_id in iso:
                        g, i, c, s = iso_id.split()[1:]
                        exons = gene_info[iso_id]
                        outside = (fus_start < exons[0][0] - 10 or
                                   fus_end > exons[-1][-1] + 10)
                        # junctions beyond the isoform boundaries are only
                        # considered in secondary mode
                        if outside and not secondary_flag:
                            continue
                        mapped = map_fusion_to_iso(fus_start, fus_end, s,
                                                   exons)
                        fusion_info, index, edge, secondary = mapped
                        if fusion_info:
                            annotated = True
                            bed = '\t'.join([fus_loc, '0', s,
                                             str(fus_start), str(fus_start),
                                             '0,0,0', fusion_info, g, i,
                                             index])
                            if edge:  # first or last exon
                                edge_annotations.append(bed)
                            else:  # not first or last exon
                                outf.write(bed + '\n')
                                total.add(fus)
                            continue
                        if not secondary_flag or secondary is None:
                            continue
                        li, ri = secondary
                        gene = ':'.join([g, s])
                        if li is not None:
                            secondary_exon['left'][gene] = ':'.join(
                                [i, str(li)])
                        if ri is not None:
                            secondary_exon['right'][gene] = ':'.join(
                                [i, str(ri)])
                    if edge_annotations:
                        for bed in edge_annotations:
                            outf.write(bed + '\n')
                        total.add(fus)
                    if secondary_flag and not annotated:
                        right_side = secondary_exon['right']
                        for gene, left in secondary_exon['left'].items():
                            if gene not in right_side:
                                continue
                            right = right_side[gene]
                            g, s = gene.split(':')
                            # for avoid dup, use fus_loc_new
                            fus_loc_new = fus_loc + '\t0\t%s' % s
                            outf.write('%s\t%s:%s\t%s:%s\n' % (fus_loc_new,
                                                               g, left, g,
                                                               right))
    print('Annotated %d fusion junctions!' % len(total))
Exemplo n.º 6
0
def extract_retained_intron(denovo_dir, tophat_dir, pAplus_dir, output_dir):
    """
    Check each intron and fetch PIR
    Modified from Braunschweig et al., Genome Research, 2014, gr-177790.

    Reads '<denovo_dir>/circularRNA_full.txt' and derives the introns lying
    between consecutive blocks of every record whose type is not 'ciRNA' and
    whose gene does not start with 'CUFF'. Introns that Interval.overlapwith
    reports against the collected exon / ciRNA regions of the same
    chromosome are discarded. One line per remaining intron is written to
    '<output_dir>/all_intron_info.txt'.

    Read support comes from 'junctions.bed' and 'accepted_hits.bam' in
    tophat_dir (the "circ" columns) and in pAplus_dir (the "linear"
    columns).

    Output columns: chrom, start, end, 'Intron', '0', strand, gene, isoform,
    reads, then (each rounded to 3 decimals) PIR circ, PIR linear,
    binomial p circ, binomial p linear, and the boundary / junction /
    intron read counts for circ followed by those for linear.

    Blank lines in the input are skipped. Both BAM handles are closed before
    the function returns, including when an error is raised.
    """
    print('Start to parse circular RNA introns...')
    # set path
    fusion_f = '%s/circularRNA_full.txt' % denovo_dir
    output_f = '%s/all_intron_info.txt' % output_dir
    pAminus_junc = parse_junc(tophat_dir + '/junctions.bed')
    pAminus_bam = pysam.AlignmentFile(tophat_dir + '/accepted_hits.bam', 'rb')
    pAplus_bam = None
    try:
        pAplus_junc = parse_junc('%s/junctions.bed' % pAplus_dir)
        pAplus_bam = pysam.AlignmentFile('%s/accepted_hits.bam' % pAplus_dir,
                                         'rb')
        excluded_region = defaultdict(list)
        intron = defaultdict(list)
        intron_info_list = {}  # intron key -> [gene, iso, reads]
        with open(fusion_f, 'r') as f:
            for line in f:
                fields = line.split()  # split once, reuse for every column
                if not fields:
                    continue
                chrom = fields[0]
                start = int(fields[1])
                end = int(fields[2])
                strand = fields[5]
                if fields[13] == 'ciRNA':  # not check ciRNAs
                    excluded_region[chrom].append([start, end])
                    continue
                sizes = [int(x) for x in fields[10].split(',')]
                offsets = [int(x) for x in fields[11].split(',')]
                reads = fields[12]
                gene, iso = fields[14:16]
                if gene.startswith('CUFF'):  # only check annotated introns
                    continue
                for s, o in zip(sizes, offsets):
                    excluded_region[chrom].append([start + o, start + o + s])
                num = int(fields[9])
                for i in range(num - 1):
                    intron_sta = start + offsets[i] + sizes[i]
                    intron_end = start + offsets[i + 1]
                    if intron_end == intron_sta:
                        continue
                    intron_info = '%s\t%d\t%d\t%s' % (chrom, intron_sta,
                                                      intron_end, strand)
                    if intron_info in intron_info_list:
                        # keep the record with the highest read support
                        known_reads = intron_info_list[intron_info][2]
                        if int(reads) > int(known_reads):
                            intron_info_list[intron_info] = [gene, iso, reads]
                        continue
                    intron[chrom].append([intron_sta, intron_end,
                                          intron_info])
                    intron_info_list[intron_info] = [gene, iso, reads]
        intron_set = set()
        for chrom in excluded_region:
            # retain all intron regions in this step
            intron_region = [[i_sta, i_end, info]
                             for i_sta, i_end, info in intron[chrom]]
            intron_set.update(entry[2] for entry in intron_region)
            # remove introns overlapped with annotated exons
            combined_region = Interval(excluded_region[chrom]).interval
            for region in Interval.overlapwith(combined_region,
                                               intron_region):
                if len(region) >= 3:
                    for intron_info in region[2:]:
                        intron_set.discard(intron_info)
        p = 1 / 3.5  # success probability used by both binomial tests
        total_i_n = len(intron_set)
        with open(output_f, 'w') as output:
            for finished_n, intron_key in enumerate(intron_set, 1):
                chrom, sta, end, strand = intron_key.split()
                junc_key = '\t'.join([chrom, sta, end])
                sta = int(sta)
                end = int(end)
                # circular RNAs
                circ_junc_read = pAminus_junc[junc_key]
                (pir_circ, p1, circ_ri_read,
                 circ_intron_read) = _intron_pir_stats(pAminus_bam, chrom,
                                                       sta, end,
                                                       circ_junc_read, p)
                # linear RNAs
                linear_junc_read = pAplus_junc[junc_key]
                (pir_linear, p2, linear_ri_read,
                 linear_intron_read) = _intron_pir_stats(pAplus_bam, chrom,
                                                         sta, end,
                                                         linear_junc_read, p)
                info = '\t'.join(str(round(float(x), 3))
                                 for x in (pir_circ, pir_linear, p1, p2,
                                           circ_ri_read,
                                           circ_junc_read,
                                           circ_intron_read,
                                           linear_ri_read,
                                           linear_junc_read,
                                           linear_intron_read))
                other_info = '\t'.join(intron_info_list[intron_key])
                output.write('\t'.join([chrom, str(sta), str(end), 'Intron',
                                        '0', strand, other_info, info]))
                output.write('\n')

                sys.stdout.write("Progress: %d/%d   \r" % (finished_n,
                                                           total_i_n))
                sys.stdout.flush()
    finally:
        # release the BAM handles even if parsing or scoring failed
        pAminus_bam.close()
        if pAplus_bam is not None:
            pAplus_bam.close()
    print('Complete parsing circular RNA introns!')


def _intron_pir_stats(bam, chrom, sta, end, junc_read, p):
    """
    Return (PIR, binomial p-value, boundary reads, intron reads) for one
    intron in one library.

    Boundary reads are the fetch_read counts in the +/-8 windows around sta
    and around end, summed. PIR is 100 * boundary / (boundary + 2 *
    junc_read), or 0 when both counts are 0.
    """
    left_read = fetch_read(bam, chrom, sta - 8, sta + 8)
    right_read = fetch_read(bam, chrom, end - 8, end + 8)
    ri_read = left_read + right_read
    intron_read = fetch_read(bam, chrom, sta, end, flag=0)
    if ri_read == 0 and junc_read == 0:
        pir = 0
    else:
        pir = 100.0 * ri_read / (ri_read + 2 * junc_read)
    # exact binomial test, one-sided
    m = min(left_read, right_read, intron_read)
    n = m + max(left_read, right_read, intron_read)
    return pir, binom.cdf(m, n, p), ri_read, intron_read
Exemplo n.º 7
0
def extract_retained_intron(denovo_dir, tophat_dir, pAplus_dir):
    """
    Check each intron and fetch PIR
    Modified from Braunschweig et al., Genome Research, 2014, gr-177790.

    Reads '<denovo_dir>/circ_fusion.txt' and derives the introns lying
    between consecutive blocks of every record whose type is not 'ciRNA' and
    whose gene does not start with 'CUFF'. Only introns that
    Interval.overlapwith reports against the blocks of 'CUFF' records are
    retained, and of those, any reported against the collected exon / ciRNA
    regions of the same chromosome are discarded. One line per remaining
    intron is written to '<denovo_dir>/all_intron_info.txt'.

    Read support comes from 'junctions.bed' and 'accepted_hits.bam' in
    tophat_dir (the "circ" columns) and in pAplus_dir (the "linear"
    columns).

    Output columns: chrom, start, end, 'Intron', '0', strand, gene, isoform,
    reads, then (each rounded to 3 decimals) PIR circ, PIR linear,
    binomial p circ, binomial p linear, and the boundary / junction /
    intron read counts for circ followed by those for linear.

    Blank lines in the input are skipped. Both BAM handles are closed before
    the function returns, including when an error is raised.
    """
    print('Start to parse circular RNA introns...')
    # set path
    fusion_f = '%s/circ_fusion.txt' % denovo_dir
    output_f = '%s/all_intron_info.txt' % denovo_dir
    pAminus_junc = parse_junc(tophat_dir + '/junctions.bed')
    pAminus_bam = pysam.AlignmentFile(tophat_dir + '/accepted_hits.bam', 'rb')
    pAplus_bam = None
    try:
        pAplus_junc = parse_junc('%s/junctions.bed' % pAplus_dir)
        pAplus_bam = pysam.AlignmentFile('%s/accepted_hits.bam' % pAplus_dir,
                                         'rb')
        excluded_region = defaultdict(list)
        novel_region = defaultdict(list)
        intron = defaultdict(list)
        intron_info_list = {}  # intron key -> [gene, iso, reads]
        with open(fusion_f, 'r') as f:
            for line in f:
                fields = line.split()  # split once, reuse for every column
                if not fields:
                    continue
                chrom = fields[0]
                start = int(fields[1])
                end = int(fields[2])
                strand = fields[5]
                if fields[13] == 'ciRNA':  # not check ciRNAs
                    excluded_region[chrom].append([start, end])
                    continue
                sizes = [int(x) for x in fields[10].split(',')]
                offsets = [int(x) for x in fields[11].split(',')]
                reads = fields[12]
                gene, iso = fields[14:16]
                is_novel = gene.startswith('CUFF')
                target = novel_region if is_novel else excluded_region
                for s, o in zip(sizes, offsets):
                    target[chrom].append([start + o, start + o + s])
                if is_novel:  # only check annotated introns
                    continue
                num = int(fields[9])
                for i in range(num - 1):
                    intron_sta = start + offsets[i] + sizes[i]
                    intron_end = start + offsets[i + 1]
                    if intron_end == intron_sta:
                        continue
                    intron_info = '%s\t%d\t%d\t%s' % (chrom, intron_sta,
                                                      intron_end, strand)
                    if intron_info in intron_info_list:
                        # keep the record with the highest read support
                        known_reads = intron_info_list[intron_info][2]
                        if int(reads) > int(known_reads):
                            intron_info_list[intron_info] = [gene, iso, reads]
                        continue
                    intron[chrom].append([intron_sta, intron_end,
                                          intron_info])
                    intron_info_list[intron_info] = [gene, iso, reads]
        intron_set = set()
        for chrom in excluded_region:
            intron_region = []
            # retain introns covered by novel assembled transcripts
            combined_region = Interval(novel_region[chrom]).interval
            for region in Interval.overlapwith(combined_region,
                                               intron[chrom]):
                if len(region) >= 3:
                    for intron_info in region[2:]:
                        i_sta, i_end = intron_info.split()[1:3]
                        intron_region.append([int(i_sta), int(i_end),
                                              intron_info])
                        intron_set.add(intron_info)
            # remove introns overlapped with annotated exons
            combined_region = Interval(excluded_region[chrom]).interval
            for region in Interval.overlapwith(combined_region,
                                               intron_region):
                if len(region) >= 3:
                    for intron_info in region[2:]:
                        intron_set.discard(intron_info)
        p = 1 / 3.5  # success probability used by both binomial tests
        with open(output_f, 'w') as output:
            for intron_key in intron_set:
                chrom, sta, end, strand = intron_key.split()
                junc_key = '\t'.join([chrom, sta, end])
                sta = int(sta)
                end = int(end)
                # circular RNAs
                circ_junc_read = pAminus_junc[junc_key]
                (pir_circ, p1, circ_ri_read,
                 circ_intron_read) = _intron_pir_stats(pAminus_bam, chrom,
                                                       sta, end,
                                                       circ_junc_read, p)
                # linear RNAs
                linear_junc_read = pAplus_junc[junc_key]
                (pir_linear, p2, linear_ri_read,
                 linear_intron_read) = _intron_pir_stats(pAplus_bam, chrom,
                                                         sta, end,
                                                         linear_junc_read, p)
                info = '\t'.join(str(round(x, 3))
                                 for x in (pir_circ, pir_linear, p1, p2,
                                           circ_ri_read,
                                           circ_junc_read,
                                           circ_intron_read,
                                           linear_ri_read,
                                           linear_junc_read,
                                           linear_intron_read))
                other_info = '\t'.join(intron_info_list[intron_key])
                output.write('\t'.join([chrom, str(sta), str(end), 'Intron',
                                        '0', strand, other_info, info]))
                output.write('\n')
    finally:
        # release the BAM handles even if parsing or scoring failed
        pAminus_bam.close()
        if pAplus_bam is not None:
            pAplus_bam.close()
    print('Complete parsing circular RNA introns!')


def _intron_pir_stats(bam, chrom, sta, end, junc_read, p):
    """
    Return (PIR, binomial p-value, boundary reads, intron reads) for one
    intron in one library.

    Boundary reads are the fetch_read counts in the +/-8 windows around sta
    and around end, summed. PIR is 100 * boundary / (boundary + 2 *
    junc_read), or 0 when both counts are 0.
    """
    left_read = fetch_read(bam, chrom, sta - 8, sta + 8)
    right_read = fetch_read(bam, chrom, end - 8, end + 8)
    ri_read = left_read + right_read
    intron_read = fetch_read(bam, chrom, sta, end, flag=0)
    if ri_read == 0 and junc_read == 0:
        pir = 0
    else:
        pir = 100.0 * ri_read / (ri_read + 2 * junc_read)
    # exact binomial test, one-sided
    m = min(left_read, right_read, intron_read)
    n = m + max(left_read, right_read, intron_read)
    return pir, binom.cdf(m, n, p), ri_read, intron_read