def ref_filter(ref, tophat_dir, cufflinks_dir):
    '''
    Extract isoforms with at least two supporting junction reads for every
    junction, then convert the retained annotations to GTF.

    Each line of `ref` is split on whitespace: column 2 is used as the
    chromosome, and columns 9 and 10 as comma-separated coordinate lists
    (presumably genePred exonStarts / exonEnds behind one leading extra
    column -- confirm against the caller). A junction runs from entry i of
    column 10 to entry i + 1 of column 9. Lines without any junction are
    always retained.

    Retained lines are written, without their first column, to
    <cufflinks_dir>/filtered_junction.txt, which is then converted to
    <cufflinks_dir>/filtered_junction.gtf by the external `genePredToGtf`
    program.

    Args:
        ref: path of the gene annotation file.
        tophat_dir: directory containing junctions.bed.
        cufflinks_dir: directory that receives the two output files.

    Raises:
        SystemExit: if genePredToGtf cannot be started, exits non-zero or is
            terminated by a signal.
    '''
    # Function-scope import so this block does not depend on file-level edits.
    import subprocess

    print('Read gene annotations...')
    # read junction information
    junction_f = tophat_dir + '/junctions.bed'
    junc = parse_junc(junction_f)
    print('Filter gene annotations with junction information...')
    # filter out gene annotations using junction reads
    filtered_junction_f = '%s/filtered_junction.txt' % cufflinks_dir
    with open(ref, 'r') as ref_f, open(filtered_junction_f, 'w') as out_f:
        for line in ref_f:
            fields = line.split()  # split once instead of once per column
            chrom = fields[2]
            junc_starts = fields[10].rstrip(',').split(',')[:-1]
            junc_ends = fields[9].rstrip(',').split(',')[1:]
            for s, e in zip(junc_starts, junc_ends):
                junc_id = '%s\t%s\t%s' % (chrom, s, e)
                if junc[junc_id] < 2:
                    break
            else:  # all the junctions have enough reads
                out_f.write('\t'.join(fields[1:]) + '\n')
    filtered_junction_gtf = '%s/filtered_junction.gtf' % cufflinks_dir
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact. subprocess.call also reports death-by-signal as
    # a negative (truthy) code, which `os.system(...) >> 8` silently dropped.
    try:
        return_code = subprocess.call(['genePredToGtf', 'file',
                                       filtered_junction_f,
                                       filtered_junction_gtf])
    except OSError:  # program missing or not executable
        return_code = 1
    if return_code:
        sys.exit('Error: cannot convert GenePred to GTF!')
def parse_splice_site(denovo_dir, tophat_dir, pAplus_dir, output_dir):
    """
    Characterize all the alternative splice site selections.

    Reads <denovo_dir>/circularRNA_full.txt and, for every pair of adjacent
    blocks of each non-ciRNA record, compares the junction read count with
    the total read counts of its left and right splice sites, in both the
    pAminus (<tophat_dir>/junctions.bed) and pAplus
    (<pAplus_dir>/junctions.bed) junction tables.

    Junctions absent from the pAminus table, or whose pAminus left/right
    site total is zero, are skipped. A site is reported only when its
    pAminus usage differs from 100.

    Each reported line holds: junction location (chrom, start, end), strand,
    pAminus reads, pAminus site total, pAminus usage (%), pAplus reads,
    pAplus site total, pAplus usage (%).

    Output files: <output_dir>/all_A5SS_info.txt and
    <output_dir>/all_A3SS_info.txt.
    """
    print('Start to parse alternative splice sites...')
    a5ss_records = set()
    a3ss_records = set()
    record_fmt = '%s\t%s\t%d\t%d\t%f\t%d\t%d\t%f\n'
    # load junction tables
    minus_junc, minus_left, minus_right = parse_junc(
        tophat_dir + '/junctions.bed', 2)
    plus_junc, plus_left, plus_right = parse_junc(
        '%s/junctions.bed' % pAplus_dir, 2)
    with open('%s/circularRNA_full.txt' % denovo_dir, 'r') as fusion:
        for record in fusion:
            fields = record.split()
            if fields[13] == 'ciRNA':  # not check ciRNAs
                continue
            chrom = fields[0]
            origin = int(fields[1])
            strand = fields[5]
            sizes = [int(x) for x in fields[10].split(',')]
            offsets = [int(x) for x in fields[11].split(',')]
            blocks = [(origin + off, origin + off + size)
                      for size, off in zip(sizes, offsets)]
            # pair the end of each block with the start of the next one
            for (_, left), (right, _) in zip(blocks[:-1], blocks[1:]):
                loc = '%s\t%d\t%d' % (chrom, left, right)
                if loc not in minus_junc:  # circ_pcu=0
                    continue
                minus_reads = minus_junc[loc]
                plus_reads = plus_junc[loc] if loc in plus_junc else 0

                # left splice site
                left_id = '%s\t%d' % (chrom, left)
                minus_left_total = minus_left[left_id]
                if minus_left_total == 0:  # circ_left_total_reads=0
                    continue
                minus_left_psu = minus_reads * 100.0 / minus_left_total
                plus_left_total = plus_left[left_id]
                if plus_left_total != 0:
                    plus_left_psu = plus_reads * 100.0 / plus_left_total
                else:
                    plus_left_psu = 0.0

                # right splice site
                right_id = '%s\t%d' % (chrom, right)
                minus_right_total = minus_right[right_id]
                if minus_right_total == 0:  # circ_right_total_reads=0
                    continue
                minus_right_psu = minus_reads * 100.0 / minus_right_total
                plus_right_total = plus_right[right_id]
                if plus_right_total != 0:
                    plus_right_psu = plus_reads * 100.0 / plus_right_total
                else:
                    plus_right_psu = 0.0

                left_info = record_fmt % (
                    loc, strand, minus_reads, minus_left_total,
                    minus_left_psu, plus_reads, plus_left_total,
                    plus_left_psu)
                right_info = record_fmt % (
                    loc, strand, minus_reads, minus_right_total,
                    minus_right_psu, plus_reads, plus_right_total,
                    plus_right_psu)
                # on '+' the left site goes to the 3' set and the right site
                # to the 5' set; any other strand swaps the two
                if strand == '+':
                    left_set, right_set = a3ss_records, a5ss_records
                else:
                    left_set, right_set = a5ss_records, a3ss_records
                if minus_left_psu != 100:
                    left_set.add(left_info)
                if minus_right_psu != 100:
                    right_set.add(right_info)
    with open('%s/all_A5SS_info.txt' % output_dir, 'w') as output:
        output.write(''.join(a5ss_records))
    with open('%s/all_A3SS_info.txt' % output_dir, 'w') as output:
        output.write(''.join(a3ss_records))
    print('Complete parsing alternative splice sites!')
def extract_retained_intron(denovo_dir, tophat_dir, pAplus_dir, output_dir):
    """
    Check each intron and fetch PIR
    Modified from Braunschweig et al., Genome Research, 2014, gr-177790.

    Candidate introns are the non-empty gaps between consecutive blocks of
    every record in <denovo_dir>/circularRNA_full.txt whose gene name does
    not start with 'CUFF'. A candidate is dropped when Interval.overlapwith
    reports it against the excluded regions of its chromosome (ciRNA spans
    and blocks of non-'CUFF' records). When several records share an intron,
    the gene/isoform/reads of the record with the most reads are reported.

    For each remaining intron one line is written to
    <output_dir>/all_intron_info.txt:
    chrom, start, end, 'Intron', '0', strand, gene, isoform, reads, followed
    by (rounded to 3 decimals) PIR circular, PIR linear, p circular,
    p linear, and the boundary / junction / intron read counts for the
    circular (pAminus) and then the linear (pAplus) sample.

    Args:
        denovo_dir: directory containing circularRNA_full.txt.
        tophat_dir: directory with the pAminus junctions.bed and
            accepted_hits.bam.
        pAplus_dir: directory with the pAplus junctions.bed and
            accepted_hits.bam.
        output_dir: directory receiving all_intron_info.txt.
    """
    # Function-scope import so this block does not depend on file-level edits.
    from contextlib import closing

    def _retention_stats(junc, bam, chrom, sta, end, junc_id):
        # Gather read counts for one intron in one sample and derive PIR and
        # the one-sided binomial p-value.
        junc_read = junc[junc_id]
        left_read = fetch_read(bam, chrom, sta - 8, sta + 8)
        right_read = fetch_read(bam, chrom, end - 8, end + 8)
        ri_read = left_read + right_read
        intron_read = fetch_read(bam, chrom, sta, end, flag=0)
        if ri_read == 0 and junc_read == 0:
            pir = 0  # nothing observed; also avoids dividing by zero
        else:
            pir = 100.0 * ri_read / (ri_read + 2 * junc_read)
        # exact binomial test (one-side)
        m = min(left_read, right_read, intron_read)
        n = m + max(left_read, right_read, intron_read)
        p_value = binom.cdf(m, n, 1 / 3.5)
        return pir, p_value, ri_read, junc_read, intron_read

    print('Start to parse circular RNA introns...')
    # set path
    fusion_f = '%s/circularRNA_full.txt' % denovo_dir
    pAminus_junc = parse_junc(tophat_dir + '/junctions.bed')
    pAplus_junc = parse_junc('%s/junctions.bed' % pAplus_dir)
    pAminus_bam_f = tophat_dir + '/accepted_hits.bam'
    pAplus_bam_f = '%s/accepted_hits.bam' % pAplus_dir

    excluded_region = defaultdict(list)
    introns_by_chrom = defaultdict(list)
    intron_info_list = {}  # intron key -> [gene, iso, reads]

    # closing() guarantees both BAM handles are released, even on error.
    with closing(pysam.AlignmentFile(pAminus_bam_f, 'rb')) as pAminus_bam, \
            closing(pysam.AlignmentFile(pAplus_bam_f, 'rb')) as pAplus_bam:
        with open(fusion_f, 'r') as fusion:
            for line in fusion:
                fields = line.split()
                chrom = fields[0]
                start = int(fields[1])
                end = int(fields[2])
                strand = fields[5]
                if fields[13] == 'ciRNA':  # not check ciRNAs
                    excluded_region[chrom].append([start, end])
                    continue
                sizes = [int(x) for x in fields[10].split(',')]
                offsets = [int(x) for x in fields[11].split(',')]
                reads = fields[12]
                gene, iso = fields[14:16]
                if gene.startswith('CUFF'):
                    # only check annotated introns; blocks of novel
                    # transcripts are not used by this function
                    continue
                for s, o in zip(sizes, offsets):
                    excluded_region[chrom].append([start + o, start + o + s])
                num = int(fields[9])
                for i in range(num - 1):
                    sta = start + offsets[i] + sizes[i]
                    intron_end = start + offsets[i + 1]
                    if intron_end == sta:  # adjacent blocks, no intron
                        continue
                    intron_info = '%s\t%d\t%d\t%s' % (chrom, sta, intron_end,
                                                      strand)
                    if intron_info in intron_info_list:
                        # keep the annotation backed by the most reads
                        known_reads = intron_info_list[intron_info][2]
                        if int(reads) > int(known_reads):
                            intron_info_list[intron_info] = [gene, iso, reads]
                        continue
                    introns_by_chrom[chrom].append([sta, intron_end,
                                                    intron_info])
                    intron_info_list[intron_info] = [gene, iso, reads]

        intron_set = set()
        for chrom in excluded_region:
            # retain all intron regions in this step (fresh lists are handed
            # to Interval.overlapwith, as before)
            candidates = [list(rec) for rec in introns_by_chrom.get(chrom, [])]
            intron_set.update(rec[2] for rec in candidates)
            # remove introns overlapped with annotated exons
            combined_region = Interval(excluded_region[chrom]).interval
            for region in Interval.overlapwith(combined_region, candidates):
                if len(region) >= 3:
                    for intron_info in region[2:]:
                        intron_set.discard(intron_info)

        output_f = '%s/all_intron_info.txt' % output_dir
        with open(output_f, 'w') as output:
            total_i_n = len(intron_set)
            for finished_n, intron_key in enumerate(intron_set, 1):
                chrom, sta, end, strand = intron_key.split()
                junc_id = '\t'.join([chrom, sta, end])
                sta = int(sta)
                end = int(end)
                # circular RNAs (pAminus sample)
                (pir_circ, p1, circ_ri_read, circ_junc_read,
                 circ_intron_read) = _retention_stats(
                    pAminus_junc, pAminus_bam, chrom, sta, end, junc_id)
                # linear RNAs (pAplus sample)
                (pir_linear, p2, linear_ri_read, linear_junc_read,
                 linear_intron_read) = _retention_stats(
                    pAplus_junc, pAplus_bam, chrom, sta, end, junc_id)
                info = '\t'.join(str(round(float(x), 3))
                                 for x in (pir_circ, pir_linear, p1, p2,
                                           circ_ri_read, circ_junc_read,
                                           circ_intron_read, linear_ri_read,
                                           linear_junc_read,
                                           linear_intron_read))
                other_info = '\t'.join(intron_info_list[intron_key])
                output.write('\t'.join([chrom, str(sta), str(end), 'Intron',
                                        '0', strand, other_info, info]))
                output.write('\n')
                sys.stdout.write("Progress: %d/%d \r" % (finished_n,
                                                         total_i_n))
                sys.stdout.flush()
    print('Complete parsing circular RNA introns!')
def extract_cassette_exon(denovo_dir, tophat_dir, pAplus_dir, output_dir,
                          rpkm_flag):
    """
    1. Check each exon and fetch PSI
    2. Calculate RPKM if needed
    Modified from Han et al., Nature, 2013, 498:241-245.

    Every block of a non-ciRNA record in <denovo_dir>/circularRNA_full.txt
    that has a block on both sides (a "middle" exon) is evaluated once.
    When the same exon shows up again, only its annotation is refreshed: a
    stored 'CUFF' (novel) gene is replaced by an annotated one, and the
    largest read count seen is kept.

    One line per exon is written to <output_dir>/all_exon_info.txt:
    chrom, start, end, 'Exon', '0', strand, gene, isoform, reads, followed
    by (rounded to 3 decimals) PSI circular, PSI linear, p (circular >
    linear), p (circular < linear), inclusion/exclusion counts for the
    circular and then the linear sample, and, if `rpkm_flag` is true, the
    circular and linear RPKM.

    Args:
        denovo_dir: directory containing circularRNA_full.txt.
        tophat_dir: directory with the pAminus junctions.bed (and
            accepted_hits.bam when RPKM is requested).
        pAplus_dir: directory with the pAplus junctions.bed (and
            accepted_hits.bam when RPKM is requested).
        output_dir: directory receiving all_exon_info.txt.
        rpkm_flag: truthy to append RPKM columns.
    """
    print('Start to parse circular RNA exons...')
    exons = {}  # exon key -> [gene_info, reads, info]
    # set path
    fusion_f = '%s/circularRNA_full.txt' % denovo_dir
    pAminus_junc_f = tophat_dir + '/junctions.bed'
    (pAminus_junc, pAminus_left_junc,
     pAminus_right_junc) = parse_junc(pAminus_junc_f, 1)
    pAplus_junc_f = '%s/junctions.bed' % pAplus_dir
    (pAplus_junc, pAplus_left_junc,
     pAplus_right_junc) = parse_junc(pAplus_junc_f, 1)
    if rpkm_flag:
        pAminus_bam = Expression('%s/accepted_hits.bam' % tophat_dir)
        pAplus_bam = Expression('%s/accepted_hits.bam' % pAplus_dir)
    with open(fusion_f, 'r') as fusion:
        for line in fusion:
            fields = line.split()
            if fields[13] == 'ciRNA':  # not check ciRNAs
                continue
            reads = fields[12]
            chrom = fields[0]
            start = int(fields[1])
            strand = fields[5]
            sizes = [int(x) for x in fields[10].split(',')]
            offsets = [int(x) for x in fields[11].split(',')]
            gene, iso = fields[14:16]
            gene_info = '\t'.join([strand, gene, iso])  # constant per record
            exon_deque = deque(maxlen=3)  # set exon sliding window
            for s, o in zip(sizes, offsets):
                exon_deque.append([start + o, start + o + s])
                if len(exon_deque) < 3:  # only check middle exon
                    continue
                exon_info = '%s\t%d\t%d' % (chrom, exon_deque[1][0],
                                            exon_deque[1][1])
                if exon_info in exons:
                    record = exons[exon_info]
                    # gene_info is 'strand\tgene\tiso'. The previous
                    # `.find('CUFF')` test was always truthy (find never
                    # returns 0 here), so annotated genes overwrote each
                    # other; test the stored gene name explicitly instead.
                    stored_gene = record[0].split('\t')[1]
                    if (stored_gene.startswith('CUFF') and
                            not gene.startswith('CUFF')):  # annotated exon
                        record[0] = gene_info
                    if int(reads) > int(record[1]):  # more reads
                        record[1] = reads
                    continue
                # fetch junctions for circular RNAs
                (psi_circ, inclusion_circ, exclusion_circ, max_left_circ,
                 max_right_circ) = fetch_psi(exon_info, pAminus_junc,
                                             pAminus_left_junc,
                                             pAminus_right_junc)
                # Both sides must be checked; the right side was previously
                # never tested because the left test was written twice.
                # NOTE(review): 'None' is compared as a string, as before --
                # confirm that is what fetch_psi returns.
                if max_left_circ == 'None' or max_right_circ == 'None':
                    flag = []
                else:
                    flag = [max_left_circ, max_right_circ]
                # fetch junctions for linear RNAs
                (psi_linear, inclusion_linear,
                 exclusion_linear) = fetch_psi(exon_info, pAplus_junc,
                                               pAplus_left_junc,
                                               pAplus_right_junc, flag)
                table = [[inclusion_circ, 2 * exclusion_circ],
                         [inclusion_linear, 2 * exclusion_linear]]
                # fisher exact test (circular > linear)
                odd1, p1 = fisher_exact(table, alternative='greater')
                # fisher exact test (circular < linear)
                odd2, p2 = fisher_exact(table, alternative='less')
                info = '\t'.join(str(round(float(x), 3))
                                 for x in (psi_circ, psi_linear, p1, p2,
                                           inclusion_circ, exclusion_circ,
                                           inclusion_linear,
                                           exclusion_linear))
                if rpkm_flag:
                    circ_exp = pAminus_bam.rpkm(chrom, *exon_deque[1])
                    linear_exp = pAplus_bam.rpkm(chrom, *exon_deque[1])
                    info += '\t%.3f\t%.3f' % (circ_exp, linear_exp)
                exons[exon_info] = [gene_info, reads, info]
    output_f = '%s/all_exon_info.txt' % output_dir
    with open(output_f, 'w') as output:
        for exon in exons:
            chrom, start, end = exon.split()
            output.write('\t'.join([chrom, start, end, 'Exon', '0']))
            output.write('\t' + '\t'.join(exons[exon]))
            output.write('\n')
    print('Complete parsing circular RNA exons!')
def parse_splice_site(denovo_dir, tophat_dir, pAplus_dir):
    """
    Characterize all the alternative splice site selections.

    Reads <denovo_dir>/circ_fusion.txt and, for every pair of adjacent
    blocks of each non-ciRNA record, compares the junction read count with
    the total read counts of its left and right splice sites, in both the
    pAminus (<tophat_dir>/junctions.bed) and pAplus
    (<pAplus_dir>/junctions.bed) junction tables.

    Junctions absent from the pAminus table, or whose pAminus left/right
    site total is zero, are skipped. A site is reported only when its
    pAminus usage differs from 100.

    Each reported line holds: junction location (chrom, start, end), strand,
    pAminus reads, pAminus site total, pAminus usage (%), pAplus reads,
    pAplus site total, pAplus usage (%).

    Output files: <denovo_dir>/all_A5SS_info.txt and
    <denovo_dir>/all_A3SS_info.txt.
    """
    def _plus_usage(reads, total):
        # pAplus usage in percent; 0.0 when the site has no pAplus reads
        if total != 0:
            return reads * 100.0 / total
        return 0.0

    print('Start to parse alternative splice sites...')
    five_prime = set()
    three_prime = set()
    line_fmt = '%s\t%s\t%d\t%d\t%f\t%d\t%d\t%f\n'
    # load junction tables
    circ_junc, circ_left, circ_right = parse_junc(
        tophat_dir + '/junctions.bed', 2)
    lin_junc, lin_left, lin_right = parse_junc(
        '%s/junctions.bed' % pAplus_dir, 2)
    with open('%s/circ_fusion.txt' % denovo_dir, 'r') as fusion:
        for entry in fusion:
            cols = entry.split()
            if cols[13] == 'ciRNA':  # not check ciRNAs
                continue
            chrom = cols[0]
            base = int(cols[1])
            strand = cols[5]
            sizes = [int(x) for x in cols[10].split(',')]
            offsets = [int(x) for x in cols[11].split(',')]
            block_starts = [base + off for _, off in zip(sizes, offsets)]
            block_ends = [base + off + size
                          for size, off in zip(sizes, offsets)]
            # junction i spans end of block i .. start of block i + 1
            for left, right in zip(block_ends[:-1], block_starts[1:]):
                loc = '%s\t%d\t%d' % (chrom, left, right)
                if loc not in circ_junc:  # circ_pcu=0
                    continue
                circ_reads = circ_junc[loc]
                lin_reads = lin_junc[loc] if loc in lin_junc else 0

                left_id = '%s\t%d' % (chrom, left)
                circ_left_total = circ_left[left_id]
                if circ_left_total == 0:  # circ_left_total_reads=0
                    continue
                circ_left_psu = circ_reads * 100.0 / circ_left_total
                lin_left_total = lin_left[left_id]
                lin_left_psu = _plus_usage(lin_reads, lin_left_total)

                right_id = '%s\t%d' % (chrom, right)
                circ_right_total = circ_right[right_id]
                if circ_right_total == 0:  # circ_right_total_reads=0
                    continue
                circ_right_psu = circ_reads * 100.0 / circ_right_total
                lin_right_total = lin_right[right_id]
                lin_right_psu = _plus_usage(lin_reads, lin_right_total)

                left_info = line_fmt % (
                    loc, strand, circ_reads, circ_left_total, circ_left_psu,
                    lin_reads, lin_left_total, lin_left_psu)
                right_info = line_fmt % (
                    loc, strand, circ_reads, circ_right_total,
                    circ_right_psu, lin_reads, lin_right_total,
                    lin_right_psu)
                # on '+' the left site goes to the 3' set and the right site
                # to the 5' set; any other strand swaps the two
                plus_strand = strand == '+'
                if circ_left_psu != 100:
                    (three_prime if plus_strand else five_prime).add(
                        left_info)
                if circ_right_psu != 100:
                    (five_prime if plus_strand else three_prime).add(
                        right_info)
    with open('%s/all_A5SS_info.txt' % denovo_dir, 'w') as output:
        output.write(''.join(five_prime))
    with open('%s/all_A3SS_info.txt' % denovo_dir, 'w') as output:
        output.write(''.join(three_prime))
    print('Complete parsing alternative splice sites!')
def extract_retained_intron(denovo_dir, tophat_dir, pAplus_dir):
    """
    Check each intron and fetch PIR
    Modified from Braunschweig et al., Genome Research, 2014, gr-177790.

    Candidate introns are the non-empty gaps between consecutive blocks of
    every record in <denovo_dir>/circ_fusion.txt whose gene name does not
    start with 'CUFF'. A candidate is retained when Interval.overlapwith
    reports it against the blocks of 'CUFF' (novel) records on the same
    chromosome, and then dropped again when it is reported against the
    excluded regions (ciRNA spans and blocks of non-'CUFF' records). When
    several records share an intron, the gene/isoform/reads of the record
    with the most reads are reported.

    For each remaining intron one line is written to
    <denovo_dir>/all_intron_info.txt:
    chrom, start, end, 'Intron', '0', strand, gene, isoform, reads, followed
    by (rounded to 3 decimals) PIR circular, PIR linear, p circular,
    p linear, and the boundary / junction / intron read counts for the
    circular (pAminus) and then the linear (pAplus) sample.

    Args:
        denovo_dir: directory containing circ_fusion.txt; also receives the
            output file.
        tophat_dir: directory with the pAminus junctions.bed and
            accepted_hits.bam.
        pAplus_dir: directory with the pAplus junctions.bed and
            accepted_hits.bam.
    """
    # Function-scope import so this block does not depend on file-level edits.
    from contextlib import closing

    def _retention_stats(junc, bam, chrom, sta, end, junc_id):
        # Gather read counts for one intron in one sample and derive PIR and
        # the one-sided binomial p-value.
        junc_read = junc[junc_id]
        left_read = fetch_read(bam, chrom, sta - 8, sta + 8)
        right_read = fetch_read(bam, chrom, end - 8, end + 8)
        ri_read = left_read + right_read
        intron_read = fetch_read(bam, chrom, sta, end, flag=0)
        if ri_read == 0 and junc_read == 0:
            pir = 0  # nothing observed; also avoids dividing by zero
        else:
            pir = 100.0 * ri_read / (ri_read + 2 * junc_read)
        # exact binomial test (one-side)
        m = min(left_read, right_read, intron_read)
        n = m + max(left_read, right_read, intron_read)
        p_value = binom.cdf(m, n, 1 / 3.5)
        return pir, p_value, ri_read, junc_read, intron_read

    print('Start to parse circular RNA introns...')
    # set path
    fusion_f = '%s/circ_fusion.txt' % denovo_dir
    pAminus_junc = parse_junc(tophat_dir + '/junctions.bed')
    pAplus_junc = parse_junc('%s/junctions.bed' % pAplus_dir)
    pAminus_bam_f = tophat_dir + '/accepted_hits.bam'
    pAplus_bam_f = '%s/accepted_hits.bam' % pAplus_dir

    excluded_region = defaultdict(list)
    novel_region = defaultdict(list)
    introns_by_chrom = defaultdict(list)
    intron_info_list = {}  # intron key -> [gene, iso, reads]

    # closing() guarantees both BAM handles are released, even on error.
    with closing(pysam.AlignmentFile(pAminus_bam_f, 'rb')) as pAminus_bam, \
            closing(pysam.AlignmentFile(pAplus_bam_f, 'rb')) as pAplus_bam:
        with open(fusion_f, 'r') as fusion:
            for line in fusion:
                fields = line.split()
                chrom = fields[0]
                start = int(fields[1])
                end = int(fields[2])
                strand = fields[5]
                if fields[13] == 'ciRNA':  # not check ciRNAs
                    excluded_region[chrom].append([start, end])
                    continue
                sizes = [int(x) for x in fields[10].split(',')]
                offsets = [int(x) for x in fields[11].split(',')]
                reads = fields[12]
                gene, iso = fields[14:16]
                is_novel = gene.startswith('CUFF')
                target = novel_region if is_novel else excluded_region
                for s, o in zip(sizes, offsets):
                    target[chrom].append([start + o, start + o + s])
                if is_novel:  # only check annotated introns
                    continue
                num = int(fields[9])
                for i in range(num - 1):
                    sta = start + offsets[i] + sizes[i]
                    intron_end = start + offsets[i + 1]
                    if intron_end == sta:  # adjacent blocks, no intron
                        continue
                    intron_info = '%s\t%d\t%d\t%s' % (chrom, sta, intron_end,
                                                      strand)
                    if intron_info in intron_info_list:
                        # keep the annotation backed by the most reads
                        known_reads = intron_info_list[intron_info][2]
                        if int(reads) > int(known_reads):
                            intron_info_list[intron_info] = [gene, iso, reads]
                        continue
                    introns_by_chrom[chrom].append([sta, intron_end,
                                                    intron_info])
                    intron_info_list[intron_info] = [gene, iso, reads]

        intron_set = set()
        for chrom in excluded_region:
            intron_region = []
            # retain introns covered by novel assembled transcripts
            combined_region = Interval(novel_region[chrom]).interval
            for region in Interval.overlapwith(combined_region,
                                               introns_by_chrom[chrom]):
                if len(region) >= 3:
                    for intron_info in region[2:]:
                        # the key starts with chrom, start, end
                        _, i_start, i_end = intron_info.split()[:3]
                        intron_region.append([int(i_start), int(i_end),
                                              intron_info])
                        intron_set.add(intron_info)
            # remove introns overlapped with annotated exons
            combined_region = Interval(excluded_region[chrom]).interval
            for region in Interval.overlapwith(combined_region,
                                               intron_region):
                if len(region) >= 3:
                    for intron_info in region[2:]:
                        intron_set.discard(intron_info)

        output_f = '%s/all_intron_info.txt' % denovo_dir
        with open(output_f, 'w') as output:
            for intron_key in intron_set:
                chrom, sta, end, strand = intron_key.split()
                junc_id = '\t'.join([chrom, sta, end])
                sta = int(sta)
                end = int(end)
                # circular RNAs (pAminus sample)
                (pir_circ, p1, circ_ri_read, circ_junc_read,
                 circ_intron_read) = _retention_stats(
                    pAminus_junc, pAminus_bam, chrom, sta, end, junc_id)
                # linear RNAs (pAplus sample)
                (pir_linear, p2, linear_ri_read, linear_junc_read,
                 linear_intron_read) = _retention_stats(
                    pAplus_junc, pAplus_bam, chrom, sta, end, junc_id)
                # float() first: round(int, 3) stays an int on Python 3, so
                # counts would print as '5' there but '5.0' on Python 2.
                info = '\t'.join(str(round(float(x), 3))
                                 for x in (pir_circ, pir_linear, p1, p2,
                                           circ_ri_read, circ_junc_read,
                                           circ_intron_read, linear_ri_read,
                                           linear_junc_read,
                                           linear_intron_read))
                other_info = '\t'.join(intron_info_list[intron_key])
                output.write('\t'.join([chrom, str(sta), str(end), 'Intron',
                                        '0', strand, other_info, info]))
                output.write('\n')
    print('Complete parsing circular RNA introns!')
def extract_cassette_exon(denovo_dir, tophat_dir, pAplus_dir, rpkm_flag):
    """
    1. Check each exon and fetch PSI
    2. Calculate RPKM if needed
    Modified from Han et al., Nature, 2013, 498:241-245.

    Every block of a non-ciRNA record in <denovo_dir>/circ_fusion.txt that
    has a block on both sides (a "middle" exon) is evaluated once. When the
    same exon shows up again, only its annotation is refreshed: a stored
    'CUFF' (novel) gene is replaced by an annotated one, and the largest
    read count seen is kept.

    One line per exon is written to <denovo_dir>/all_exon_info.txt:
    chrom, start, end, 'Exon', '0', strand, gene, isoform, reads, followed
    by (rounded to 3 decimals) PSI circular, PSI linear, p (circular >
    linear), p (circular < linear), inclusion/exclusion counts for the
    circular and then the linear sample, and, if `rpkm_flag` is true, the
    circular and linear RPKM.

    Args:
        denovo_dir: directory containing circ_fusion.txt; also receives the
            output file.
        tophat_dir: directory with the pAminus junctions.bed (and
            accepted_hits.bam when RPKM is requested).
        pAplus_dir: directory with the pAplus junctions.bed (and
            accepted_hits.bam when RPKM is requested).
        rpkm_flag: truthy to append RPKM columns.
    """
    print('Start to parse circular RNA exons...')
    exons = {}  # exon key -> [gene_info, reads, info]
    # set path
    fusion_f = '%s/circ_fusion.txt' % denovo_dir
    pAminus_junc_f = tophat_dir + '/junctions.bed'
    (pAminus_junc, pAminus_left_junc,
     pAminus_right_junc) = parse_junc(pAminus_junc_f, 1)
    pAplus_junc_f = '%s/junctions.bed' % pAplus_dir
    (pAplus_junc, pAplus_left_junc,
     pAplus_right_junc) = parse_junc(pAplus_junc_f, 1)
    if rpkm_flag:
        pAminus_bam = Expression('%s/accepted_hits.bam' % tophat_dir)
        pAplus_bam = Expression('%s/accepted_hits.bam' % pAplus_dir)
    with open(fusion_f, 'r') as fusion:
        for line in fusion:
            fields = line.split()
            if fields[13] == 'ciRNA':  # not check ciRNAs
                continue
            reads = fields[12]
            chrom = fields[0]
            start = int(fields[1])
            strand = fields[5]
            sizes = [int(x) for x in fields[10].split(',')]
            offsets = [int(x) for x in fields[11].split(',')]
            gene, iso = fields[14:16]
            gene_info = '\t'.join([strand, gene, iso])  # constant per record
            exon_deque = deque(maxlen=3)  # set exon sliding window
            for s, o in zip(sizes, offsets):
                exon_deque.append([start + o, start + o + s])
                if len(exon_deque) < 3:  # only check middle exon
                    continue
                exon_info = '%s\t%d\t%d' % (chrom, exon_deque[1][0],
                                            exon_deque[1][1])
                if exon_info in exons:
                    record = exons[exon_info]
                    # gene_info is 'strand\tgene\tiso'. The previous
                    # `.find('CUFF')` test was always truthy (find never
                    # returns 0 here), so annotated genes overwrote each
                    # other; test the stored gene name explicitly instead.
                    stored_gene = record[0].split('\t')[1]
                    if (stored_gene.startswith('CUFF') and
                            not gene.startswith('CUFF')):  # annotated exon
                        record[0] = gene_info
                    if int(reads) > int(record[1]):  # more reads
                        record[1] = reads
                    continue
                # fetch junctions for circular RNAs
                (psi_circ, inclusion_circ, exclusion_circ, max_left_circ,
                 max_right_circ) = fetch_psi(exon_info, pAminus_junc,
                                             pAminus_left_junc,
                                             pAminus_right_junc)
                # Both sides must be checked; the right side was previously
                # never tested because the left test was written twice.
                # NOTE(review): 'None' is compared as a string, as before --
                # confirm that is what fetch_psi returns.
                if max_left_circ == 'None' or max_right_circ == 'None':
                    flag = []
                else:
                    flag = [max_left_circ, max_right_circ]
                # fetch junctions for linear RNAs
                (psi_linear, inclusion_linear,
                 exclusion_linear) = fetch_psi(exon_info, pAplus_junc,
                                               pAplus_left_junc,
                                               pAplus_right_junc, flag)
                table = [[inclusion_circ, 2 * exclusion_circ],
                         [inclusion_linear, 2 * exclusion_linear]]
                # fisher exact test (circular > linear)
                odd1, p1 = fisher_exact(table, alternative='greater')
                # fisher exact test (circular < linear)
                odd2, p2 = fisher_exact(table, alternative='less')
                # float() first: round(int, 3) stays an int on Python 3, so
                # counts would print as '5' there but '5.0' on Python 2.
                info = '\t'.join(str(round(float(x), 3))
                                 for x in (psi_circ, psi_linear, p1, p2,
                                           inclusion_circ, exclusion_circ,
                                           inclusion_linear,
                                           exclusion_linear))
                if rpkm_flag:
                    circ_exp = pAminus_bam.rpkm(chrom, *exon_deque[1])
                    linear_exp = pAplus_bam.rpkm(chrom, *exon_deque[1])
                    info += '\t%.3f\t%.3f' % (circ_exp, linear_exp)
                exons[exon_info] = [gene_info, reads, info]
    output_f = '%s/all_exon_info.txt' % denovo_dir
    with open(output_f, 'w') as output:
        for exon in exons:
            chrom, start, end = exon.split()
            output.write('\t'.join([chrom, start, end, 'Exon', '0']))
            output.write('\t' + '\t'.join(exons[exon]))
            output.write('\n')
    print('Complete parsing circular RNA exons!')