def main(): description = "Record the distribution of peaks for different exons." args = hk.parse_arguments(description, ["peaks_file", "gtf", "exon_starts_file", "output_file", "reads_file", "from_end", "intronic", "limit", "nts_before_start", "noncoding", "reads_mode"], flags = [5, 6, 9, 10], ints = [7, 8]) peaks_file, gtf, exon_starts_file, output_file, reads_file, from_end, intronic, limit, nts_before_start, noncoding, reads_mode = args.peaks_file, args.gtf, args.exon_starts_file, args.output_file, args.reads_file, args.from_end, args.intronic, args.limit, args.nts_before_start, args.noncoding, args.reads_mode if noncoding: exons = rw.read_gtf(gtf, "exon", gene=False) else: exons = rw.read_gtf(gtf, "CDS", gene=False) # the 3' ss that will be analyzed valid_junctions = rw.read_many_fields(exon_starts_file, "\t") # pull out the column with transcript IDs valid_junctions = [i[3] for i in valid_junctions] lengths_dict = co.get_lengths(exons, valid_junctions, intronic=intronic) if nts_before_start: lengths_dict = {i: lengths_dict[i] + nts_before_start for i in lengths_dict} coverage_file_name = "{0}_{1}_coverage.bed".format(exon_starts_file[:-4], reads_file.split("/")[-1][:-4]) co.get_coverage(exon_starts_file, reads_file, coverage_file_name) peak_distances_all, peak_centres = co.peak_pos_in_exon(exon_starts_file, peaks_file, from_end = from_end, reads_mode = reads_mode) write_dist_mat(peak_distances_all, limit, output_file, lengths_dict, "{0}_intron_names.txt".format(output_file[:-4]), None) write_dist_mat(peak_centres, limit, "{0}_centres.txt".format(output_file[:-4]), lengths_dict, "{0}_centres_intron_names.txt".format(output_file[:-4]), None)
def main(): description = "Given a BED file of reads, filter out reads whose " \ "3' end maps to the last nucleotide of an intron or" \ "the last nucleotide of an exon." args = hk.parse_arguments(description, ["reads_file", "gtf", "outfile"]) reads_file, gtf, outfile = args.reads_file, args.gtf, args.outfile print("Getting intron lariat positions...") # read in exon coordinates exons = rw.read_gtf(gtf, element="exon", gene=False) # make a BED file with the last positions of introns intron_lariat_bed = "{0}_intron_lariat_pos_all_exons.bed".format(reads_file[:-4]) co.write_intron_lariat_pos_from_exons(exons, intron_lariat_bed, add_chr = True) # intersect the reads with intron lariat positions intron_lariat_intersect_file_name = "{0}_intersect_with_intron_lariat_pos_all_exons.bed".format(reads_file[:-4]) co.intersect_bed(reads_file, intron_lariat_bed, force_strand=True, write_both=True, no_dups=False, output_file=intron_lariat_intersect_file_name) hk.remove_file(intron_lariat_bed) intron_lariat_reads_file = "{0}_intron_lariat_reads_all_exons.bed".format(reads_file[:-4]) # check that the reads end exactly at intron lariat positions check_3prime_match(intron_lariat_intersect_file_name, intron_lariat_reads_file) hk.remove_file(intron_lariat_intersect_file_name) # write BED with the last positions of exons splice_intermediate_bed = "{0}_splice_intermediate_pos_all_exons.bed".format(reads_file[:-4]) co.write_si_pos_from_exons(exons, splice_intermediate_bed, add_chr = True) print("Getting splice intermediate positions.") # intersect the reads with splice intermediate positions splice_intermediate_intersect_file_name = "{0}_intersect_with_SI_pos_all_exons.bed".format(reads_file[:-4]) co.intersect_bed(reads_file, splice_intermediate_bed, force_strand=True, write_both=True, no_dups=False, output_file=splice_intermediate_intersect_file_name) hk.remove_file(splice_intermediate_bed) SI_reads_file = "{0}_SI_reads_all_exons.bed".format(reads_file[:-4]) # check that the reads end exactly at the end of the exon check_3prime_match(splice_intermediate_intersect_file_name, SI_reads_file) hk.remove_file(splice_intermediate_intersect_file_name) print("Concatenating the two files.") # concatenate the IL and SI read files so you could exclude both in one go combined_file = "{0}_SI_and_IL_reads_all_exons.bed".format(reads_file[:-4]) hk.run_process(["cat", SI_reads_file, intron_lariat_reads_file], file_for_output=combined_file) hk.remove_file(SI_reads_file) hk.remove_file(intron_lariat_reads_file) # do an exclusive intersect, requiring 1.0 overlap for both A and B, to remove the # putative intron lariat reads from the main reads file co.intersect_bed(reads_file, combined_file, overlap=1, overlap_rec=1, force_strand=True, no_dups=False, exclude=True, output_file=outfile) hk.remove_file(combined_file)
def test_get_flanking_intron_sizes(self):
    exons = rw.read_gtf("tests/get_flanking_intron_sizes_input.gtf", "exon", gene=False)
    expected = {}
    expected["ENSMUST1"] = {"upstream": [None, 4, 4], "downstream": [4, 4, None]}
    expected["ENSMUST8"] = None
    expected["ENSMUST4"] = {"upstream": [None, 1, 4, 2], "downstream": [1, 4, 2, None]}
    observed = get_flanking_intron_sizes(exons)
    self.assertEqual(observed, expected)
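# For reference, a minimal sketch of what get_flanking_intron_sizes could look
# like, inferred from the test above rather than taken from the repository.
# It assumes exons[trans] is a rank-ordered list of GTF field lists with the
# integer start at index 3 and end at index 4 (strand handling omitted), and
# that single-exon transcripts map to None.
def get_flanking_intron_sizes_sketch(exons):
    out = {}
    for trans, exon_list in exons.items():
        if len(exon_list) < 2:
            out[trans] = None
            continue
        # intron between exon i and exon i + 1 (GTF coordinates are 1-based inclusive)
        introns = [exon_list[i + 1][3] - exon_list[i][4] - 1
                   for i in range(len(exon_list) - 1)]
        out[trans] = {"upstream": [None] + introns,
                      "downstream": introns + [None]}
    return out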
def test_get_upstream_intron_size(self):
    exons = rw.read_gtf("tests/get_upstream_intron_size_input.gtf", "exon", gene=False)
    exon_ranks = {"ENSMUST1.0": 0, "ENSMUST1.1": 1, "ENSMUST4.0": 0, "ENSMUST4.2": 2}
    expected = {"ENSMUST1.0": None, "ENSMUST1.1": 4, "ENSMUST4.0": None, "ENSMUST4.2": 4}
    observed = get_upstream_intron_size(exons, exon_ranks)
    self.assertEqual(expected, observed)
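# Again for reference only: a hypothetical sketch of get_upstream_intron_size
# consistent with the test above. Keys in exon_ranks are assumed to be
# "<transcript>.<exon rank>"; first exons (rank 0) have no upstream intron,
# and strand handling is omitted for brevity.
def get_upstream_intron_size_sketch(exons, exon_ranks):
    out = {}
    for junction, rank in exon_ranks.items():
        trans = junction.rsplit(".", 1)[0]
        if rank == 0:
            out[junction] = None
        else:
            exon_list = exons[trans]
            # gap between this exon and the one before it
            out[junction] = exon_list[rank][3] - exon_list[rank - 1][4] - 1
    return out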
def get_transcripts(gtf, out_file, add_chr=False):
    """
    Given a GTF file that has exon coordinates (among others), make an
    output BED file with transcript coordinates.
    :param gtf: input GTF file
    :param out_file: output BED file name
    :param add_chr: if True, prefix "chr" to chromosome names
    :return: None
    """
    exons = rw.read_gtf(gtf, "exon", gene=False)
    with open(out_file, "w") as file:
        out_writer = csv.writer(file, delimiter="\t")
        for trans in sorted(list(exons.keys())):
            starts = [i[3] for i in exons[trans]]
            ends = [i[4] for i in exons[trans]]
            # any exon will do as a template for the fields that are
            # shared between all the exons of a transcript
            template = exons[trans][0]
            if add_chr:
                chrom = "chr{0}".format(template[0])
            else:
                chrom = template[0]
            # convert to BED (0-based, half-open coordinates)
            to_write = [chrom, min(starts) - 1, max(ends), trans, ".", template[6]]
            out_writer.writerow(to_write)
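# Hypothetical usage (file names are placeholders): build a transcript-level
# BED from an Ensembl GTF, prefixing "chr" so that the chromosome names match
# UCSC-style BED files.
#
#     get_transcripts("annotation.gtf", "annotation_transcripts.bed", add_chr=True)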
def main(): description = "Call peaks in a BED file of NET-seq reads." help_info = [ "BED file (at least a BED6) with NET-seq reads. Should be single-nucleotide resolution (each BED region is the 3' end of a read.).", "Ensembl GTF file for the relevant species. Ensure that chromosome names are formatted the same way in both the GTF and the BED file with reads!", "BED file with the coordinates of the transcripts to analyze. Only the name field is read, hence the others can hold placeholders. The name field must contain transcript IDs from the GTF file.", "Name of the output file (BED file with peak coordinates).", "Alpha value for calling a position as having a significantly higher local read denisty than expected by chance. Default: 0.01.", "Merge distance: adjacent peaks will be merged if they are closer than this many nucleotides. Default: 21.", "Minimum reads per peak. Default: 10.", "The number of times the read position randomization should be performed for each transcript. Higher values make the significance calculation (marginally) more robust, however, they also make the programme very slow. Default: 5.", "Minimum length of a peak in nucleotides. Default: 5.", "Size of the sliding window to use when calculating the local read density. It may be sensible to set this to the same value as the merge distance. Should be an odd integer. Default: 21", "The analysis will be performed this many times, with the output files numbered. Useful for running many negative control simulations at once. Default: 1.", "Read positions will be shuffled within each transcript before analysis. This should disrupt any signal and should give a flat peak density profile.", "Instead of a sliding window, adjacent non-overlapping windows will be used when calculating the local read density.", "When calling peaks in a given exon/intron, do not include that exon/intron in the read position randomization.", "When --exclude_focal is set, count an exon and its upstream intron as a single unit (except for the first exon).", "Don't filter out likely PCR duplicates (peaks where more than 90%% of the reads come from a single nucleotide position).)" ] defaults = {4: 0.01, 5: 21, 6: 10, 7: 5, 8: 5, 9: 21, 10: 1} args = hk.parse_arguments(description, [ "reads_file", "gtf", "trans_active_file", "output_file", "significance_threshold", "merge", "min_reads_per_peak", "iterations", "min_peak_length", "window_size", "runs", "neg_control", "no_slide", "exclude_focal", "with_ups_intron", "no_PCR_filter" ], floats=[4], ints=[5, 6, 7, 8, 9, 10], flags=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], detailed_help=help_info, defaults=defaults) reads_file, gtf, trans_active_file, output_file, significance_threshold, merge, min_reads_per_peak, iterations, min_peak_length, window_size, runs, neg_control, no_slide, exclude_focal, with_ups_intron, no_PCR_filter = args.reads_file, args.gtf, args.trans_active_file, args.output_file, args.significance_threshold, args.merge, args.min_reads_per_peak, args.iterations, args.min_peak_length, args.window_size, args.runs, args.neg_control, args.no_slide, args.exclude_focal, args.with_ups_intron, args.no_PCR_filter print("Merge distance: {0}".format(merge)) print("Minimum number of reads per peak: {0}".format(min_reads_per_peak)) print("Minimum peak length: {0}".format(min_peak_length)) print("Window size: {0}".format(window_size)) print("Significance level: {0}".format(significance_threshold)) print("Randomization iterations to perform: {0}".format(iterations)) print("Runs: {0}".format(runs)) neg_str 
= "" if neg_control: neg_str = "_neg_control" slide_str = "" if no_slide: slide_str = "_no_slide" intron_str = "" if with_ups_intron: intron_str = "w_ups_intr" # 0. make a BED file with the coordinates of transcripts transcripts_file = "{0}_transcripts.bed".format(gtf[:-4]) co.get_transcripts(gtf, transcripts_file, add_chr=True) exons = rw.read_gtf(gtf, "exon") # 1. intersect the two files, loop over the result and make a # dictionary of reads per pos for each transcript, which has reads reads_per_pos = get_reads_per_pos(reads_file, transcripts_file) # only leave transcriptionally active genes (one isoform per gene) trans_active_genes = rw.read_many_fields(trans_active_file, "\t")[1:] # pull out the column with transcript IDs trans_active_genes = [i[3] for i in trans_active_genes] reads_per_pos = { i: reads_per_pos[i] for i in reads_per_pos if i.split(".")[-1] in trans_active_genes } for sim in range(runs): print("**********{0}**********".format(sim)) # 2. for each transcript, randomly reshuffle the reads and calculate the # nth percentile depending on what the significance threshold is # keep positions that are higher than that threshold and write to BED file raw_peak_bed = "{0}_{1}_raw_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format( reads_file[:-4], gtf.split("/")[-1][:-4], iterations, min_reads_per_peak, window_size, neg_str, intron_str, slide_str, sim) read_count_file = "{0}_{1}_read_counts{2}_{3}{4}{5}_{6}_sim.txt".format( reads_file[:-4], gtf.split("/")[-1][:-4], iterations, window_size, neg_str, intron_str, sim) new_reads_file = write_raw_peaks(reads_per_pos, raw_peak_bed, read_count_file, exons, iterations=iterations, min_read_count=min_reads_per_peak, window_size=window_size, neg_control=neg_control, no_slide=no_slide, exclude_focal=exclude_focal, with_ups_intron=with_ups_intron) if neg_control: reads_file = new_reads_file # 3. merge peaks merged_peak_bed = "{0}_{1}_merged_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format( reads_file[:-4], gtf.split("/")[-1][:-4], iterations, window_size, merge, neg_str, slide_str, intron_str, sim) co.merge_bed(raw_peak_bed, merged_peak_bed, merge) print("Before filtering, there are {0} peaks.".format( hk.line_count(merged_peak_bed))) # 4. filter out peaks that don't have enough reads or are too short. # Write final results to file and also write a stats file with the size, # read count and overlapping transcript of the peaks stats_file = "{0}_stats_{1}_sim.txt".format(output_file[:-4], sim) filter_peaks(merged_peak_bed, reads_file, read_count_file, "{0}_{1}_sim.bed".format(output_file[:-4], sim), min_reads_per_peak, min_peak_length, stats_file, no_PCR_filter=no_PCR_filter)
def main(): description = "Prepare a BED file with the TES coordinates of transcriptionally" \ "active genes and make a metagene of reads within this region." args = hk.parse_arguments(description, ["trans_act_file", "gtf", "start_coord", "end_coord", "outname", "reads_file"], ints = [2, 3]) trans_act_file, gtf, start_coord, end_coord, outname, reads_file = args.trans_act_file, args.gtf, args.start_coord, args.end_coord, args.outname, args.reads_file trans_act_genes = [] with open(trans_act_file) as f: reader = csv.reader(f, delimiter = "\t") for line in reader: trans_act_genes.append(line[3]) exons = rw.read_gtf(gtf, "exon") CDSs = rw.read_gtf(gtf, "CDS") exons = {i: exons[i] for i in exons if i in trans_act_genes} # protein-coding only exons = {i: exons[i] for i in exons if i in CDSs} ds_500 = "{0}_ds_500.bed".format(outname[:-4]) with open(outname, "w") as out, open(ds_500, "w") as out_ds: writer = csv.writer(out, delimiter="\t") writer_ds = csv.writer(out_ds, delimiter="\t") for trans in exons: strand = exons[trans][0][6] chrom = "chr{0}".format(exons[trans][0][0]) if strand == "+": TES = exons[trans][-1][4] new_start = TES - start_coord new_end = TES + end_coord new_start_ds = TES new_end_ds = TES + 500 else: TES = exons[trans][-1][3] new_start = TES - start_coord - 1 new_end = TES + start_coord - 1 new_start_ds = TES - 500 - 1 new_end_ds = TES - 1 writer.writerow([chrom, new_start, new_end, trans, "0", strand]) chrom = chrom.lstrip("chr") writer_ds.writerow([chrom, new_start_ds, new_end_ds, trans, "0", strand]) intersect = "{0}_ds500_intersect.bed".format(outname[:-4]) transcripts_file = "{0}_transcripts.bed".format(gtf[:-4]) co.intersect_bed(ds_500, transcripts_file, write_both = True, force_strand=False, no_dups = False, output_file=intersect) co.get_transcripts(gtf, transcripts_file, with_detail=True) mapping = co.transcript_mapping(transcripts_file) to_exclude = [] with open(intersect) as int_file: reader = csv.reader(int_file, delimiter = "\t") for line in reader: strand = line[5] curr_gene = mapping[line[3]] other_gene = mapping[line[9]] if curr_gene != other_gene: to_exclude.append(line[3]) filtered_out_name = "{0}_filt.txt".format(outname[:-4]) with open(filtered_out_name, "w") as filt_f: for name in to_exclude: filt_f.write("{0}\n".format(name)) final_out_name = "{0}_distrib.bed".format(outname[:-4]) distances = co.peak_pos_in_exon(outname, reads_file, from_end = True, reads_mode = True)[0] write_dist_mat(distances, start_coord + end_coord, final_out_name, None, "{0}_names.txt".format(final_out_name[:-4]), None)
def main(): description = "Record splicing distance." args = hk.parse_arguments(description, ["input_file", "gtf", "output_folder", "trans_active_file", "window_size", "intron_window_size", "outsuffix", "leave_terminal"], ints = [4, 5], flags = [7]) input_file, gtf, output_folder, trans_active_file, window_size, intron_window_size, outsuffix, leave_terminal = args.input_file, args.gtf, args.output_folder, args.trans_active_file, args.window_size, args.intron_window_size, args.outsuffix, args.leave_terminal if outsuffix == "None": outsuffix = "" bare_input_path = input_file.split("/")[-1] bed = "{0}.bed".format(input_file[:-4]) # hk.convert2bed(input_file, bed) # get descriptive stats of the reads length_file = "{0}/{1}_read_lengths.txt".format(output_folder, bare_input_path[:-4]) write_read_lengths(bed, length_file) # read in CDS coordinates exons = rw.read_gtf(gtf, "CDS", gene=False) # only leave transcriptionally active genes (one isoform per gene) trans_active_genes = rw.read_many_fields(trans_active_file, "\t")[1:] # pull out the column with transcript IDs trans_active_genes = [i[3] for i in trans_active_genes] exons = {i: exons[i] for i in exons if i in trans_active_genes} terminal_suff = "_with_terminal" if not leave_terminal: # remove last exons exons = {i: exons[i][:-1] for i in exons} terminal_suff = "" # prepare exon-exon junctions exon_junctions_file = "{0}_exon_junctions{1}{2}.bed".format(gtf[:-4], outsuffix, terminal_suff) all_junctions = co.extract_3ss(exons, exon_junctions_file) out_bed = "{0}/{1}_first_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff) write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True) out_bed_end = "{0}/{1}_last_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff) write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True) intron_bed = "{0}/{1}_first_{2}_intronic_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], intron_window_size, outsuffix, terminal_suff) write_intron_starts(all_junctions, intron_bed, exons, intron_window_size, add_chr=True) out_bed = "{0}/{1}_first_centred_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff) write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True, centre=True) out_bed_end = "{0}/{1}_last_centred_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff) write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True, centre=True) out_bed_si = "{0}/{1}_si_pos{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff) write_si_pos(all_junctions, out_bed_si, exons, add_chr=True) out_bed_si_current = "{0}/{1}_si_pos_current{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff) write_si_pos(all_junctions, out_bed_si_current, exons, add_chr=True, curr_exon=True) # check which junctions are associated with a splicing intermediate read snr_bed = "{0}_snr.bed".format(bed[:-4]) co.snr_bed(bed, snr_bed) si_counts_bed = "{0}/{1}_si_counts{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff) co.intersect_bed(out_bed_si, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_bed) si_counts_current_bed = "{0}/{1}_si_counts_current{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff) 
co.intersect_bed(out_bed_si_current, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_current_bed) # filter out reads that don't overlap exon-exon junctions exon_junction_bed = "{0}_exon_junctions{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff) co.intersect_bed(bed, exon_junctions_file, write_both=True, output_file=exon_junction_bed, force_strand=True, no_dups=False) spliced_bed = "{0}_spliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff) unspliced_bed = "{0}_unspliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff) sr_distances = {} ur_distances = {} found_count = 0 file_size = hk.line_count(exon_junction_bed) # will store all the intron names for which there are # either spliced or unspliced reads valid_junctions = [] with open(exon_junction_bed) as file, open(spliced_bed, "w") as sfile, open(unspliced_bed, "w") as ufile: for pos, line in enumerate(file): if pos % 100000 == 0: print("{0}/{1}".format(pos, file_size)) print("Found {0} spliced reads.".format(found_count)) print("\n") line = line.split("\t") # reads that end at the last nucleotide of an exon intermediate_read = NGS.check_intermediate_read(line, exons) intron_name = line[20] if not intermediate_read: # check that it ends within the exon just downstream of # the 3' ss that is being analyzed in_dwns_exon = NGS.check_position_in_exon(line, exons) if in_dwns_exon: # 'spliced', 'unspliced' or 'None' (=can't analyze) read_type = NGS.analyze_cigar(line, overhang = 5) if read_type: if intron_name not in valid_junctions: valid_junctions.append(intron_name) splice_dist = NGS.get_splice_dist(line) if read_type == "S": sfile.write("\t".join([str(i) for i in line])) found_count = found_count + 1 sr_distances = update_dist_dict(intron_name, sr_distances, splice_dist) else: ufile.write("\t".join([str(i) for i in line])) ur_distances = update_dist_dict(intron_name, ur_distances, splice_dist) print("Proportion of spliced reads: {0}.".format(found_count/(pos + 1))) # for each valid junction, calculate the length of the exonic sequence # afterwards, so that you wouldn't consider intronic sequence in the distance # matrix lengths_dict = co.get_lengths(exons, valid_junctions) write_dist_mat(sr_distances, window_size, "{0}/{1}_spliced_read_distances_{2}{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff), lengths_dict, "{0}/{1}_spliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff), "{0}/{1}_spliced_read_first_spliced{2}{3}.txt".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)) write_dist_mat(ur_distances, window_size, "{0}/{1}_unspliced_read_distances_{2}{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff), lengths_dict, "{0}/{1}_unspliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff), "{0}/{1}_unspliced_read_first_unspliced{2}{3}.txt".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff))
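# An illustrative sketch of the kind of CIGAR check that NGS.analyze_cigar
# performs (the actual function is not shown in this section, so the return
# convention is an assumption based on how it is used above): a read is called
# spliced ("S") if its CIGAR contains an N (skipped-region) operation with at
# least `overhang` aligned bases on either side of every junction, unspliced
# ("U") if it has no N operation, and None if it cannot be classified.
import re

def analyze_cigar_sketch(cigar, overhang=5):
    ops = re.findall(r"(\d+)([MIDNSHP=X])", cigar)
    n_indices = [i for i, (_, op) in enumerate(ops) if op == "N"]
    if not n_indices:
        return "U"
    for i in n_indices:
        # aligned bases flanking the junction
        left = sum(int(n) for n, op in ops[:i] if op in "M=X")
        right = sum(int(n) for n, op in ops[i + 1:] if op in "M=X")
        if left < overhang or right < overhang:
            return None
    return "S"

# e.g. analyze_cigar_sketch("30M1000N40M") returns "S",
# analyze_cigar_sketch("70M") returns "U",
# analyze_cigar_sketch("3M1000N67M") returns None (overhang too short).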
def main(): description = "Aggregate various statistics on the splicing events you're studying." args = hk.parse_arguments(description, [ "gtf", "polII_bed", "exon_start_coords", "truncated_exons_file", "genome_file", "output_file" ]) gtf, polII_bed, exon_start_coords, truncated_exons_file, genome_file, output_file = args.gtf, args.polII_bed, args.exon_start_coords, args.truncated_exons_file, args.genome_file, args.output_file CDSs = rw.read_gtf(gtf, "CDS", gene=False) exons = rw.read_gtf(gtf, "exon", gene=False) exon_starts = rw.read_many_fields(exon_start_coords, skip_header=False, delimiter="\t") exon_starts = {i[3]: i for i in exon_starts} out_array = np.array(sorted(exon_starts.keys()), dtype="str") out_array.shape = (len(exon_starts.keys()), 1) out_array = np.vstack((["junction"], out_array)) #1. exon size curr_dict = co.get_lengths(CDSs, exon_starts.keys()) out_array = add_to_array(out_array, curr_dict, "exon_size") print("Exon size done.") #2. exon number curr_dict = co.get_exon_number(exons, exon_starts.keys()) out_array = add_to_array(out_array, curr_dict, "exon_number") print("Exon number done.") #3. exon rank (from start and end) exon_rank_start, exon_rank_end = co.get_exon_rank(exons, exon_starts) out_array = add_to_array(out_array, exon_rank_start, "exon_rank_from_start") out_array = add_to_array(out_array, exon_rank_end, "exon_rank_from_end") print("Exon rank done.") #4. upstream intron size curr_dict = co.get_upstream_intron_size(exons, exon_rank_start) out_array = add_to_array(out_array, curr_dict, "upstream_intron_size") curr_dict = co.get_upstream_intron_size(exons, exon_rank_start, downstream=True) out_array = add_to_array(out_array, curr_dict, "downstream_intron_size") print("Intron size done.") if truncated_exons_file != "None": #5. Pol II density per transcript dens_per_trans_file = "{0}_dens_per_trans.txt".format(polII_bed[:-4]) dens_per_trans_junctions = get_dens_per_trans(truncated_exons_file, polII_bed, dens_per_trans_file, out_array[1:, 0]) out_array = add_to_array(out_array, dens_per_trans_junctions, "polII_dens_per_trans") print("Pol II density done.") #6. exon GC4 and GC content genome = Fasta(genome_file) curr_dict = get_exon_GC4(CDSs, exons, exon_rank_start, genome) out_array = add_to_array(out_array, curr_dict, "exon_GC4") curr_dict = get_exon_GC(exons, exon_rank_start, genome) out_array = add_to_array(out_array, curr_dict, "exon_GC") print("Exon GC done.") #7. upstream intron GC content curr_dict = get_upstream_intron_GC(exons, exon_rank_start, genome) out_array = add_to_array(out_array, curr_dict, "upstream_intron_GC") print("Intron GC done.") #8. splice site strength curr_dict = nc.get_ss_strength(exons, genome_file, upstream=True, five=True, exonic=3, intronic=6) out_array = add_to_array(out_array, curr_dict, "upstream_5ss_strength") curr_dict = nc.get_ss_strength(exons, genome_file, upstream=True, five=False, exonic=3, intronic=20) out_array = add_to_array(out_array, curr_dict, "upstream_3ss_strength") curr_dict = nc.get_ss_strength(exons, genome_file, upstream=False, five=True, exonic=3, intronic=6) out_array = add_to_array(out_array, curr_dict, "downstream_5ss_strength") print("Splice site strength done.") with open(output_file, "w") as file: for line in range(0, out_array.shape[0]): line = out_array[line, :] line = "\t".join([str(i) for i in line]) file.write(line) file.write("\n")