def buildAnnotDB(db, chr2gtf_lines, db_name, use_gene_name):
    """Build the annotation tables chromosome by chromosome.

    For each chromosome in ``chr2gtf_lines`` every GTF line is parsed into a
    ``GFF_Record``.  Exon records are grouped by transcript id and passed to
    ``buildExonTable``; ``buildGeneTable`` and ``inferIntrons`` are then
    called for the same chromosome.

    Args:
        db: database handle, passed through to the table builders.
        chr2gtf_lines: mapping of the form {chromosome: [gtf_line, ...]}.
        db_name: database name, passed through to the table builders.
        use_gene_name: passed through to ``buildExonTable``.
    """
    for chrom in chr2gtf_lines:
        # Dictionaries of the form {transcript_id: [gff_obj]}.
        # Only exon records are collected at the moment; the other feature
        # dictionaries are intentionally handed to buildExonTable empty
        # because collection of UTR / codon / CDS features is disabled.
        five_utr_dict = {}
        three_utr_dict = {}
        start_codon_dict = {}
        stop_codon_dict = {}
        cds_dict = {}
        exon_dict = {}

        # Holds transcript and strand information separately
        # {transcript_id: strand}
        transcript_id2strand = {}

        for line in chr2gtf_lines[chrom]:
            gff_obj = GFF_Record(line)
            transcript_id = get_transcript_id(gff_obj)

            # The first record seen for a transcript determines its strand.
            # Note that this is recorded for every feature type, not only
            # for exons.
            if transcript_id not in transcript_id2strand:
                transcript_id2strand[transcript_id] = gff_obj.strand

            if gff_obj.feature == 'exon':
                updateDictOfLists(exon_dict, transcript_id, gff_obj)
            else:
                # print() with a single pre-formatted argument behaves the
                # same on Python 2 and Python 3.
                print("Not using feature %s" % (gff_obj.feature))

        buildExonTable(db, chrom, transcript_id2strand, five_utr_dict,
                       three_utr_dict, start_codon_dict, stop_codon_dict,
                       cds_dict, exon_dict, db_name, use_gene_name)

        buildGeneTable(db, db_name, chrom)
        inferIntrons(db, db_name, chrom)
def buildAnnotDB(db, chr2gtf_lines, db_name, use_gene_name):
    """Build the annotation tables chromosome by chromosome.

    For each chromosome in ``chr2gtf_lines`` every GTF line is parsed into a
    ``GFF_Record``.  Records whose feature is not ``'exon'`` are skipped
    before their transcript id is looked up.  Exon records are grouped by
    transcript id and passed to ``buildExonTable``; ``buildGeneTable`` and
    ``inferIntrons`` are then called for the same chromosome.

    Args:
        db: database handle, passed through to the table builders.
        chr2gtf_lines: mapping of the form {chromosome: [gtf_line, ...]}.
        db_name: database name, passed through to the table builders.
        use_gene_name: passed through to ``buildExonTable``.
    """
    for chrom in chr2gtf_lines:
        # Dictionaries of the form {transcript_id: [gff_obj]}.
        # Only exon records are collected; the other feature dictionaries
        # are intentionally handed to buildExonTable empty because
        # collection of UTR / codon / CDS features is disabled.
        five_utr_dict = {}
        three_utr_dict = {}
        start_codon_dict = {}
        stop_codon_dict = {}
        cds_dict = {}
        exon_dict = {}

        # Holds transcript and strand information separately
        # {transcript_id: strand}
        transcript_id2strand = {}

        for line in chr2gtf_lines[chrom]:
            gff_obj = GFF_Record(line)

            # Everything below only applies to exon records.
            if gff_obj.feature != 'exon':
                continue

            transcript_id = get_transcript_id(gff_obj)

            # The first exon seen for a transcript determines its strand.
            if transcript_id not in transcript_id2strand:
                transcript_id2strand[transcript_id] = gff_obj.strand

            # The former "Not using feature" branch was unreachable after
            # the early ``continue`` above and has been removed.
            updateDictOfLists(exon_dict, transcript_id, gff_obj)

        buildExonTable(db, chrom, transcript_id2strand, five_utr_dict,
                       three_utr_dict, start_codon_dict, stop_codon_dict,
                       cds_dict, exon_dict, db_name, use_gene_name)

        buildGeneTable(db, db_name, chrom)
        inferIntrons(db, db_name, chrom)
def make_paired_end_ie_junctions2qname(all_introns, input_dir, read_lengths,
                                       overhang, samp, paired_read_set):
    """Write the intron/exon junction -> read name table for one sample.

    For every read length, the sample's temporary ``*_confident_ie.txt`` and
    ``*_intron_exon_junction_coords_w_read.out`` files are read.  A read
    (first column) is associated with a junction when the region in the last
    column maps to a junction via ``getTheseRegionCoords`` and that junction
    is listed in the confident set.  The result is written to
    ``<input_dir><samp>/<samp>_paired_end_ie_junctions2qname.txt`` as
    ``junction<TAB>qname1,qname2,...``.

    Args:
        all_introns: passed to ``getTheseRegionCoords``.
        input_dir: directory prefix; concatenated directly with ``samp``.
        read_lengths: iterable of read lengths to process.
        overhang: passed to ``getTheseRegionCoords``.
        samp: sample name (sub-directory and file-name component).
        paired_read_set: not used by this function; kept for callers.
    """
    samp_dir = input_dir + samp + "/"
    ie2qname_file_name = samp_dir + samp + "_paired_end_ie_junctions2qname.txt"

    # {ie_junction: [qname, ...]} accumulated over all read lengths
    ie2qnames = {}

    # The output is opened first (as before) so that an unwritable location
    # fails before any input is parsed; ``with`` guarantees it is closed
    # even if reading one of the inputs raises.
    with open(ie2qname_file_name, "w") as ie2qname_file:
        for read_len in read_lengths:
            # Region coords that will be searched within the region and read
            # association file
            region_coord2ie = getTheseRegionCoords(all_introns, read_len,
                                                   overhang)

            tmp_prefix = samp_dir + "tmp_" + samp + "_" + repr(read_len)

            with open(tmp_prefix + "_confident_ie.txt") as confident_ie_file:
                confident_ie_set = set(formatLine(line)
                                       for line in confident_ie_file)

            coords_w_reads_file_name = (
                tmp_prefix + "_intron_exon_junction_coords_w_read.out")
            with open(coords_w_reads_file_name) as coords_w_reads_file:
                for line in coords_w_reads_file:
                    line_list = formatLine(line).split("\t")
                    region_coord = line_list[-1]
                    if region_coord not in region_coord2ie:
                        continue
                    ie = region_coord2ie[region_coord]
                    if ie in confident_ie_set:
                        updateDictOfLists(ie2qnames, ie, line_list[0])

        for ie in ie2qnames:
            ie2qname_file.write("%s\t%s\n" % (ie, ",".join(ie2qnames[ie])))
def make_paired_end_ie_junctions2qname(all_introns, input_dir, read_lengths,
                                       overhang, samp, paired_read_set):
    """Write the intron/exon junction -> read name table for one sample.

    For every read length, the sample's temporary ``*_confident_ie.txt`` and
    ``*_intron_exon_junction_coords_w_read.out`` files are read.  A read
    (first column) is associated with a junction when the region in the last
    column maps to a junction via ``getTheseRegionCoords`` and that junction
    is listed in the confident set.  The result is written to
    ``<input_dir><samp>/<samp>_paired_end_ie_junctions2qname.txt`` as
    ``junction<TAB>qname1,qname2,...``.

    Args:
        all_introns: passed to ``getTheseRegionCoords``.
        input_dir: directory prefix; concatenated directly with ``samp``.
        read_lengths: iterable of read lengths to process.
        overhang: passed to ``getTheseRegionCoords``.
        samp: sample name (sub-directory and file-name component).
        paired_read_set: not used by this function; kept for callers.
    """
    samp_dir = input_dir + samp + "/"
    ie2qname_file_name = samp_dir + samp + "_paired_end_ie_junctions2qname.txt"

    # {ie_junction: [qname, ...]} accumulated over all read lengths
    ie2qnames = {}

    # Context managers replace the manual open()/close() pairs so that no
    # handle is leaked when one of the inputs is missing or malformed.
    with open(ie2qname_file_name, "w") as ie2qname_file:
        for read_len in read_lengths:
            # Region coords that will be searched within the region and read
            # association file
            region_coord2ie = getTheseRegionCoords(all_introns, read_len,
                                                   overhang)

            tmp_prefix = samp_dir + "tmp_" + samp + "_" + repr(read_len)
            confident_ie_file_name = tmp_prefix + "_confident_ie.txt"
            coords_w_reads_file_name = (
                tmp_prefix + "_intron_exon_junction_coords_w_read.out")

            confident_ie_set = set()
            with open(confident_ie_file_name) as confident_ie_file:
                for line in confident_ie_file:
                    confident_ie_set.add(formatLine(line))

            with open(coords_w_reads_file_name) as coords_w_reads_file:
                for line in coords_w_reads_file:
                    line_list = formatLine(line).split("\t")
                    region_coord = line_list[-1]
                    if region_coord in region_coord2ie:
                        ie = region_coord2ie[region_coord]
                        if ie in confident_ie_set:
                            updateDictOfLists(ie2qnames, ie, line_list[0])

        for ie in ie2qnames:
            outline = "%s\t%s\n" % (ie, ",".join(ie2qnames[ie]))
            ie2qname_file.write(outline)
def getGTFLines(gtf_file):
    """
    Returns a dictionary associating chromosome to the gtf line
    {chr:[lines,]}

    Lines starting with "#" (comments) and blank lines are skipped.  Each
    remaining line is passed through ``formatLine`` and grouped under its
    first tab-separated field.

    Args:
        gtf_file: path of the GTF file to read.
    """
    chr2gtf_lines = {}

    # ``with`` closes the handle even if a line fails to parse; the local
    # names no longer shadow the builtins ``file`` and ``chr``.
    with open(gtf_file) as gtf_handle:
        for line in gtf_handle:
            # Remove comments
            if line.startswith("#"):
                continue

            # A blank line has no chromosome field and would otherwise be
            # stored under a bogus key.
            if not line.strip():
                continue

            line = formatLine(line)
            chrom = line.split("\t")[0]
            updateDictOfLists(chr2gtf_lines, chrom, line)

    return chr2gtf_lines
def main():
    """Count reads at the exon/intron junctions of a set of introns.

    Workflow:
      1. Parse and validate the command line.
      2. Read intron coordinates (tab-delimited or BED) and derive a "left"
         and a "right" region of +/- ``flanking_dist`` around each intron
         boundary; write the distinct regions to
         ``<prefix>_intron_exon_junction_coords.out``.
      3. Run ``COUNTING_SCRIPT`` on those regions to obtain the read/region
         association file.
      4. Tally, per intron and side, how many reads end at each position.
      5. Write ``<prefix>_intron_exon_junction_counts.txt`` and the set of
         confident junctions to ``<prefix>_confident_ie.txt``.

    Exits the process with status 0 on success and 1 on a usage error.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option("-i",
                          dest="intron_coords",
                          type="string",
                          help="File of intron coordinates. Format: "
                               "type, chr, strand, start, end",
                          default=None)
    opt_parser.add_option("-b",
                          dest="bed_intron_coords",
                          type="string",
                          help="BED file of intron coordinates.",
                          default=None)
    opt_parser.add_option("-a",
                          dest="read_alignments",
                          type="string",
                          help="File of alignments to genome. Format: "
                               "chr, start, strand",
                          default=None)
    opt_parser.add_option("-f",
                          dest="flanking_dist",
                          type="int",
                          help="Distance away from exon intron junction to "
                               "check for reads in.",
                          default=None)
    opt_parser.add_option("-o",
                          dest="offsets",
                          type="int",
                          help="Minimum number of offsets required at each "
                               "exon/intron junction. Default=1",
                          default=1)
    # NOTE(review): "-l" is checked as required below but has default=1, so
    # the check presumably can never fail -- confirm against check_required
    # before changing the default.
    opt_parser.add_option("-l",
                          dest="read_length",
                          type="int",
                          help="Length of the reads.",
                          default=1)
    opt_parser.add_option("--out_dir",
                          dest="out_dir",
                          type="string",
                          help="Output files are put here.",
                          default=None)
    opt_parser.add_option("--out_prefix",
                          dest="prefix",
                          type="string",
                          help="Prefix attached to all output files.",
                          default=None)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("-a")
    opt_parser.check_required("-f")
    opt_parser.check_required("-l")
    opt_parser.check_required("--out_dir")
    opt_parser.check_required("--out_prefix")

    # Check that the COUNTING_SCRIPT path is valid
    if not os.path.exists(COUNTING_SCRIPT):
        print("Please change COUNTING_SCRIPT path.")
        opt_parser.print_help()
        sys.exit(1)

    if options.intron_coords and options.bed_intron_coords:
        print("Only one type of intron coord can be used as input.")
        opt_parser.print_help()
        sys.exit(1)

    if (not options.intron_coords) and (not options.bed_intron_coords):
        print(" Need to specify intron coordinates. \nSee options -i or -b")
        opt_parser.print_help()
        sys.exit(1)

    # Exactly one of -i / -b is set at this point.
    isBedFormat = bool(options.bed_intron_coords)
    if isBedFormat:
        intron_coords_path = options.bed_intron_coords
    else:
        intron_coords_path = options.intron_coords

    read_alignments = options.read_alignments
    read_length = options.read_length
    flanking_dist = options.flanking_dist
    offsets = options.offsets
    prefix = options.prefix

    out_dir = options.out_dir
    if not out_dir.endswith("/"):
        out_dir += "/"

    if not os.path.exists(out_dir):
        print("Output directory does not exist")
        sys.exit(1)

    # Intermediate Output Files
    out_coords_file = out_dir + prefix + "_intron_exon_junction_coords.out"
    out_read_assoc_file = (out_dir + prefix +
                           "_intron_exon_junction_coords_w_read.out")

    # Final output.  Opened up front so an unwritable output location is
    # detected before the counting script is run.
    out_file_name = out_dir + prefix + "_intron_exon_junction_counts.txt"
    out_file = open(out_file_name, "w")

    confident_ie_name = out_dir + prefix + "_confident_ie.txt"
    confident_ie_file = open(confident_ie_name, "w")

    # {(chr, start, end): [intron_coord_str, ...]} for the region around the
    # left / right exon-intron junction of each intron
    left_region_coord2intron = {}
    right_region_coord2intron = {}

    # {intron_coord_str: {"left": {pos: count},
    #                     "right": {pos: count}}}
    intron_dict = {}

    regions_set = set()

    with open(intron_coords_path) as intron_coords:
        for line in intron_coords:
            line = formatLine(line)

            if isBedFormat:
                if line.startswith("track"):
                    continue
                chrom, start_str, end_str = parseBEDLine(line)
            else:
                # The type and strand columns are not used here.
                _type, chrom, _strand, start_str, end_str = line.split("\t")

            if chrom.startswith("chr"):
                chrom = chrom.replace("chr", "")

            intron_coord_str = "%s:%s-%s" % (chrom, start_str, end_str)

            if intron_coord_str not in intron_dict:
                intron_dict[intron_coord_str] = {"left": {}, "right": {}}

            start = int(start_str)
            end = int(end_str)

            left_coord = (chrom, start - flanking_dist,
                          start + flanking_dist - 1)
            right_coord = (chrom, end - flanking_dist + 1,
                           end + flanking_dist)

            updateDictOfLists(left_region_coord2intron, left_coord,
                              intron_coord_str)
            updateDictOfLists(right_region_coord2intron, right_coord,
                              intron_coord_str)

            regions_set.add(left_coord)
            regions_set.add(right_coord)

    # Print out regions
    with open(out_coords_file, "w") as out_coords:
        for region_coord in regions_set:
            out_coords.write("%s\t%d\t%d\n" % (region_coord[0],
                                               region_coord[1],
                                               region_coord[2]))

    # Used to make unique name for tmp file in case a shared directory is
    # being used for runs.
    rand_num = random.randrange(1, 100000)

    # Get Read Counts
    print("Getting Counts in Region")
    cmd = "python %s --reads %s -l %d --coords %s -o %stmp%d.txt --read_assoc %s" % (
        COUNTING_SCRIPT, read_alignments, read_length, out_coords_file,
        out_dir, rand_num, out_read_assoc_file)
    print(cmd)
    os.system(cmd)

    # Remove the tmp file
    os.system("rm %stmp%d.txt" % (out_dir, rand_num))

    print("Getting Left and Right Counts")
    # Parse read_assoc_file to get information
    with open(out_read_assoc_file) as read_assoc_file:
        for line in read_assoc_file:
            line_list = formatLine(line).split("\t")

            read_start, read_end = getReadStartEnd(line_list[1])
            region_coord = getRegionCoord(line_list[2])

            # NOTE(review): the result of this call is not used below; the
            # call is kept because its side effects cannot be ruled out
            # from here.
            intron_coord_list = getIntronStartEnds(left_region_coord2intron,
                                                   right_region_coord2intron,
                                                   region_coord)

            # Both sides are keyed by the read end position.
            for side, region2introns in (("left", left_region_coord2intron),
                                         ("right", right_region_coord2intron)):
                if region_coord not in region2introns:
                    continue
                for intron_str in region2introns[region_coord]:
                    pos2count = intron_dict[intron_str][side]
                    pos2count[read_end] = pos2count.get(read_end, 0) + 1

    # Print output
    confident_ie_set = set()
    for intron_str in intron_dict:
        chrom, intron_start, intron_end = convertCoordStr(intron_str)

        left_count = 0
        right_count = 0

        # A side only counts when reads end at enough distinct positions.
        if len(intron_dict[intron_str]["left"]) >= offsets:
            left_count = getTotalCounts(intron_dict[intron_str]["left"])
            confident_ie_set.add("%s:%d-%d" % (chrom, intron_start - 1,
                                               intron_start))

        if len(intron_dict[intron_str]["right"]) >= offsets:
            right_count = getTotalCounts(intron_dict[intron_str]["right"])
            confident_ie_set.add("%s:%d-%d" % (chrom, intron_end,
                                               intron_end + 1))

        if left_count == 0 and right_count == 0:
            continue

        out_file.write("%s\t%d\t%d\n" % (intron_str, left_count, right_count))

    # Previously never closed; close explicitly so the counts are flushed
    # before the process exits.
    out_file.close()

    # Now print out confident set of ie
    for ie in confident_ie_set:
        confident_ie_file.write("%s\n" % ie)
    confident_ie_file.close()

    sys.exit(0)
def main():
    """Driver that splits a GTF by chromosome and launches database builds.

    With ``--initialize`` the GTF file is split into one temporary file per
    chromosome (``<tmp_dir>/<gtf prefix>_<chr>.gtf``) and ``SCRIPT`` is run
    once with ``--initialize`` to set up the database.

    Without ``--initialize`` every temporary file in ``tmp_dir`` whose name
    contains the GTF prefix is handed to ``SCRIPT`` (locally, via LSF, or
    only printed), optionally skipping chromosomes that ``checkChr`` reports
    as already present in the database.

    Always terminates the process through ``sys.exit(0)``.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option("--initialize",
                          dest="initialize",
                          action="store_true",
                          help="Will split up the gtf file into separate "
                               "temp files and initalize the database.",
                          default=False)
    opt_parser.add_option("--tmp_dir",
                          dest="tmp_dir",
                          type="string",
                          help="Directory to place temporary files and to "
                               "look for temporary files.",
                          default=None)
    opt_parser.add_option("--keep_temp",
                          dest="keep_temp",
                          action="store_true",
                          help="TEMP FILES ARE KEPT FOR NOW. Will keep the "
                               "temporary gtf files. Default is to delete "
                               "them.",
                          default=False)
    opt_parser.add_option("-g",
                          dest="gtf_file",
                          type="string",
                          help="GTF annotation file.",
                          default=None)
    opt_parser.add_option("--use_gene_name",
                          dest="use_gene_name",
                          action="store_true",
                          help="By default, the gene_id attribute will be "
                               "used for the gene name used in the "
                               "database, but the gene_name attribute can "
                               "be used instead.",
                          default=False)
    # A "-f" genome fasta option (to store exon/intron sequences) is
    # intentionally not offered for now.
    opt_parser.add_option("-d",
                          dest="db_name",
                          type="string",
                          help="Name of the new database",
                          default=None)
    opt_parser.add_option("--sqlite_db_dir",
                          dest="sqlite_db_dir",
                          type="string",
                          help="Location to put sqlite database. Default=%s" % DB_DIR,
                          default=DB_DIR)
    opt_parser.add_option("-p",
                          dest="num_processes",
                          type="int",
                          help="Will run getASEventReadCounts.py "
                               "simultaneously with this many samples. "
                               "Default=%d" % DEF_NUM_PROCESSES,
                          default=DEF_NUM_PROCESSES)
    opt_parser.add_option("--LSF",
                          dest="run_lsf",
                          action="store_true",
                          help="Will launch jobs on LSF. \n"
                               "Default is running on local.",
                          default=False)
    opt_parser.add_option("--force",
                          dest="force",
                          action="store_true",
                          help="By default, will check for the existence of "
                               "the final output before running commands. "
                               "This option will force all runs.",
                          default=False)
    opt_parser.add_option("--check",
                          dest="check",
                          action="store_true",
                          help="Will check samples that are not done and "
                               "print out which need to still be run",
                          default=False)
    opt_parser.add_option("--print_cmd",
                          dest="print_cmd",
                          action="store_true",
                          help="Will print commands that will be run, but "
                               "will not run them. Used for debugging.",
                          default=False)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("-g")
    opt_parser.check_required("--tmp_dir")
    opt_parser.check_required("-d")

    gtf_file_path = options.gtf_file
    tmp_dir = formatDir(options.tmp_dir)
    db_name = options.db_name
    sqlite_db_dir = options.sqlite_db_dir
    num_processes = options.num_processes
    run_lsf = options.run_lsf
    force = options.force
    check = options.check
    print_cmd = options.print_cmd

    # File name without directory, and that name without its last extension;
    # the prefix names (and later identifies) the per-chromosome temp files.
    gtf_file_name = gtf_file_path.split("/")[-1]
    gtf_file_prefix = ".".join(gtf_file_name.split(".")[:-1])

    ##############
    # INITIALIZE #
    ##############
    # If it's initializing, split gtf file and initialize database, then
    # return
    if options.initialize:
        chr2lines = {}

        with open(gtf_file_path) as gtf_file:
            for line in gtf_file:
                # Comment / header lines carry no chromosome column and
                # would otherwise each become their own "chromosome" file.
                if line.startswith("#"):
                    continue
                this_chr = line.split("\t")[0]
                updateDictOfLists(chr2lines, this_chr, line)

        for this_chr in chr2lines:
            tmp_chr_file_name = "%s/%s_%s.gtf" % (tmp_dir, gtf_file_prefix,
                                                  this_chr)
            with open(tmp_chr_file_name, "w") as tmp_chr_file:
                for line in chr2lines[this_chr]:
                    tmp_chr_file.write(line)

        # Now initialize the database
        cmd = "python %s " % SCRIPT
        cmd += "--initialize -d %s" % db_name
        os.system(cmd)

        sys.exit(0)

    ##################
    # BUILD DATABASE #
    ##################
    db = DB(sqlite_db_dir)

    # Use gtf file to figure out temp file names, Build the database from
    # them
    tmp_file_list = []
    for this_file in os.listdir(tmp_dir):
        if gtf_file_prefix in this_file:
            # Never treat the original GTF itself as a chromosome file.
            if this_file == gtf_file_name:
                continue
            tmp_file_list.append(this_file)

    # Now run script for every chromosome file
    ctr = 0
    for tmp_file in tmp_file_list:
        this_chr = getChr(tmp_dir + "/" + tmp_file)

        if (not force) or check:
            # For now, just checks that records exist in the database. It
            # is better to force since it is difficult to really know if a
            # chromosome was built or not.
            chr_built = checkChr(db, db_name, this_chr)

            if chr_built and not force:
                continue

            if check:
                # --check only reports; it never launches a build.
                if not chr_built:
                    print("Chromosome %s not built" % this_chr)
                continue

        ctr += 1

        cmd = "python %s " % SCRIPT
        cmd += "-g %s/%s " % (tmp_dir, tmp_file)
        cmd += "-d %s " % db_name
        if options.use_gene_name:
            cmd += "--use_gene_name "
        cmd += "--sqlite_db_dir %s" % sqlite_db_dir

        if print_cmd:
            print(cmd)
            continue

        if run_lsf:
            runLSF(cmd, "%s.build_DB.bsub.out" % this_chr,
                   this_chr + "build_DB", "hour")
            continue

        # Every num_processes-th command runs in the foreground (blocking);
        # the others are started in the background.
        if ctr % num_processes == 0:
            os.system(cmd)
        else:
            print(cmd)
            Popen(cmd, shell=True, executable=SHELL)

    # Temp files are currently always kept (see --keep_temp).
    sys.exit(0)
def main():
    """Driver that splits a GTF by chromosome and launches database builds.

    With ``--initialize`` the GTF file is split into one temporary file per
    chromosome (``<tmp_dir>/<gtf prefix>_<chr>.gtf``) and ``SCRIPT`` is run
    once with ``--initialize`` to set up the database.

    Without ``--initialize`` every temporary file in ``tmp_dir`` whose name
    contains the GTF prefix is handed to ``SCRIPT`` (locally, via LSF, or
    only printed), optionally skipping chromosomes that ``checkChr`` reports
    as already present in the database.

    Always terminates the process through ``sys.exit(0)``.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option(
        "--initialize",
        dest="initialize",
        action="store_true",
        help="Will split up the gtf file into separate temp files and "
             "initalize the database.",
        default=False)
    opt_parser.add_option(
        "--tmp_dir",
        dest="tmp_dir",
        type="string",
        help="Directory to place temporary files and to look for "
             "temporary files.",
        default=None)
    opt_parser.add_option(
        "--keep_temp",
        dest="keep_temp",
        action="store_true",
        help="TEMP FILES ARE KEPT FOR NOW. Will keep the temporary gtf "
             "files. Default is to delete them.",
        default=False)
    opt_parser.add_option(
        "-g",
        dest="gtf_file",
        type="string",
        help="GTF annotation file.",
        default=None)
    opt_parser.add_option(
        "--use_gene_name",
        dest="use_gene_name",
        action="store_true",
        help="By default, the gene_id attribute will be used for the gene "
             "name used in the database, but the gene_name attribute can "
             "be used instead.",
        default=False)
    # A "-f" genome fasta option (to store exon/intron sequences) is
    # intentionally not offered for now.
    opt_parser.add_option(
        "-d",
        dest="db_name",
        type="string",
        help="Name of the new database",
        default=None)
    opt_parser.add_option(
        "--sqlite_db_dir",
        dest="sqlite_db_dir",
        type="string",
        help="Location to put sqlite database. Default=%s" % DB_DIR,
        default=DB_DIR)
    opt_parser.add_option(
        "-p",
        dest="num_processes",
        type="int",
        help="Will run getASEventReadCounts.py simultaneously with this "
             "many samples. Default=%d" % DEF_NUM_PROCESSES,
        default=DEF_NUM_PROCESSES)
    opt_parser.add_option(
        "--LSF",
        dest="run_lsf",
        action="store_true",
        help="Will launch jobs on LSF. \nDefault is running on local.",
        default=False)
    opt_parser.add_option(
        "--force",
        dest="force",
        action="store_true",
        help="By default, will check for the existence of the final output "
             "before running commands. This option will force all runs.",
        default=False)
    opt_parser.add_option(
        "--check",
        dest="check",
        action="store_true",
        help="Will check samples that are not done and print out which "
             "need to still be run",
        default=False)
    opt_parser.add_option(
        "--print_cmd",
        dest="print_cmd",
        action="store_true",
        help="Will print commands that will be run, but will not run them. "
             "Used for debugging.",
        default=False)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("-g")
    opt_parser.check_required("--tmp_dir")
    opt_parser.check_required("-d")

    gtf_file_path = options.gtf_file
    tmp_dir = formatDir(options.tmp_dir)
    db_name = options.db_name
    sqlite_db_dir = options.sqlite_db_dir
    num_processes = options.num_processes
    run_lsf = options.run_lsf
    force = options.force
    check = options.check
    print_cmd = options.print_cmd

    # File name without directory, and that name without its last extension;
    # the prefix names (and later identifies) the per-chromosome temp files.
    gtf_file_name = gtf_file_path.split("/")[-1]
    gtf_file_prefix = ".".join(gtf_file_name.split(".")[:-1])

    ##############
    # INITIALIZE #
    ##############
    # If it's initializing, split gtf file and initialize database, then
    # return
    if options.initialize:
        chr2lines = {}

        with open(gtf_file_path) as gtf_file:
            for line in gtf_file:
                # Comment / header lines carry no chromosome column and
                # would otherwise each become their own "chromosome" file.
                if line.startswith("#"):
                    continue
                this_chr = line.split("\t")[0]
                updateDictOfLists(chr2lines, this_chr, line)

        for this_chr in chr2lines:
            tmp_chr_file_name = "%s/%s_%s.gtf" % (tmp_dir, gtf_file_prefix,
                                                  this_chr)
            with open(tmp_chr_file_name, "w") as tmp_chr_file:
                for line in chr2lines[this_chr]:
                    tmp_chr_file.write(line)

        # Now initialize the database
        cmd = "python %s " % SCRIPT
        cmd += "--initialize -d %s" % db_name
        os.system(cmd)

        sys.exit(0)

    ##################
    # BUILD DATABASE #
    ##################
    db = DB(sqlite_db_dir)

    # Use gtf file to figure out temp file names, Build the database from
    # them
    tmp_file_list = []
    for this_file in os.listdir(tmp_dir):
        if gtf_file_prefix in this_file:
            # Never treat the original GTF itself as a chromosome file.
            if this_file == gtf_file_name:
                continue
            tmp_file_list.append(this_file)

    # Now run script for every chromosome file
    ctr = 0
    for tmp_file in tmp_file_list:
        this_chr = getChr(tmp_dir + "/" + tmp_file)

        if (not force) or check:
            # For now, just checks that records exist in the database. It
            # is better to force since it is difficult to really know if a
            # chromosome was built or not.
            chr_built = checkChr(db, db_name, this_chr)

            if chr_built and not force:
                continue

            if check:
                # --check only reports; it never launches a build.
                if not chr_built:
                    print("Chromosome %s not built" % this_chr)
                continue

        ctr += 1

        cmd = "python %s " % SCRIPT
        cmd += "-g %s/%s " % (tmp_dir, tmp_file)
        cmd += "-d %s " % db_name
        if options.use_gene_name:
            cmd += "--use_gene_name "
        cmd += "--sqlite_db_dir %s" % sqlite_db_dir

        if print_cmd:
            print(cmd)
            continue

        if run_lsf:
            runLSF(cmd, "%s.build_DB.bsub.out" % this_chr,
                   this_chr + "build_DB", "hour")
            continue

        # Every num_processes-th command runs in the foreground (blocking);
        # the others are started in the background.
        if ctr % num_processes == 0:
            os.system(cmd)
        else:
            print(cmd)
            Popen(cmd, shell=True, executable=SHELL)

    # Temp files are currently always kept (see --keep_temp).
    sys.exit(0)
def main(): opt_parser = OptionParser() # Add Options. Required options should have default=None opt_parser.add_option("--in_prefix", dest="in_prefix", type="string", help="""Prefix of output files created from createAS_CountTables. In createAS_CountTables, this is the -o option""", default=None) # opt_parser.add_option("-i", # dest="input_file", # type="string", # help="Resulting file from clusterASExons2.py", # default=None) # opt_parser.add_option("--left_intron", # dest="left_input", # type="string", # help="""Resulting file from clusterASExons2.py, which # contains the exclusion and inclusion counts # for just the left side of an intron retention # event.""", # default=None) # opt_parser.add_option("--right_intron", # dest="right_input", # type="string", # help="""Resulting file from clusterASExons2.py, which # contains the exclusion and inclusion counts # for just the right side of an intron retention # event.""", # default=None) # opt_parser.add_option("--lenNormalized_counts", # dest="lenNormalized_counts", # type="string", # help="""File containing length-normalized inclusion # exclusion counts. Used for PSI calculation, # not for statistcal significance.""", # default=None) # opt_parser.add_option("--lenNormalized_left_intron", # dest="lenNormalized_left_intron_counts", # type="string", # help="""File containing length-normalized # the left intron_retention counts. # Used for PSI calculation, not for # statistical significane.""", # default=None) # opt_parser.add_option("--lenNormalized_right_intron", # dest="lenNormalized_right_intron_counts", # type="string", # help="""File containing length-normalized # the right intron_retention counts. 
# Used for PSI calculation, not for # for statistical significance.""", # default=None) opt_parser.add_option("--has_virtual", dest="has_virtual", action="store_true", help="""Gives flags that a virtual reference is being used.""", default=False) opt_parser.add_option("--jcn_seq_len", dest="jcn_seq_len", type="int", help="""Junction length. Used as an option in getASEventReadCounts.py""", default=None) opt_parser.add_option("--output_dir", dest="output_dir", type="string", help="Directory to place output files.", default=None) opt_parser.add_option("--out_prefix", dest="prefix", type="string", help="Prefix of all output files. DEF=None", default=None) # opt_parser.add_option("--psi_output_most_sign", # dest="psi_output", # type="string", # help="""Output file that will contain the PSI values # for all events and samples that are # signficantly spliced.""", # default=None) # opt_parser.add_option("--psi_output_sign_by_samp", # dest="psi_output_by_samp", # type="string", # help="""Output file that will contain the PSI values # for all events and samples that are # signficantly differentially spliced where # multiple testing is not done for all samples # tested against the virtual reference""", # default=None) # opt_parser.add_option("--all_psi_output", # dest="all_psi_output", # type="string", # help="""Output file that will contain the PSI values # for all events and samples that pass minimum # count thresholds""", # default=None) # opt_parser.add_option("--left_intron_all_psi_output", # dest="left_intron_all_psi_output", # type="string", # help="""Output file that will contain the PSI values # for the left side of intron retention # samples. Not required, but used for dPSI # thresholds when taking all splice events.""", # default=None) # opt_parser.add_option("--right_intron_all_psi_output", # dest="right_intron_all_psi_output", # type="string", # help="""Output file that will contain the PSI values # for the right side of intron retention # samples. 
Not required, but used for dPSI # thresholds when taking all splice events.""", # default=None) # opt_parser.add_option("--recalculate_ref_psi", # dest="recalculate_ref_psi", # action="store_true", # help="""The reference PSI given in input tables # should be recalculated due to changes in # thresholding for minimum input between # length-normalized and raw counts.""", # default=False) # opt_parser.add_option("--pval_output", # dest="pval_output", # type="string", # help="""Output file that will associate the # unadjusted and adjusted p-values for all # pairs that were tested.""", # default=None) # opt_parser.add_option("--event_sum", # dest="event_sum", # type="string", # help="""Output file that will contain the sum of the # exclusion and inclusion counts for every # sample that was considered signifcantly # affected.""", # default=None) opt_parser.add_option("--thresh", dest="threshold", type="int", help="""Threshold for minimum number of total reads in an event. Default=%d""" % DEF_THRESH, default=DEF_THRESH) opt_parser.add_option("--min_dpsi_threshold", dest="dpsi_threshold", type="float", help="""Threshold for minimum delta PSI value between the sample with the smallest and largest PSI. Events with dPSI values below the threshold will not be tested or reported. Def=%.2f""" % DEF_DPSI_THRESH, default=DEF_DPSI_THRESH) opt_parser.add_option( "--method", dest="method", type="string", help="""Correction Method: "BH" - Benjamini & Hochberg, "bonferroni". Must select these strings as the option""", default=None) opt_parser.add_option("--sign_cutoff", dest="sign_cutoff", type="float", help="""Cutoff of corrected p-value significance. Default=%.2f""" % DEF_SIGN_CUTOFF, default=DEF_SIGN_CUTOFF) opt_parser.add_option("--weights", dest="weights", type="string", help="""Comma separated list of weights given in the order of the samples in the table. Weights are used to create a weighted median. 
Default is equal weight for all samples.""", default=None) (options, args) = opt_parser.parse_args() # validate the command line arguments # opt_parser.check_required("-i") # opt_parser.check_required("--psi_output_most_sign") # opt_parser.check_required("--pval_output") # opt_parser.check_required("--event_sum") opt_parser.check_required("--method") opt_parser.check_required("--in_prefix") opt_parser.check_required("--out_prefix") opt_parser.check_required("--jcn_seq_len") in_prefix = options.in_prefix prefix = options.prefix try: input_file = open(in_prefix + "_AS_exclusion_inclusion_counts.txt") except: print( ("""Cannot find expected file %s_AS_exclusion_inclusion_counts.txt. Please check that the same options is given from combine_createAS_CountTables""" % prefix)) opt_parser.print_help() sys.exit(1) left_input_file_name = in_prefix + "_left_intron_counts.txt" right_input_file_name = in_prefix + "_right_intron_counts.txt" sum_thresh = options.threshold sign_cutoff = options.sign_cutoff dpsi_thresh = options.dpsi_threshold left_input_file = None right_input_file = None if left_input_file_name is None: print( "Warning: No intron retention file given as input. Will not calculate IR events." 
) else: left_input_file = open(left_input_file_name) right_input_file = open(right_input_file_name) output_dir = formatDir(options.output_dir) if not os.path.exists(output_dir): os.mkdir(output_dir) out_prefix = "%s/%s" % (output_dir, options.prefix) psi_out = open("%s_most_sign_PSI.txt" % out_prefix, "w") pval_out = open("%s_pairs_p_val.txt" % out_prefix, "w") has_virtual = options.has_virtual # Optional output files psi_out_by_samp = open("%s_sign_by_samp_PSI.txt" % out_prefix, "w") all_psi_output = open("%s_allPSI.txt" % out_prefix, "w") left_all_psi_output = open( "%s_left_intron_retention_allPSI.txt" % out_prefix, "w") right_all_psi_output = open( "%s_right_intron_retention_allPSI.txt" % out_prefix, "w") jcn_seq_len = options.jcn_seq_len recalculate_ref_psi = False lenNormalized_counts_event2PSIs = None lenNormalized_counts_event2total_counts = None # if options.lenNormalized_counts: # if ((not options.lenNormalized_left_intron_counts) or # (not options.lenNormalized_right_intron_counts)): # print "Need to specify all length-normalized count files." # opt_parser.print_help() # sys.exit(1) # recalculate_ref_psi = True lenNormalized_counts = open(in_prefix + "_AS_exclusion_inclusion_counts_lenNorm.txt") (lenNormalized_counts_event2total_counts, lenNormalized_counts_event2PSIs) = buildDicts(lenNormalized_counts) lenNormalized_counts.close() left_lenNormalized_counts_event2total_counts = None left_lenNormalized_counts_event2PSIs = None # if options.lenNormalized_left_intron_counts: # if ((not options.lenNormalized_counts) or # (not options.lenNormalized_right_intron_counts)): # print "Need to specify all length-normalized count files." 
# opt_parser.print_help() # sys.exit(1) left_lenNormalized_counts = open(in_prefix + "_left_intron_counts_lenNorm.txt") (left_lenNormalized_counts_event2total_counts, left_lenNormalized_counts_event2PSIs ) = buildDicts(left_lenNormalized_counts) left_lenNormalized_counts.close() right_lenNormalized_counts_event2total_counts = None right_lenNormalized_counts_event2PSIs = None # if options.lenNormalized_right_intron_counts: # if ((not options.lenNormalized_counts) or # (not options.lenNormalized_left_intron_counts)): # print "Need to specify all length-normalized count files." # opt_parser.print_help() # sys.exit(1) right_lenNormalized_counts = open(in_prefix + "_right_intron_counts_lenNorm.txt") (right_lenNormalized_counts_event2total_counts, right_lenNormalized_counts_event2PSIs ) = buildDicts(right_lenNormalized_counts) right_lenNormalized_counts.close() # if options.lenNormalized_counts: # if not jcn_seq_len: # print "If length normalized counts are specified, need to give jcn_seq_len" # opt_parser.print_help() # sys.exit(1) weights = None if options.weights: weights = list(map(float, options.weights.split(","))) # Use R limma package try: r.library("limma") except: print( """In order to use weighted median, please install the limma package from Bioconductor: http://www.bioconductor.org/packages/release/bioc/html/limma.html""" ) print( """In R:\nsource("http://bioconductor.org/biocLite.R")\nbiocLite("limma")""" ) event_sum = open("%s_event_sum.txt" % out_prefix, "w") if options.method != "BH" and options.method != "bonferroni": print("Wrong method indicated.") opt_parser.print_help() sys.exit(1) method_map = {"BH": "fdr_bh", "bonferroni": "bonferroni"} method = method_map[options.method] # {event_type:[pval]} event_type2pvals = {} # {event:(col1, col2):pval_idx} event2pairs2idx = {} # Additional pval holders tested by each sample against the reference # {event_type:col:[pval]} event_type2col2pvals = {} # {event:col:pval_idx} event2col2idx = {} # {event:{col:psi}} 
event2col2psi = {} # {event:{col:sum_counts}} event2col2sum = {} # For weighted median col2weights = None header = None total_samples = None for line in input_file: line = formatLine(line) if line.startswith("#"): header = line line_list = line.split("\t") samples = line_list[11:] total_samples = len(samples) if weights: if len(weights) != total_samples - 1: print("Weights for every sample needs to be given") opt_parser.print_help() sys.exit(1) col2weights = {} for i in range(1, total_samples): col2weights[i - 1] = weights[i - 1] continue line_list = line.split("\t") event = "\t".join(line_list[0:11]) counts = line_list[11:] # If the reference is NA, then do not calculate anything if counts[0] == NA: continue if has_virtual: # Cannot do a comparison when virtual reference is low expressed if lenNormalized_counts_event2total_counts[event][0] == NA: continue lenNormalized_psis = [None for i in range(len(counts))] if lenNormalized_counts_event2PSIs: try: lenNormalized_psis = lenNormalized_counts_event2PSIs[event] except: print(("Warning: Can't find event in lenNormalized psis: %s" % event)) continue event_type = getEventType(event) if event_type not in event_type2pvals: event_type2pvals[event_type] = [] if event_type not in event_type2col2pvals: event_type2col2pvals[event_type] = {} # Fill PSI dict for i in range(total_samples): (psi, sum_ct) = getPSI_sample_sum(counts[i], sum_thresh, lenNormalized_psis[i]) if event in event2col2psi: event2col2psi[event][i] = psi event2col2sum[event][i] = sum_ct else: event2col2psi[event] = {i: psi} event2col2sum[event] = {i: sum_ct} # Only psis in event2col2psi that passed the sum_thresh will be # present, for ref psi will be calculated from the median of the # existing values if recalculate_ref_psi and has_virtual: adj_psi, adj_totalCount = recalculateRefPSI( event2col2psi[event], lenNormalized_counts_event2total_counts[event], col2weights) event2col2psi[event][0] = adj_psi lenNormalized_counts_event2total_counts[event][0] = 
adj_totalCount if dPSI(event2col2psi[event]) < dpsi_thresh: for j in range(1, total_samples): if event in event2pairs2idx: event2pairs2idx[event][(0, j)] = NA else: event2pairs2idx[event] = {(0, j): NA} if event in event2col2idx: event2col2idx[event][j] = NA else: event2col2idx[event] = {j: NA} continue # Calculate p-val for intron retention later if event_type == "intron_retention": continue # Do pairwise comparisons with first column [col1_excl, col1_incl] = list(map(int, counts[0].split(";"))) if recalculate_ref_psi and has_virtual: # Need to also adjust relative counts based on new PSI col1_excl, col1_incl = adjustRefCounts( event, jcn_seq_len, lenNormalized_counts_event2total_counts[event][0], float(event2col2psi[event][0]), col1_excl, col1_incl) for j in range(1, total_samples): if j not in event_type2col2pvals[event_type]: event_type2col2pvals[event_type][j] = [] [col2_excl, col2_incl] = list(map(int, counts[j].split(";"))) # Both samples have to be non-zero if belowThreshold(sum_thresh, col1_excl, col1_incl, col2_excl, col2_incl): if event in event2pairs2idx: event2pairs2idx[event][(0, j)] = NA else: event2pairs2idx[event] = {(0, j): NA} if event in event2col2idx: event2col2idx[event][j] = NA else: event2col2idx[event] = {j: NA} continue cur_len = len(event_type2pvals[event_type]) cur_len2 = len(event_type2col2pvals[event_type][j]) if event in event2pairs2idx: event2pairs2idx[event][(0, j)] = cur_len else: event2pairs2idx[event] = {(0, j): cur_len} if event in event2col2idx: event2col2idx[event][j] = cur_len2 else: event2col2idx[event] = {j: cur_len2} _, raw_pval = scipy.stats.fisher_exact([[col1_excl, col1_incl], [col2_excl, col2_incl]]) event_type2pvals[event_type].append(raw_pval) updateDictOfLists(event_type2col2pvals[event_type], j, raw_pval) # Now calculate intron retention if left_input_file: left_events2counts = getIntronLeftRightCounts(left_input_file) right_events2counts = getIntronLeftRightCounts(right_input_file) else: left_events2counts = {} 
right_events2counts = {} if left_all_psi_output: left_all_psi_output.write(header + "\n") if right_all_psi_output: right_all_psi_output.write(header + "\n") for event in left_events2counts: if event not in right_events2counts: continue allPSI_elems_left = [] allPSI_elems_right = [] left_length = len(left_events2counts[event]) right_length = len(right_events2counts[event]) lenNormalized_left_psis = [None for i in range(left_length)] lenNormalized_right_psis = [None for i in range(right_length)] if left_lenNormalized_counts_event2PSIs: try: lenNormalized_left_psis = left_lenNormalized_counts_event2PSIs[ event] except: print(( "Warning: Could not find event in left_lenNormalized psis: %s" % event)) continue if right_lenNormalized_counts_event2PSIs: try: lenNormalized_right_psis = right_lenNormalized_counts_event2PSIs[ event] except: print(( "Warning: Could not find event in right_lenNormalized psis: %s" % event)) continue # Fill PSI dict for i in range(left_length): (psi, sum_ct) = getPSI_sample_sum(left_events2counts[event][i], sum_thresh, lenNormalized_left_psis[i]) allPSI_elems_left.append(psi) try: (psi, sum_ct) = getPSI_sample_sum(right_events2counts[event][i], sum_thresh, lenNormalized_right_psis[i]) except: pdb.set_trace() allPSI_elems_right.append(psi) # # Adding left and right PSI values # if left_col2_excl + left_col2_incl < sum_thresh: # allPSI_elems_left.append(NA) # else: # allPSI_elems_left.append(getPSI(left_col2_excl, left_col2_incl, # lenNormalized_left_psis[j])) # if right_col2_excl + right_col2_incl < sum_thresh: # allPSI_elems_right.append(NA) # else: # allPSI_elems_right.append(getPSI(right_col2_excl, # right_col2_incl, # lenNormalized_right_psis[j])) # Only psis in event2col2psi that passed the sum_thresh will be # present, for ref psi will be calculated from the median of the # existing values if recalculate_ref_psi and has_virtual: allPSI_elems_left[0] = recalculateRefPSI_list( allPSI_elems_left, col2weights) allPSI_elems_right[0] = 
recalculateRefPSI_list( allPSI_elems_right, col2weights) if dPSI(allPSI_elems_left) < dpsi_thresh or dPSI( allPSI_elems_right) < dpsi_thresh: for j in range(1, left_length): if event in event2pairs2idx: event2pairs2idx[event][(0, j)] = NA else: event2pairs2idx[event] = {(0, j): NA} if event in event2col2idx: event2col2idx[event][j] = NA else: event2col2idx[event] = {j: NA} continue [left_col1_excl, left_col1_incl ] = list(map(int, left_events2counts[event][0].split(";"))) [right_col1_excl, right_col1_incl ] = list(map(int, right_events2counts[event][0].split(";"))) if left_col1_excl + left_col1_incl < sum_thresh: continue # the reference must have a PSI if right_col1_excl + right_col1_incl < sum_thresh: continue # the reference must have a PSI # Adjust ref counts based on PSI if recalculate_ref_psi and has_virtual: left_col1_excl, left_col1_incl = adjustRefCounts( event, jcn_seq_len, left_lenNormalized_counts_event2total_counts[event][0], float(allPSI_elems_left[0]), left_col1_excl, left_col1_incl) right_col1_excl, right_col1_incl = adjustRefCounts( event, jcn_seq_len, right_lenNormalized_counts_event2total_counts[event][0], float(allPSI_elems_right[0]), right_col1_excl, right_col1_incl) for j in range(1, total_samples): [left_col2_excl, left_col2_incl ] = list(map(int, left_events2counts[event][j].split(";"))) [right_col2_excl, right_col2_incl ] = list(map(int, right_events2counts[event][j].split(";"))) if j not in event_type2col2pvals["intron_retention"]: event_type2col2pvals["intron_retention"][j] = [] # Both samples have to be non-zero if (belowThreshold(sum_thresh, left_col1_excl, left_col1_incl, left_col2_excl, left_col2_incl) or belowThreshold(sum_thresh, right_col1_excl, right_col1_incl, right_col2_excl, right_col2_incl)): if event in event2pairs2idx: event2pairs2idx[event][(0, j)] = NA else: event2pairs2idx[event] = {(0, j): NA} if event in event2col2idx: event2col2idx[event][j] = NA else: event2col2idx[event] = {j: NA} continue cur_len = 
len(event_type2pvals["intron_retention"]) cur_len2 = len(event_type2col2pvals["intron_retention"][j]) if event in event2pairs2idx: event2pairs2idx[event][(0, j)] = cur_len else: event2pairs2idx[event] = {(0, j): cur_len} if event in event2col2idx: event2col2idx[event][j] = cur_len2 else: event2col2idx[event] = {j: cur_len2} _, left_pval = scipy.stats.fisher_exact( [[left_col1_excl, left_col1_incl], [left_col2_excl, left_col2_incl]]) _, right_pval = scipy.stats.fisher_exact( [[right_col1_excl, right_col1_incl], [right_col2_excl, right_col2_incl]]) combined_pval = (left_pval + right_pval) - left_pval * right_pval event_type2pvals["intron_retention"].append(combined_pval) updateDictOfLists(event_type2col2pvals["intron_retention"], j, combined_pval) # All samples have been processed, now print to allPSI if left_all_psi_output: left_all_psi_output.write(event + "\t" + "\t".join(allPSI_elems_left) + "\n") if right_all_psi_output: right_all_psi_output.write(event + "\t" + "\t".join(allPSI_elems_right) + "\n") if left_all_psi_output: left_all_psi_output.close() if right_all_psi_output: right_all_psi_output.close() # All pairs have been evaluated, so now do multiple testing correction on # everything event_type2adjusted_pvals = {} event_type2col2adjusted_pvals = {} for event_type in event_type2pvals: event_type2adjusted_pvals[event_type] = list( multitest.multipletests(event_type2pvals[event_type], method=method)[1]) for event_type in event_type2col2pvals: event_type2col2adjusted_pvals[event_type] = {} for col in event_type2col2pvals[event_type]: event_type2col2adjusted_pvals[event_type][col] = list( multitest.multipletests(event_type2col2pvals[event_type][col], method=method)[1]) # Now go through all events and only consider those that are signficant psi_out.write(header + "\n") if psi_out_by_samp: psi_out_by_samp.write(header + "\n") if all_psi_output: all_psi_output.write(header + "\n") for event in event2pairs2idx: sign_cols = set([]) sign_cols2 = set([]) event_type = 
getEventType(event) for pair in event2pairs2idx[event]: this_idx = event2pairs2idx[event][pair] this_idx2 = event2col2idx[event][pair[1]] if this_idx == NA: continue outline = "%s\t%d\t%d\t%f" % ( event, pair[0], pair[1], event_type2pvals[event_type][this_idx]) if psi_out_by_samp: outline += "\t%f" % event_type2col2adjusted_pvals[event_type][ pair[1]][this_idx2] outline += "\t%f\n" % event_type2adjusted_pvals[event_type][ this_idx] pval_out.write(outline) if event_type2adjusted_pvals[event_type][this_idx] < sign_cutoff: sign_cols.add(pair[0]) sign_cols.add(pair[1]) if psi_out_by_samp: if event_type2col2adjusted_pvals[event_type][ pair[1]][this_idx2] < sign_cutoff: sign_cols2.add(pair[0]) sign_cols2.add(pair[1]) # Write out PSI for any significant samples # Significant across all samples if sign_cols != set([]): psi_vals = [] for i in range(total_samples): if i in sign_cols: psi_vals.append(event2col2psi[event][i]) else: psi_vals.append(NA) outline = "%s\t%s\n" % (event, "\t".join(psi_vals)) psi_out.write(outline) # Significant by samples if sign_cols2 != set([]): psi_vals = [] for i in range(total_samples): if i in sign_cols2: psi_vals.append(event2col2psi[event][i]) if event_sum: event_sum.write("%s\t%d\t%s\n" % (event, i, event2col2sum[event][i])) else: psi_vals.append(NA) outline = "%s\t%s\n" % (event, "\t".join(psi_vals)) psi_out_by_samp.write(outline) # Print all psi if all_psi_output: psi_vals = [] for i in range(total_samples): try: psi_vals.append(event2col2psi[event][i]) except: psi_vals.append(NA) outline = "%s\t%s\n" % (event, "\t".join(psi_vals)) all_psi_output.write(outline) psi_out.close() psi_out_by_samp.close() all_psi_output.close() pval_out.close() sys.exit(0)
def main():
    """Test each AS event of every sample against the reference sample.

    Reads the count tables that share the ``--in_prefix`` prefix
    (``_AS_exclusion_inclusion_counts.txt``, the left/right intron count
    tables and their ``_lenNorm`` counterparts).  The first sample column
    (column index 11 of the table) is treated as the reference; every other
    sample column ``j`` is compared to it with R's ``fisher.test`` on the
    exclusion/inclusion counts.  Intron retention events are tested on the
    left and the right side of the intron separately and the two p-values
    are combined.  P-values are then adjusted with R's ``p.adjust`` (``BH``
    or ``bonferroni``), once per event type over all pairs and once per
    event type and sample column.

    Output files, all named ``<output_dir>/<out_prefix>_*``:
    ``most_sign_PSI.txt``, ``sign_by_samp_PSI.txt``, ``allPSI.txt``,
    ``left_intron_retention_allPSI.txt``,
    ``right_intron_retention_allPSI.txt``, ``pairs_p_val.txt`` and
    ``event_sum.txt``.

    The function never returns: it ends with ``sys.exit(0)`` on success and
    ``sys.exit(1)`` after printing a message when the arguments or the input
    tables are unusable.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option("--in_prefix",
                          dest="in_prefix",
                          type="string",
                          help=("Prefix of output files created from "
                                "createAS_CountTables. In "
                                "createAS_CountTables, this is the -o option"),
                          default=None)
    opt_parser.add_option("--has_virtual",
                          dest="has_virtual",
                          action="store_true",
                          help=("Gives flags that a virtual reference is "
                                "being used."),
                          default=False)
    opt_parser.add_option("--jcn_seq_len",
                          dest="jcn_seq_len",
                          type="int",
                          help=("Junction length. Used as an option in "
                                "getASEventReadCounts.py"),
                          default=None)
    opt_parser.add_option("--output_dir",
                          dest="output_dir",
                          type="string",
                          help="Directory to place output files.",
                          default=None)
    opt_parser.add_option("--out_prefix",
                          dest="prefix",
                          type="string",
                          help="Prefix of all output files. DEF=None",
                          default=None)
    opt_parser.add_option("--thresh",
                          dest="threshold",
                          type="int",
                          help=("Threshold for minimum number of total reads "
                                "in an event. Default=%d") % DEF_THRESH,
                          default=DEF_THRESH)
    opt_parser.add_option("--min_dpsi_threshold",
                          dest="dpsi_threshold",
                          type="float",
                          help=("Threshold for minimum delta PSI value "
                                "between the sample with the smallest and "
                                "largest PSI. Events with dPSI values below "
                                "the threshold will not be tested or "
                                "reported. Def=%.2f") % DEF_DPSI_THRESH,
                          default=DEF_DPSI_THRESH)
    opt_parser.add_option("--method",
                          dest="method",
                          type="string",
                          help=('Correction Method: "BH" - Benjamini & '
                                'Hochberg, "bonferroni". Must select these '
                                'strings as the option'),
                          default=None)
    opt_parser.add_option("--sign_cutoff",
                          dest="sign_cutoff",
                          type="float",
                          help=("Cutoff of corrected p-value significance. "
                                "Default=%.2f") % DEF_SIGN_CUTOFF,
                          default=DEF_SIGN_CUTOFF)
    opt_parser.add_option("--weights",
                          dest="weights",
                          type="string",
                          help=("Comma separated list of weights given in "
                                "the order of the samples in the table. "
                                "Weights are used to create a weighted "
                                "median. Default is equal weight for all "
                                "samples."),
                          default=None)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("--method")
    opt_parser.check_required("--in_prefix")
    opt_parser.check_required("--out_prefix")
    opt_parser.check_required("--jcn_seq_len")

    in_prefix = options.in_prefix

    try:
        input_file = open(in_prefix + "_AS_exclusion_inclusion_counts.txt")
    except IOError:
        # Report the *input* prefix: that is the file that was looked for.
        print("Cannot find expected file %s_AS_exclusion_inclusion_counts.txt. "
              "Please check that the same options is given from "
              "combine_createAS_CountTables" % in_prefix)
        opt_parser.print_help()
        sys.exit(1)

    left_input_file_name = in_prefix + "_left_intron_counts.txt"
    right_input_file_name = in_prefix + "_right_intron_counts.txt"

    sum_thresh = options.threshold
    sign_cutoff = options.sign_cutoff
    dpsi_thresh = options.dpsi_threshold

    # Intron retention (IR) tables are optional: without both of them the
    # IR events are simply not tested.
    left_input_file = None
    right_input_file = None
    have_intron_counts = (os.path.exists(left_input_file_name) and
                          os.path.exists(right_input_file_name))
    if have_intron_counts:
        left_input_file = open(left_input_file_name)
        right_input_file = open(right_input_file_name)
    else:
        print("Warning: No intron retention file given as input. "
              " Will not calculate IR events.")

    output_dir = formatDir(options.output_dir)
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    out_prefix = "%s/%s" % (output_dir, options.prefix)

    psi_out = open("%s_most_sign_PSI.txt" % out_prefix, "w")
    pval_out = open("%s_pairs_p_val.txt" % out_prefix, "w")

    has_virtual = options.has_virtual

    psi_out_by_samp = open("%s_sign_by_samp_PSI.txt" % out_prefix, "w")
    all_psi_output = open("%s_allPSI.txt" % out_prefix, "w")
    left_all_psi_output = open(
        "%s_left_intron_retention_allPSI.txt" % out_prefix, "w")
    right_all_psi_output = open(
        "%s_right_intron_retention_allPSI.txt" % out_prefix, "w")

    jcn_seq_len = options.jcn_seq_len

    # The command line switch for this was retired; the guarded code paths
    # below are kept so it can be re-enabled in one place.
    recalculate_ref_psi = False

    lenNormalized_counts = open(
        in_prefix + "_AS_exclusion_inclusion_counts_lenNorm.txt")
    (lenNormalized_counts_event2total_counts,
     lenNormalized_counts_event2PSIs) = buildDicts(lenNormalized_counts)
    lenNormalized_counts.close()

    left_lenNormalized_counts_event2total_counts = None
    left_lenNormalized_counts_event2PSIs = None
    right_lenNormalized_counts_event2total_counts = None
    right_lenNormalized_counts_event2PSIs = None
    if have_intron_counts:
        left_lenNormalized_counts = open(
            in_prefix + "_left_intron_counts_lenNorm.txt")
        (left_lenNormalized_counts_event2total_counts,
         left_lenNormalized_counts_event2PSIs) = buildDicts(
             left_lenNormalized_counts)
        left_lenNormalized_counts.close()

        right_lenNormalized_counts = open(
            in_prefix + "_right_intron_counts_lenNorm.txt")
        (right_lenNormalized_counts_event2total_counts,
         right_lenNormalized_counts_event2PSIs) = buildDicts(
             right_lenNormalized_counts)
        right_lenNormalized_counts.close()

    weights = None
    if options.weights:
        weights = [float(w) for w in options.weights.split(",")]

        # Use R limma package
        try:
            r.library("limma")
        except Exception:
            # Best effort, as before: tell the user how to install limma
            # and carry on.
            print("In order to use weighted median, please install the "
                  "limma package from Bioconductor: "
                  "http://www.bioconductor.org/packages/release/bioc/html/limma.html")
            print("In R:\nsource(\"http://bioconductor.org/biocLite.R\")"
                  "\nbiocLite(\"limma\")")

    event_sum = open("%s_event_sum.txt" % out_prefix, "w")

    method = options.method
    if method != "BH" and method != "bonferroni":
        print("Wrong method indicated.")
        opt_parser.print_help()
        sys.exit(1)

    # R function handles, looked up once instead of once per test.
    fisher_test = robjects.r['fisher.test']
    p_adjust = robjects.r['p.adjust']

    def fisherPval(ref_excl, ref_incl, samp_excl, samp_incl):
        # 2x2 fisher.test of reference counts vs. sample counts; [0][0] is
        # the first value of the first component of the R result
        # (p.value in R's fisher.test).
        table = r.matrix(robjects.IntVector([ref_excl, ref_incl,
                                             samp_excl, samp_incl]),
                         nrow=2)
        return fisher_test(table)[0][0]

    # {event_type:[pval]}
    event_type2pvals = {}

    # {event:(col1, col2):pval_idx}
    event2pairs2idx = {}

    # Additional pval holders tested by each sample against the reference
    # {event_type:col:[pval]}
    event_type2col2pvals = {}
    # {event:col:pval_idx}
    event2col2idx = {}

    # {event:{col:psi}}
    event2col2psi = {}

    # {event:{col:sum_counts}}
    event2col2sum = {}

    # For weighted median
    col2weights = None

    header = None
    total_samples = None
    for line in input_file:
        line = formatLine(line)

        if line.startswith("#"):
            header = line
            total_samples = len(line.split("\t")[11:])
            if weights:
                if len(weights) != total_samples - 1:
                    print("Weights for every sample needs to be given")
                    opt_parser.print_help()
                    sys.exit(1)
                # {0: weights[0], 1: weights[1], ...}
                col2weights = dict(enumerate(weights))
            continue

        if header is None:
            # A data row before the header: the sample count is unknown.
            break

        line_list = line.split("\t")
        event = "\t".join(line_list[0:11])
        counts = line_list[11:]

        # If the reference is NA, then do not calculate anything
        if counts[0] == NA:
            continue

        if has_virtual:
            # Cannot do a comparison when virtual reference is low expressed
            if lenNormalized_counts_event2total_counts[event][0] == NA:
                continue

        lenNormalized_psis = [None] * len(counts)
        if lenNormalized_counts_event2PSIs:
            try:
                lenNormalized_psis = lenNormalized_counts_event2PSIs[event]
            except KeyError:
                print("Warning: Can't find event in lenNormalized psis: %s"
                      % event)
                continue

        event_type = getEventType(event)
        type_pvals = event_type2pvals.setdefault(event_type, [])
        type_col2pvals = event_type2col2pvals.setdefault(event_type, {})

        # Fill PSI dict
        for i in range(total_samples):
            (psi, sum_ct) = getPSI_sample_sum(counts[i], sum_thresh,
                                              lenNormalized_psis[i])
            event2col2psi.setdefault(event, {})[i] = psi
            event2col2sum.setdefault(event, {})[i] = sum_ct

        # Only psis in event2col2psi that passed the sum_thresh will be
        # present, for ref psi will be calculated from the median of the
        # existing values
        if recalculate_ref_psi and has_virtual:
            adj_psi, adj_totalCount = recalculateRefPSI(
                event2col2psi[event],
                lenNormalized_counts_event2total_counts[event],
                col2weights)
            event2col2psi[event][0] = adj_psi
            lenNormalized_counts_event2total_counts[event][0] = adj_totalCount

        if dPSI(event2col2psi[event]) < dpsi_thresh:
            # Not variable enough: record every pair as untested.
            for j in range(1, total_samples):
                event2pairs2idx.setdefault(event, {})[(0, j)] = NA
                event2col2idx.setdefault(event, {})[j] = NA
            continue

        # Calculate p-val for intron retention later
        if event_type == "intron_retention":
            continue

        # Do pairwise comparisons with first column
        [col1_excl, col1_incl] = map(int, counts[0].split(";"))

        if recalculate_ref_psi and has_virtual:
            # Need to also adjust relative counts based on new PSI
            col1_excl, col1_incl = adjustRefCounts(
                event, jcn_seq_len,
                lenNormalized_counts_event2total_counts[event][0],
                float(event2col2psi[event][0]),
                col1_excl, col1_incl)

        for j in range(1, total_samples):
            col_pvals = type_col2pvals.setdefault(j, [])

            [col2_excl, col2_incl] = map(int, counts[j].split(";"))

            # Both samples have to be non-zero
            if belowThreshold(sum_thresh, col1_excl, col1_incl,
                              col2_excl, col2_incl):
                event2pairs2idx.setdefault(event, {})[(0, j)] = NA
                event2col2idx.setdefault(event, {})[j] = NA
                continue

            # Remember where this pair's p-value will sit in each list so
            # the adjusted value can be found again after correction.
            event2pairs2idx.setdefault(event, {})[(0, j)] = len(type_pvals)
            event2col2idx.setdefault(event, {})[j] = len(col_pvals)

            raw_pval = fisherPval(col1_excl, col1_incl, col2_excl, col2_incl)

            type_pvals.append(raw_pval)
            updateDictOfLists(type_col2pvals, j, raw_pval)

    input_file.close()

    if header is None:
        print("Cannot find the header line (starting with #) in "
              "%s_AS_exclusion_inclusion_counts.txt" % in_prefix)
        sys.exit(1)

    # Now calculate intron retention
    if left_input_file:
        left_events2counts = getIntronLeftRightCounts(left_input_file)
        right_events2counts = getIntronLeftRightCounts(right_input_file)
        left_input_file.close()
        right_input_file.close()
    else:
        left_events2counts = {}
        right_events2counts = {}

    left_all_psi_output.write(header + "\n")
    right_all_psi_output.write(header + "\n")

    for event in left_events2counts:
        if event not in right_events2counts:
            continue

        allPSI_elems_left = []
        allPSI_elems_right = []

        left_length = len(left_events2counts[event])
        right_length = len(right_events2counts[event])

        lenNormalized_left_psis = [None] * left_length
        lenNormalized_right_psis = [None] * right_length
        if left_lenNormalized_counts_event2PSIs:
            try:
                lenNormalized_left_psis = \
                    left_lenNormalized_counts_event2PSIs[event]
            except KeyError:
                print("Warning: Could not find event in left_lenNormalized "
                      "psis: %s" % event)
                continue
        if right_lenNormalized_counts_event2PSIs:
            try:
                lenNormalized_right_psis = \
                    right_lenNormalized_counts_event2PSIs[event]
            except KeyError:
                print("Warning: Could not find event in right_lenNormalized "
                      "psis: %s" % event)
                continue

        # Fill PSI lists
        for i in range(left_length):
            (left_psi, _) = getPSI_sample_sum(left_events2counts[event][i],
                                              sum_thresh,
                                              lenNormalized_left_psis[i])
            allPSI_elems_left.append(left_psi)
            try:
                (right_psi, _) = getPSI_sample_sum(
                    right_events2counts[event][i],
                    sum_thresh,
                    lenNormalized_right_psis[i])
            except Exception:
                # Name the offending event, then fail loudly instead of
                # dropping into a debugger and recording a stale value.
                sys.stderr.write("Error: could not calculate right intron "
                                 "PSI for sample column %d of event: %s\n"
                                 % (i, event))
                raise
            allPSI_elems_right.append(right_psi)

        # Only psis that passed the sum_thresh will be present, for ref psi
        # will be calculated from the median of the existing values
        if recalculate_ref_psi and has_virtual:
            allPSI_elems_left[0] = recalculateRefPSI_list(allPSI_elems_left,
                                                          col2weights)
            allPSI_elems_right[0] = recalculateRefPSI_list(allPSI_elems_right,
                                                           col2weights)

        if (dPSI(allPSI_elems_left) < dpsi_thresh or
                dPSI(allPSI_elems_right) < dpsi_thresh):
            for j in range(1, left_length):
                event2pairs2idx.setdefault(event, {})[(0, j)] = NA
                event2col2idx.setdefault(event, {})[j] = NA
            continue

        [left_col1_excl, left_col1_incl] = map(
            int, left_events2counts[event][0].split(";"))
        [right_col1_excl, right_col1_incl] = map(
            int, right_events2counts[event][0].split(";"))

        if left_col1_excl + left_col1_incl < sum_thresh:
            continue  # the reference must have a PSI
        if right_col1_excl + right_col1_incl < sum_thresh:
            continue  # the reference must have a PSI

        # Adjust ref counts based on PSI
        if recalculate_ref_psi and has_virtual:
            left_col1_excl, left_col1_incl = adjustRefCounts(
                event, jcn_seq_len,
                left_lenNormalized_counts_event2total_counts[event][0],
                float(allPSI_elems_left[0]),
                left_col1_excl, left_col1_incl)
            right_col1_excl, right_col1_incl = adjustRefCounts(
                event, jcn_seq_len,
                right_lenNormalized_counts_event2total_counts[event][0],
                float(allPSI_elems_right[0]),
                right_col1_excl, right_col1_incl)

        # The main table may not have registered this event type (e.g. all
        # of its intron retention rows were skipped), so create the holders
        # on demand instead of assuming they exist.
        ir_pvals = event_type2pvals.setdefault("intron_retention", [])
        ir_col2pvals = event_type2col2pvals.setdefault("intron_retention", {})

        for j in range(1, total_samples):
            [left_col2_excl, left_col2_incl] = map(
                int, left_events2counts[event][j].split(";"))
            [right_col2_excl, right_col2_incl] = map(
                int, right_events2counts[event][j].split(";"))

            ir_col_pvals = ir_col2pvals.setdefault(j, [])

            # Both samples have to be non-zero
            if (belowThreshold(sum_thresh, left_col1_excl, left_col1_incl,
                               left_col2_excl, left_col2_incl) or
                    belowThreshold(sum_thresh, right_col1_excl,
                                   right_col1_incl, right_col2_excl,
                                   right_col2_incl)):
                event2pairs2idx.setdefault(event, {})[(0, j)] = NA
                event2col2idx.setdefault(event, {})[j] = NA
                continue

            event2pairs2idx.setdefault(event, {})[(0, j)] = len(ir_pvals)
            event2col2idx.setdefault(event, {})[j] = len(ir_col_pvals)

            left_pval = fisherPval(left_col1_excl, left_col1_incl,
                                   left_col2_excl, left_col2_incl)
            right_pval = fisherPval(right_col1_excl, right_col1_incl,
                                    right_col2_excl, right_col2_incl)

            # Equals 1 - (1 - left_pval) * (1 - right_pval).
            combined_pval = (left_pval + right_pval) - left_pval * right_pval

            ir_pvals.append(combined_pval)
            updateDictOfLists(ir_col2pvals, j, combined_pval)

        # All samples have been processed, now print to allPSI
        left_all_psi_output.write(
            event + "\t" + "\t".join(allPSI_elems_left) + "\n")
        right_all_psi_output.write(
            event + "\t" + "\t".join(allPSI_elems_right) + "\n")

    left_all_psi_output.close()
    right_all_psi_output.close()

    # All pairs have been evaluated, so now do multiple testing correction on
    # everything
    event_type2adjusted_pvals = {}
    event_type2col2adjusted_pvals = {}

    for event_type in event_type2pvals:
        event_type2adjusted_pvals[event_type] = p_adjust(
            robjects.FloatVector(event_type2pvals[event_type]), method)

    for event_type in event_type2col2pvals:
        event_type2col2adjusted_pvals[event_type] = {}
        for col in event_type2col2pvals[event_type]:
            event_type2col2adjusted_pvals[event_type][col] = p_adjust(
                robjects.FloatVector(event_type2col2pvals[event_type][col]),
                method)

    # Now go through all events and only consider those that are significant
    psi_out.write(header + "\n")
    psi_out_by_samp.write(header + "\n")
    all_psi_output.write(header + "\n")

    for event in event2pairs2idx:
        sign_cols = set()   # significant after correction over all pairs
        sign_cols2 = set()  # significant after per-sample correction

        event_type = getEventType(event)

        for pair in event2pairs2idx[event]:
            this_idx = event2pairs2idx[event][pair]
            this_idx2 = event2col2idx[event][pair[1]]
            if this_idx == NA:
                continue

            adj_pval = event_type2adjusted_pvals[event_type][this_idx]
            adj_pval_by_samp = \
                event_type2col2adjusted_pvals[event_type][pair[1]][this_idx2]

            # event, col1, col2, raw p, per-sample adjusted p, adjusted p
            pval_out.write("%s\t%d\t%d\t%f\t%f\t%f\n" % (
                event, pair[0], pair[1],
                event_type2pvals[event_type][this_idx],
                adj_pval_by_samp,
                adj_pval))

            if adj_pval < sign_cutoff:
                sign_cols.update(pair)
            if adj_pval_by_samp < sign_cutoff:
                sign_cols2.update(pair)

        # Write out PSI for any significant samples

        # Significant across all samples
        if sign_cols:
            psi_vals = []
            for i in range(total_samples):
                if i in sign_cols:
                    psi_vals.append(event2col2psi[event][i])
                else:
                    psi_vals.append(NA)
            psi_out.write("%s\t%s\n" % (event, "\t".join(psi_vals)))

        # Significant by samples
        if sign_cols2:
            psi_vals = []
            for i in range(total_samples):
                if i in sign_cols2:
                    psi_vals.append(event2col2psi[event][i])
                    event_sum.write("%s\t%d\t%s\n" % (
                        event, i, event2col2sum[event][i]))
                else:
                    psi_vals.append(NA)
            psi_out_by_samp.write("%s\t%s\n" % (event, "\t".join(psi_vals)))

        # Print all psi
        psi_vals = []
        for i in range(total_samples):
            try:
                psi_vals.append(event2col2psi[event][i])
            except KeyError:
                psi_vals.append(NA)
        all_psi_output.write("%s\t%s\n" % (event, "\t".join(psi_vals)))

    psi_out.close()
    psi_out_by_samp.close()
    all_psi_output.close()
    pval_out.close()
    event_sum.close()

    sys.exit(0)
def main():
    """Count reads around the exon/intron junctions of a set of introns.

    Command-line driven.  The steps are:

    1. Read intron coordinates from either a tab-delimited file (``-i``) or
       a BED file (``-b``) and derive, for every intron, a "left" and a
       "right" region of ``flanking_dist`` bases around each intron
       boundary.
    2. Write the unique regions to
       ``<out_dir>/<prefix>_intron_exon_junction_coords.out`` and run
       COUNTING_SCRIPT on them, asking it to write read/region associations
       to ``<out_dir>/<prefix>_intron_exon_junction_coords_w_read.out``.
    3. For every intron, tally reads per distinct read end position on each
       side.  A side is only counted when it has at least ``offsets``
       distinct positions.
    4. Write per-intron left/right totals to
       ``<out_dir>/<prefix>_intron_exon_junction_counts.txt`` and the set of
       confident junction coordinates to
       ``<out_dir>/<prefix>_confident_ie.txt``.

    Exits with status 1 on invalid arguments or when the counting script
    fails, and with status 0 on success.
    """
    # Local import: only this function needs it.
    import subprocess

    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option("-i",
                          dest="intron_coords",
                          type="string",
                          help="File of intron coordinates. "
                               "Format: type, chr, strand, start, end",
                          default=None)
    opt_parser.add_option("-b",
                          dest="bed_intron_coords",
                          type="string",
                          help="BED file of intron coordinates.",
                          default=None)
    opt_parser.add_option("-a",
                          dest="read_alignments",
                          type="string",
                          help="File of alignments to genome. "
                               "Format: chr, start, strand",
                          default=None)
    opt_parser.add_option("-f",
                          dest="flanking_dist",
                          type="int",
                          help="Distance away from exon intron junction "
                               "to check for reads in.",
                          default=None)
    opt_parser.add_option("-o",
                          dest="offsets",
                          type="int",
                          help="Minimum number of offsets required at each "
                               "exon/intron junction. Default=1",
                          default=1)
    # NOTE(review): -l is checked as required below but carries default=1;
    # presumably check_required can never fail for it -- confirm.
    opt_parser.add_option("-l",
                          dest="read_length",
                          type="int",
                          help="Length of the reads.",
                          default=1)
    opt_parser.add_option("--out_dir",
                          dest="out_dir",
                          type="string",
                          help="Output files are put here.",
                          default=None)
    opt_parser.add_option("--out_prefix",
                          dest="prefix",
                          type="string",
                          help="Prefix attached to all output files.",
                          default=None)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    for required_opt in ("-a", "-f", "-l", "--out_dir", "--out_prefix"):
        opt_parser.check_required(required_opt)

    # Check that the COUNTING_SCRIPT path is valid
    if not os.path.exists(COUNTING_SCRIPT):
        print("Please change COUNTING_SCRIPT path.")
        opt_parser.print_help()
        sys.exit(1)

    if options.intron_coords and options.bed_intron_coords:
        print("Only one type of intron coord can be used as input.")
        opt_parser.print_help()
        sys.exit(1)

    if (not options.intron_coords) and (not options.bed_intron_coords):
        print(" Need to specify intron coordinates. See options -i or -b")
        opt_parser.print_help()
        sys.exit(1)

    intron_coords = None
    isBedFormat = False
    if options.intron_coords:
        intron_coords = open(options.intron_coords)
    if options.bed_intron_coords:
        intron_coords = open(options.bed_intron_coords)
        isBedFormat = True

    read_alignments = options.read_alignments
    read_length = options.read_length
    flanking_dist = options.flanking_dist
    offsets = options.offsets
    prefix = options.prefix

    out_dir = options.out_dir
    if not out_dir.endswith("/"):
        out_dir += "/"

    if not os.path.exists(out_dir):
        print("Output directory does not exist")
        intron_coords.close()
        sys.exit(1)

    # Intermediate Output Files
    out_coords_file = out_dir + prefix + "_intron_exon_junction_coords.out"
    out_coords = open(out_coords_file, "w")

    out_read_assoc_file = (out_dir + prefix +
                           "_intron_exon_junction_coords_w_read.out")

    # Final output
    out_file_name = out_dir + prefix + "_intron_exon_junction_counts.txt"
    out_file = open(out_file_name, "w")

    confident_ie_name = out_dir + prefix + "_confident_ie.txt"
    confident_ie_file = open(confident_ie_name, "w")

    # Reverse mappings from a flanking region to the introns it belongs to:
    # {(chr, start, end): [intron_coord_str, ...]}
    # "left" and "right" being the region at the left or right side of the
    # intron, around the exon/intron junction
    left_region_coord2intron = {}
    right_region_coord2intron = {}

    # {intron_coord_str: {"left": {pos: count},
    #                     "right": {pos: count}}}
    intron_dict = {}

    regions_set = set()

    for line in intron_coords:
        line = formatLine(line)

        # Blank lines carry no coordinates and would fail to unpack below.
        if not line.strip():
            continue

        if isBedFormat:
            if line.startswith("track"):
                continue
            chrom, start_str, end_str = parseBEDLine(line)
        else:
            # Only chromosome, start and end are used from this format.
            _type, chrom, _strand, start_str, end_str = line.split("\t")

        if chrom.startswith("chr"):
            chrom = chrom.replace("chr", "")

        intron_coord_str = "%s:%s-%s" % (chrom, start_str, end_str)

        if intron_coord_str not in intron_dict:
            intron_dict[intron_coord_str] = {"left": {}, "right": {}}

        start = int(start_str)
        end = int(end_str)

        left_coord = (chrom, start - flanking_dist, start + flanking_dist - 1)
        right_coord = (chrom, end - flanking_dist + 1, end + flanking_dist)

        updateDictOfLists(left_region_coord2intron, left_coord,
                          intron_coord_str)
        updateDictOfLists(right_region_coord2intron, right_coord,
                          intron_coord_str)

        regions_set.add(left_coord)
        regions_set.add(right_coord)

    intron_coords.close()

    # Print out regions to out_coords
    for region_coord in regions_set:
        out_coords.write("%s\t%d\t%d\n" % (region_coord[0],
                                           region_coord[1],
                                           region_coord[2]))
    out_coords.close()

    # Used to make unique name for tmp file in case a shared directory is
    # being used for runs.
    rand_num = random.randrange(1, 100000)
    tmp_out_file = "%stmp%d.txt" % (out_dir, rand_num)

    # Get Read Counts
    print("Getting Counts in Region")
    # Passed as an argument list (no shell) so that paths containing spaces
    # or shell metacharacters reach the counting script intact.
    cmd_args = ["python", COUNTING_SCRIPT,
                "--reads", read_alignments,
                "-l", str(read_length),
                "--coords", out_coords_file,
                "-o", tmp_out_file,
                "--read_assoc", out_read_assoc_file]
    print(" ".join(cmd_args))
    exit_status = subprocess.call(cmd_args)

    # Remove the tmp file; only the read association file is used below.
    if os.path.exists(tmp_out_file):
        os.remove(tmp_out_file)

    if exit_status != 0:
        # Without this check a stale or missing read association file would
        # be parsed as if the counting step had succeeded.
        print("Counting script failed with exit status %d" % exit_status)
        out_file.close()
        confident_ie_file.close()
        sys.exit(1)

    print("Getting Left and Right Counts")
    # Parse read_assoc_file to get information
    read_assoc_file = open(out_read_assoc_file)

    for line in read_assoc_file:
        line = formatLine(line)

        line_list = line.split("\t")

        read_start, read_end = getReadStartEnd(line_list[1])
        region_coord = getRegionCoord(line_list[2])

        # NOTE(review): the result is never used; kept only in case the
        # helper validates its input -- confirm and remove if it is pure.
        intron_coord_list = getIntronStartEnds(left_region_coord2intron,
                                               right_region_coord2intron,
                                               region_coord)

        # Both sides are keyed by the read's end position, so each distinct
        # end position counts as one offset.
        if region_coord in left_region_coord2intron:
            for intron_str in left_region_coord2intron[region_coord]:
                left_counts = intron_dict[intron_str]["left"]
                left_counts[read_end] = left_counts.get(read_end, 0) + 1

        if region_coord in right_region_coord2intron:
            for intron_str in right_region_coord2intron[region_coord]:
                right_counts = intron_dict[intron_str]["right"]
                right_counts[read_end] = right_counts.get(read_end, 0) + 1

    read_assoc_file.close()

    # Print output
    confident_ie_set = set()
    for intron_str in intron_dict:
        chrom, intron_start, intron_end = convertCoordStr(intron_str)

        # Get left counts; a side needs at least `offsets` distinct read
        # positions to be considered.
        if len(intron_dict[intron_str]["left"]) >= offsets:
            left_count = getTotalCounts(intron_dict[intron_str]["left"])
            confident_ie_set.add("%s:%d-%d" % (chrom,
                                               intron_start - 1,
                                               intron_start))
        else:
            left_count = 0

        # Get right counts
        if len(intron_dict[intron_str]["right"]) >= offsets:
            right_count = getTotalCounts(intron_dict[intron_str]["right"])
            confident_ie_set.add("%s:%d-%d" % (chrom,
                                               intron_end,
                                               intron_end + 1))
        else:
            right_count = 0

        if left_count == 0 and right_count == 0:
            continue

        out_file.write("%s\t%d\t%d\n" % (intron_str, left_count, right_count))

    out_file.close()

    # Now print out confident set of ie
    for ie in confident_ie_set:
        confident_ie_file.write("%s\n" % ie)

    confident_ie_file.close()

    sys.exit(0)