# Junction fasta that the bowtie index is built from; reused when it already exists
fasta_for_bowtie_index_name = out_dir + stem + "_SPORK_preprocess_Junctions.fa"
gtfs = generate_gtfs(gtf)
reuse_prior_jcts = use_prior and os.path.isfile(fasta_for_bowtie_index_name)
if reuse_prior_jcts:
    write_time("Using prior jcts: " + fasta_for_bowtie_index_name, time.time(), timer_file_path)
else:
    # Index every 5' mapping by its base_read_id (the read_id with the
    # "/..." suffix removed). A base_read_id is expected to occur at most once.
    id_to_sam_dict = {}
    with open(five_prime_mapped_name, "r") as five_prime_handle:
        past_header = False
        for five_prime_line in five_prime_handle:
            # Only the leading run of "@" lines is treated as the header
            if not past_header:
                if five_prime_line[0] == "@":
                    continue
                past_header = True

            sam_entry = SAMEntry(five_prime_line)
            base_read_id = sam_entry.read_id.split("/")[0]
            if base_read_id in id_to_sam_dict:
                sys.stderr.write("ERROR: Found duplicate base_read_id in 5_prime mappings\n")
                sys.exit(1)

            # Drop the strange chromosomes (e.g. chrUn_gl000220)
            if "_" in sam_entry.chromosome:
                continue
            id_to_sam_dict[base_read_id] = sam_entry

    # Next, walk the 3' mappings and build bin pairs for every shared id
    bin_pairs = []
    with open(three_prime_mapped_name, "r") as three_prime_mapped:
        sam_line = three_prime_mapped.readline()