def process_bam(insertion_type, bam, index):
    """Collect reads around candidate insertion sites from a BAM file.

    Parses the insertion index with the parser matching ``insertion_type``
    ("melt" or "tipseqhunter", case-insensitive), gathers the reads that fall
    inside each insertion's target region, then scans the whole BAM once more
    to pick up the mates of those reads and track the longest read seen.

    Returns:
        tuple: (insertions sorted by chromosome, reads dictionary,
        longest read length observed).

    Raises:
        TypeError: if ``insertion_type`` is not a recognized caller name.
    """
    kind = insertion_type.lower()
    if kind == "melt":
        input_list = melt_parser(index)
    elif kind == "tipseqhunter":
        input_list = tipseq_parser(index)
    else:
        raise TypeError(
            "{} is not recognized; Should be MELT or TIPseqHunter".format(
                insertion_type
            )
        )

    logging.info("### Processing BAM File ###")
    bam_handler = BamHandler(bam)
    insertions = []
    reads_dictionary = ReadsDict()

    # Pass 1: reads that overlap each insertion's target region.
    logging.info(" - Finding Reads in Target Regions")
    for parsed_input in input_list:
        insertion = Insertion(parsed_input)
        region_reads = bam_handler.fetch_reads_in_region(insertion)
        reads_dictionary += region_reads
        insertion.read_keys_in_target_region = region_reads.keys()
        insertions.append(insertion)
    logging.info(" --- DONE ---")

    # Pass 2: full scan to find the paired ends of the collected reads
    # and the longest read length in the file.
    logging.info(" - Finding Paired Ends")
    longest_read = 0
    for read in bam_handler.all_reads():
        if read.query_length > longest_read:
            longest_read = read.query_length
        if read.query_name in reads_dictionary:
            reads_dictionary.insert(read)
    logging.info(" --- DONE ---")

    logging.info("### Processing Insertions ###")
    insertions.sort(key=lambda ins: ins.chromosome)
    return insertions, reads_dictionary, longest_read
def main(args):
    """Run the full TranspoScope pipeline for one sample.

    Steps: validate input paths, create the output folder layout, read the
    insertion index, collect target-region reads and their mates from the
    BAM, realign each insertion's reads against a generated FASTA, classify
    the reads, emit a BED track and JSON tables, and build the report
    website. Optionally removes intermediate files at the end.

    Args:
        args: parsed CLI namespace; this function reads args.group1,
            args.group2, args.sample_id, args.bam, args.index,
            args.me_reference, args.host_reference, args.genes and
            args.keep_files.
    """
    group1 = args.group1
    group2 = args.group2
    sample_id = args.sample_id
    bam_path = args.bam
    insertion_list_path = args.index
    me_ref_path = args.me_reference
    host_ref_path = args.host_reference
    genes_file_path = args.genes
    keep_files = args.keep_files
    # config = get_config_from_file()
    # TODO - allow for multiple labels
    #  - eg : pos, unlabeled - pos - negative, pos
    # TODO - make the reference subdirectories using the writer class
    check_paths(
        me_ref_path,
        host_ref_path,
        bam_path,
        insertion_list_path,
        genes_file_path,
    )
    output_folder_path = os.path.join(os.getcwd(), "output")
    (
        reference_path,
        transposcope_path,
        track_path,
    ) = create_output_folder_structure(
        output_folder_path, group1, group2, sample_id
    )
    setup_logging()
    logging.info("*** TranspoScope ***")
    logging.info("### Input ###")
    logging.info(
        " - Index File Path: {}".format(os.path.abspath(insertion_list_path))
    )
    logging.info(" - BAM File Path: {}".format(os.path.abspath(bam_path)))
    logging.info(
        " - Mobile Element Reference File Path: {}".format(
            os.path.abspath(me_ref_path)
        )
    )
    logging.info(
        " - Host Genome Folder Path: {}".format(os.path.abspath(host_ref_path))
    )
    logging.info(
        " - refFlat.txt Path: {}".format(
            os.path.abspath(genes_file_path) if genes_file_path else "undefined"
        )
    )
    logging.info(" - Group 1: {}".format(group1))
    logging.info(" - Group 2: {}".format(group2))
    logging.info(" - Keep Intermediate Files: {}".format(keep_files))
    insertion_sites_reader = InsertionSiteReader(insertion_list_path)
    logging.info("### Processing BAM File ###")
    bam_handler = BamHandler(bam_path)
    fasta_handler = FastaHandler(me_ref_path, host_ref_path)
    insertions = []
    # Reads are bucketed by insertion orientation (THREE_PRIME True/False),
    # so each orientation is realigned against its own read set.
    reads_dictionary = {True: ReadsDict(), False: ReadsDict()}
    logging.info(" - Finding Reads in Target Regions")
    for insertion_stats in insertion_sites_reader.read_lines():
        temp_insertion = Insertion(named_tuple=insertion_stats)
        reads_in_region = bam_handler.fetch_reads_in_region(temp_insertion)
        reads_dictionary[temp_insertion.THREE_PRIME] += reads_in_region
        temp_insertion.read_keys_in_target_region = reads_in_region.keys()
        insertions.append(temp_insertion)
    logging.info(" --- DONE ---")
    # Second BAM pass: pick up the mates of every read collected above.
    logging.info(" - Finding Paired Ends")
    for read in bam_handler.all_reads():
        if read.query_name in reads_dictionary[True]:
            reads_dictionary[True].insert(read)
        if read.query_name in reads_dictionary[False]:
            reads_dictionary[False].insert(read)
    logging.info(" --- DONE ---")
    logging.info("### Processing Insertions ###")
    file_writer = FileWriter()
    realigner = Realigner(reference_path)
    classifier = ReadClassifier(transposcope_path)
    if genes_file_path:
        gene_handler = GeneHandler(genes_file_path)
    # NOTE(review): sorting by CHROMOSOME as a string orders "chr10" before
    # "chr2" — confirm whether natural chromosome order is expected here.
    insertions.sort(key=lambda x: x.CHROMOSOME)
    heading_table = {"Heading": ("ID", "Gene", "Probability"), "data": []}
    # Progress is logged roughly every 10% of insertions processed.
    ten_percent = len(insertions) / 10
    next_log = ten_percent
    completed = 0
    for insertion in insertions:
        file_name = "{i.CHROMOSOME}_{i.START}-{i.END}".format(i=insertion)
        insertion.fasta_string = fasta_handler.generate_fasta_sequence(
            insertion
        )
        fasta_path = file_writer.write_fasta(
            reference_path,
            file_name,
            insertion.fasta_string,
            ">{i.CHROMOSOME}_{i.START}-{i.END}\n".format(i=insertion),
        )
        fastq1_path, fastq2_path = file_writer.write_fastq(
            reference_path,
            reads_dictionary[insertion.THREE_PRIME],
            file_name,
            insertion.read_keys_in_target_region,
        )
        sorted_bam_path = realigner.realign(
            fasta_path, fastq1_path, fastq2_path, file_name
        )
        # TODO - add the option to not add gene information
        if genes_file_path:
            gene_info = gene_handler.find_nearest_gene(
                insertion.CHROMOSOME, insertion.INSERTION_SITE
            )
        else:
            gene_info = ("undefined", "rgb(3, 119, 190)")
        # heading_table['Data']\
        if insertion.THREE_PRIME:
            end = "3"
        else:
            end = "5"
        heading_table["data"].append(
            [
                "{}-{}({})".format(
                    insertion.CHROMOSOME, insertion.CLIP_START, end
                ),
                gene_info,
                "{:.2f}".format(insertion.PRED),
            ]
        )
        classifier.classify_insertion(insertion, sorted_bam_path)
        completed += 1
        if completed > next_log:
            logging.info(
                " - Percentage of insertions processed: {:.2%}.".format(
                    completed / len(insertions)
                )
            )
            next_log += ten_percent
    # TODO - write out index file
    logging.info(" --- DONE ---")
    if not keep_files:
        logging.info("### Cleanup ###")
        logging.info(
            "Cleaning up generated files in {}".format(reference_path)
        )
        # NOTE(review): this checks reference_path but removes its PARENT
        # directory — verify that the parent holds only intermediate files.
        if os.path.exists(reference_path):
            shutil.rmtree(os.path.dirname(reference_path))
        logging.info(" --- DONE ---")
    logging.info("### Building Bed File ###")
    with open(os.path.join(track_path, "{}.bb".format(sample_id)), "w") as fh:
        for insertion in insertions:
            fh.write(
                "{}\t{}\t{}\n".format(
                    insertion.CHROMOSOME,
                    insertion.CLIP_START,
                    insertion.CLIP_END,
                )
            )
    logging.info(" --- DONE ---")
    logging.info("### Building Website ###")
    web_dir = os.path.join(os.getcwd(), "web")
    logging.info(" The website is being built into: {}".format(web_dir))
    file_writer.write_json(
        os.path.join(transposcope_path, "table_info"),
        heading_table,
    )
    # Manifest lists the generated JSON files so the web UI can find them.
    tree, found_table = build_tree(os.path.join(web_dir, "json"))
    with open(os.path.join(web_dir, "manifest.json"), "w") as json_file:
        json.dump(tree["json"], json_file)
    logging.info(" --- DONE ---")
    logging.info(
        " to view the generated files in your browser run:\n\t\t'transposcope view {}'".format(
            web_dir
        )
    )
def test_incremental_add(self):
    """ReadsDict supports += with plain dicts and merges the entries."""
    merged = ReadsDict()
    merged += {"a": 1}
    merged += {"b": 2, "c": 3, "d": 4}
    self.assertDictEqual(merged, {"a": 1, "b": 2, "c": 3, "d": 4})
def test_insert_read1_with_empty_list(self):
    """Inserting into a key whose entry is an empty list raises ValueError."""
    store = ReadsDict({"a": []})
    self.assertRaises(ValueError, store.insert, Read("a", True, False, 0))
def test_insert_read2_with_both_reads_present(self):
    """insert returns -1 when both mates are already stored under the key."""
    store = ReadsDict(
        {"a": [Read("a", False, True, 0), Read("a", True, False, 0)]}
    )
    outcome = store.insert(Read("a", False, True, 0))
    self.assertIs(outcome, -1)
def test_insert_read1_with_read2_present(self):
    """Inserting the mate keeps read1 before read2 within the entry."""
    store = ReadsDict({"a": [Read("a", True, False, 0)]})
    position = store.insert(Read("a", False, True, 0))
    self.assertIs(position, 0)
    self.assertTrue(store["a"][0].is_read1)
    self.assertTrue(store["a"][1].is_read2)
def test_reads_from_key_list(self):
    """reads_from_key_list yields the values for the requested keys in order."""
    store = ReadsDict()
    store += {"a": 1, "b": 2, "c": 3, "d": 4}
    picked = store.reads_from_key_list(["b", "d"])
    self.assertEqual(next(picked), 2)
    self.assertEqual(next(picked), 4)