def test_gene_families_tsv_output_with_names(self):
    """
    Test the gene families function and the blast config indexes
    Test UniRef50_unknown is read in and used for gene scores but not printed
    Test the tsv output
    Test that gene families have names applied to them
    Test unmapped reads total is written with the same precision as other lines

    The gene family name mapping file setting is restored and the temp
    output file is removed even if the run or the comparison fails.
    """

    # update the max decimals to allow for rounding
    config.output_max_decimals = 7

    # set to a smaller mapping file
    original_gene_family_mapping_file = config.gene_family_name_mapping_file
    config.gene_family_name_mapping_file = cfg.gene_families_to_names_file

    # stays None until the temp file has been created so that the cleanup
    # below only removes a file that exists
    gene_families_file = None
    try:
        # create a set of alignments
        alignments = store.Alignments()

        # load the usearch output, skipping the comment lines
        with open(cfg.usearch_uniref50_file) as file_handle:
            for line in file_handle:
                if line.startswith("#"):
                    continue
                data = line.strip().split(config.blast_delimiter)
                referenceids = data[config.blast_reference_index].split("|")
                queryid = data[config.blast_query_index]
                identity = float(data[config.blast_identity_index])

                alignments.add(referenceids[1], 1, queryid, identity,
                               referenceids[0])

        # set the output format
        config.output_format = "tsv"

        # set the location of the file to write to as a temp file
        file_out, gene_families_file = tempfile.mkstemp()
        os.close(file_out)
        config.genefamilies_file = gene_families_file

        # create gene_scores instance
        gene_scores = store.GeneScores()

        # obtain the gene families
        gene_families_file = families.gene_families(alignments, gene_scores, 1)

        # check the gene families output is as expected
        self.assertTrue(
            filecmp.cmp(gene_families_file,
                        cfg.gene_familes_uniref50_with_names_file,
                        shallow=False))
    finally:
        # reset the mapping file
        config.gene_family_name_mapping_file = original_gene_family_mapping_file

        # delete the temp file
        if gene_families_file is not None:
            utils.remove_temp_file(gene_families_file)
def test_compute_gene_abundance_in_pathways_without_reactions_database(self):
    """
    Test the compute gene abundance function
    Test the GeneScores add function
    Test without a reactions database
    (the pathways database is composed of genes)
    """

    gene_scores = store.GeneScores()

    # scores for two bugs, with a different value of gene1 for each bug
    scores_by_bug = (
        ("bug1", (("gene1", 1), ("gene2", 2), ("gene4", 4))),
        ("bug2", (("gene1", 1.1), ("gene7", 7), ("gene6", 6))),
    )
    for bug_name, bug_scores in scores_by_bug:
        for gene_name, score in bug_scores:
            gene_scores.add_single_score(bug_name, gene_name, score)

    # without a reactions database the pathway members are the genes
    reactions_in_pathways_present = {
        "bug1": ["gene1", "gene2"],
        "bug2": ["gene6"],
    }

    in_pathways, not_in_pathways = modules.compute_gene_abundance_in_pathways(
        gene_scores, None, reactions_in_pathways_present)

    # check the gene abundances in pathways are correct
    self.assertEqual(in_pathways["bug1"], 3)
    self.assertEqual(in_pathways["bug2"], 6)

    # check the gene abundances not in pathways are correct
    self.assertEqual(not_in_pathways["bug1"], 4)
    self.assertAlmostEqual(not_in_pathways["bug2"], 8.1)
def test_pathways_abundance_with_names(self):
    """
    Test the pathways abundance computation (xipe and minpath are off)
    Test the pathways print function
    Test the pathways mapping to names
    Test the unmapped and unintegrated values are printed

    The xipe and minpath toggles are restored and the temp output files
    are removed even if the computation or the comparison fails.
    """

    # update the max decimals to allow for rounding
    config.output_max_decimals = 7

    # load in the pathways databases
    reactions_database = store.ReactionsDatabase(
        config.pathways_database_part1)
    pathways_database = store.PathwaysDatabase(
        config.pathways_database_part2, reactions_database)

    # load in the gene scores from the file
    # this file has the gene names included
    gene_scores = store.GeneScores()
    gene_scores.add_from_file(
        cfg.larger_gene_families_uniref50_with_names_file)

    # turn off xipe and minpath
    minpath_toggle_original = config.minpath_toggle
    config.minpath_toggle = "off"
    xipe_toggle_original = config.xipe_toggle
    config.xipe_toggle = "off"

    # stay None until the temp files have been created so that the cleanup
    # below only removes files that exist
    abundance_file = None
    coverage_file = None
    try:
        pathways_and_reactions_store = modules.identify_reactions_and_pathways(
            gene_scores, reactions_database, pathways_database)

        # set the locations to write as temp files
        file_out, abundance_file = tempfile.mkstemp()
        os.close(file_out)
        config.pathabundance_file = abundance_file

        file_out, coverage_file = tempfile.mkstemp()
        os.close(file_out)
        config.pathcoverage_file = coverage_file

        unaligned_reads_count = 10
        abundance_file, coverage_file = modules.compute_pathways_abundance_and_coverage(
            gene_scores, reactions_database, pathways_and_reactions_store,
            pathways_database, unaligned_reads_count)

        # check the output is as expected
        self.assertTrue(
            filecmp.cmp(abundance_file, cfg.demo_pathabundance_file,
                        shallow=False))
    finally:
        # reset xipe and minpath
        config.minpath_toggle = minpath_toggle_original
        config.xipe_toggle = xipe_toggle_original

        # delete the temp files
        for temp_file in (abundance_file, coverage_file):
            if temp_file is not None:
                utils.remove_temp_file(temp_file)
def test_Alignments_compute_gene_scores_double_gene_double_query_with_temp_alignment_file(
        self):
    """
    Test the compute_gene_scores function
    Test two hits to gene with more than one hit per query
    Test with the temp alignment file
    """

    # each hit is (reference, reference_length, query, matches, bug)
    query1_gene1_matches = 41.0
    query1_gene2_matches = 57.1
    query2_gene2_matches = 61.0
    hits = (
        ("gene1", 2, "query1", query1_gene1_matches, "bug1"),
        ("gene2", 3, "query1", query1_gene2_matches, "bug1"),
        ("gene2", 3, "query2", query2_gene2_matches, "bug1"),
        ("gene3", 4, "query3", 72.1, "bug1"),
    )

    # store the alignments using the temp alignment file
    alignments_store = store.Alignments(minimize_memory_use=True)
    for reference, length, query, matches, bug in hits:
        alignments_store.add(reference, length, query, matches, bug)

    # compute gene scores
    gene_scores_store = store.GeneScores()
    alignments_store.convert_alignments_to_gene_scores(gene_scores_store)

    # query1 has two hits (gene1 and gene2), so its sum includes both
    query1_gene1_score = math.pow(query1_gene1_matches, config.match_power)
    query1_gene2_score = math.pow(query1_gene2_matches, config.match_power)
    query1_total = query1_gene1_score + query1_gene2_score

    # gene2 length converted to per kb
    gene2_length_kb = 3 / 1000.0

    # query2 has a single hit (gene2)
    query2_gene2_score = math.pow(query2_gene2_matches, config.match_power)
    query2_total = query2_gene2_score

    # gene2 receives a contribution from each of its two hits
    from_query2 = query2_gene2_score / query2_total / gene2_length_kb
    from_query1 = query1_gene2_score / query1_total / gene2_length_kb
    expected_gene_score = from_query2 + from_query1

    actual_gene_score = gene_scores_store.get_score("bug1", "gene2")

    # delete the temp alignment file
    alignments_store.delete_temp_alignments_file()

    self.assertAlmostEqual(actual_gene_score, expected_gene_score, places=7)
def test_GeneScores_add_from_file_id_mapping_bug_list(self):
    """
    GeneScores class: Test add_from_file bug list with id mapping
    """

    scores = store.GeneScores()
    scores.add_from_file(cfg.genetable_file,
                         id_mapping_file=cfg.id_mapping_gene_table_file)

    # the bugs expected are the keys of the id mapping scores in cfg
    expected_bugs = sorted(cfg.genetable_file_bug_scores_id_mapping)
    actual_bugs = sorted(scores.bug_list())

    # test the bug list is as expected
    self.assertEqual(expected_bugs, actual_bugs)
def test_compute_gene_abundance_in_pathways_with_reactions_database(self):
    """
    Test the compute gene abundance function
    Test the GeneScores add function
    Test the ReactionsDatabase add function
    Test with a reactions database
    (the pathways database is composed of reactions and these reactions
    map to genes, with some genes mapping to multiple reactions)
    """

    gene_scores = store.GeneScores()

    # scores for two bugs, with a different value of gene1 for each bug
    scores_by_bug = (
        ("bug1", (("gene1", 1), ("gene2", 2), ("gene4", 4))),
        ("bug2", (("gene1", 1.1), ("gene7", 7), ("gene6", 6),
                  ("gene8", 0.2))),
    )
    for bug_name, bug_scores in scores_by_bug:
        for gene_name, score in bug_scores:
            gene_scores.add_single_score(bug_name, gene_name, score)

    # gene1 maps to two of the reactions
    reactions_database = store.ReactionsDatabase()
    reactions_database.add_reactions({
        "reaction1": ["gene1", "gene6"],
        "reaction2": ["gene1", "gene2"],
        "reaction3": ["gene4", "gene7"],
        "reaction4": ["gene8"]
    })

    # one bug has two reactions and the other has a single reaction
    # the two reactions of bug1 both include gene1
    # (to test this value is not added twice in the abundance result)
    reactions_in_pathways_present = {
        "bug1": ["reaction1", "reaction2"],
        "bug2": ["reaction1"],
    }

    in_pathways, not_in_pathways = modules.compute_gene_abundance_in_pathways(
        gene_scores, reactions_database, reactions_in_pathways_present)

    # check the gene abundances in pathways are correct
    self.assertEqual(in_pathways["bug1"], 3)
    self.assertAlmostEqual(in_pathways["bug2"], 7.1)

    # check the gene abundances not in pathways are correct
    self.assertEqual(not_in_pathways["bug1"], 4)
    self.assertAlmostEqual(not_in_pathways["bug2"], 7.2)
def test_GeneScores_add_from_file_id_mapping_gene_list(self):
    """
    GeneScores class: Test add_from_file gene list with id mapping
    """

    scores = store.GeneScores()
    scores.add_from_file(cfg.genetable_file,
                         id_mapping_file=cfg.id_mapping_gene_table_file)

    # collect each gene in the table once, across all of the bugs
    bug_scores = cfg.genetable_file_bug_scores_id_mapping
    expected_genes = sorted(
        {gene for bug in bug_scores for gene in bug_scores[bug]})
    actual_genes = sorted(scores.gene_list())

    # test the gene list is as expected
    self.assertEqual(expected_genes, actual_genes)
def test_Alignments_compute_gene_scores_single_gene_single_query_with_temp_alignment_file(
        self):
    """
    Test the compute_gene_scores function
    Test one hit for gene with one hit for query
    Test with the temp alignment file
    """

    # create a set of hits
    matches1 = 41.0
    matches2 = 57.1
    matches3 = 61.0
    matches4 = 72.1

    gene1_length = 2
    gene2_length = 3
    gene3_length = 4

    # create a set of alignments
    alignments_store = store.Alignments(minimize_memory_use=True)

    alignments_store.add("gene1", gene1_length, "query1", matches1, "bug1")
    alignments_store.add("gene2", gene2_length, "query1", matches2, "bug1")
    alignments_store.add("gene2", gene2_length, "query2", matches3, "bug1")
    alignments_store.add("gene3", gene3_length, "query3", matches4, "bug1")

    gene_scores_store = store.GeneScores()

    # compute gene scores
    alignments_store.convert_alignments_to_gene_scores(gene_scores_store)

    # convert lengths to per kb
    gene3_length = gene3_length / 1000.0

    # gene3 has a single hit, from query3, which has no other hits
    hit4_score = math.pow(matches4, config.match_power)
    query3_sum = hit4_score
    expected_gene_score = hit4_score / query3_sum / gene3_length

    actual_gene_score = gene_scores_store.get_score("bug1", "gene3")

    # delete the temp alignment file
    alignments_store.delete_temp_alignments_file()

    # the scores are floats so compare with a tolerance instead of exactly
    self.assertAlmostEqual(actual_gene_score, expected_gene_score, places=7)
def test_Alignments_compute_gene_scores_single_gene_double_query(self):
    """
    Test the compute_gene_scores function
    Test one hit for gene with more than one hit per query
    """

    # create a set of hits
    # bug, reference, reference_length, query, matches = hit
    matches1 = 41.0
    matches2 = 57.1
    matches3 = 61.0
    matches4 = 72.1

    gene1_length = 2
    gene2_length = 3
    gene3_length = 4

    # create a set of alignments
    alignments_store = store.Alignments()

    alignments_store.add("gene1", gene1_length, "query1", matches1, "bug1")
    alignments_store.add("gene2", gene2_length, "query1", matches2, "bug1")
    alignments_store.add("gene2", gene2_length, "query2", matches3, "bug1")
    alignments_store.add("gene3", gene3_length, "query3", matches4, "bug1")

    gene_scores_store = store.GeneScores()

    # compute gene scores
    alignments_store.convert_alignments_to_gene_scores(gene_scores_store)

    # convert lengths to per kb
    gene1_length = gene1_length / 1000.0

    # gene1 has a single hit, from query1, which also hits gene2
    hit1_score = math.pow(matches1, config.match_power)
    hit2_score = math.pow(matches2, config.match_power)
    query1_sum = hit1_score + hit2_score
    gene_score = hit1_score / query1_sum / gene1_length

    # the scores are floats so compare with a tolerance instead of exactly
    self.assertAlmostEqual(gene_scores_store.get_score("bug1", "gene1"),
                           gene_score, places=7)