def test_compute_structured_pathway_abundance_or_coverage_test_abundance_with_OR(
        self):
    """
    Check the abundance computed for a structured pathway containing an OR

    The pathway structure and key reactions are stored in and read back from
    a PathwaysDatabase before being scored. The expected abundance is the
    harmonic mean of the reaction scores, with the OR group contributing
    only its largest member.
    """
    pathway_name = "pathway1"
    scores = {"A": 1, "B": 2, "C": 3, "E": 4, "F": 5}

    # Store the pathway, then read back its structure and key reactions
    database = store.PathwaysDatabase()
    database.add_pathway_structure(pathway_name, " A B C ( E , F )")
    parsed_structure = database.get_structure_for_pathway(pathway_name)
    required_reactions = database.get_key_reactions_for_pathway(pathway_name)

    # Score for abundance (fourth argument False, median slot passed as 0)
    observed = modules.compute_structured_pathway_abundance_or_coverage(
        parsed_structure, required_reactions, scores, False, 0)

    # Collapse the OR group to its maximum, then take the harmonic mean
    scores["E_or_F"] = max(scores.pop("E"), scores.pop("F"))
    remaining = list(scores.values())
    expected = len(remaining) / sum(1.0 / score for score in remaining)

    self.assertEqual(observed, expected)
def test_PathwaysDatabase_add_pathway_structure_test_key_reactions_not_included_in_reactions_database(
        self):
    """
    Pathways database class: key reactions when a reactions database is given

    The structure has two alternative starting branches that contract, and
    the reactions database lists only a subset of the reactions in the
    pathway. The reactions carry between one and three optional indicators
    ("-"), and the key reactions returned must match the expected list.
    """
    # Only some of the pathway reactions are present in the reactions database
    known_reactions = store.ReactionsDatabase()
    known_reactions.add_reactions({
        "A": ["gene1", "gene2"],
        "---B": ["gene3"],
        "--Z": ["gene4"],
        "-F": ["gene5"],
    })

    database = store.PathwaysDatabase()
    database.add_pathway_structure(
        "pathway1", "( ( L A ---B ) , ( --Z ---C D ) ) -E -F",
        known_reactions)

    self.assertEqual(
        ["L", "A", "---B", "--Z", "D", "-F"],
        database.get_key_reactions_for_pathway("pathway1"))
def test_compute_structured_pathway_abundance_or_coverage_test_coverage_missing_optional_reaction(
        self):
    """
    Check coverage for a simple structure whose optional reaction is missing

    The pathway structure and key reactions are stored in and read back from
    a PathwaysDatabase. Reaction B is marked optional ("-B") and has a score
    of zero, so the expected coverage is the harmonic mean of the chi2cdf
    values of the remaining reactions only.
    """
    median = 1
    scores = {"A": 1, "B": 0, "C": 3}

    # Store the pathway, then read back its structure and key reactions
    database = store.PathwaysDatabase()
    database.add_pathway_structure("pathway1", " A -B C ")
    parsed_structure = database.get_structure_for_pathway("pathway1")
    required_reactions = database.get_key_reactions_for_pathway("pathway1")

    # Score for coverage (fourth argument True, with the median)
    observed = modules.compute_structured_pathway_abundance_or_coverage(
        parsed_structure, required_reactions, scores, True, median)

    # Drop the optional reaction before computing the expected value
    scores.pop("B")
    required_cdf_values = [
        chi2cdf.chi2cdf(score, median) for score in scores.values()
    ]
    expected = modules.harmonic_mean(required_cdf_values)

    self.assertEqual(observed, expected)
def test_compute_structured_pathway_abundance_or_coverage_test_coverage_missing_required_reaction(
        self):
    """
    Check coverage for a simple structure whose required reaction is missing

    The pathway structure and key reactions are stored in and read back from
    a PathwaysDatabase. Reaction B is required and has a score of zero, so
    the expected coverage is zero.
    """
    median = 1
    scores = {"A": 1, "B": 0, "C": 3}

    # Store the pathway, then read back its structure and key reactions
    database = store.PathwaysDatabase()
    database.add_pathway_structure("pathway1", " A B C ")
    parsed_structure = database.get_structure_for_pathway("pathway1")
    required_reactions = database.get_key_reactions_for_pathway("pathway1")

    # Score for coverage (fourth argument True, with the median)
    observed = modules.compute_structured_pathway_abundance_or_coverage(
        parsed_structure, required_reactions, scores, True, median)

    # One required reaction is missing, so the expected coverage is zero
    self.assertEqual(observed, 0)
def test_compute_structured_pathway_abundance_or_coverage_test_abundance(
        self):
    """
    Check the abundance computed for a simple structured pathway

    The pathway structure and key reactions are stored in and read back from
    a PathwaysDatabase. The expected abundance is the harmonic mean of the
    reaction scores.
    """
    scores = {"A": 1, "B": 2, "C": 3}

    # Store the pathway, then read back its structure and key reactions
    database = store.PathwaysDatabase()
    database.add_pathway_structure("pathway1", " A B C ")
    parsed_structure = database.get_structure_for_pathway("pathway1")
    required_reactions = database.get_key_reactions_for_pathway("pathway1")

    # Score for abundance (fourth argument False, median slot passed as 0)
    observed = modules.compute_structured_pathway_abundance_or_coverage(
        parsed_structure, required_reactions, scores, False, 0)

    # Harmonic mean of all of the reaction scores
    all_scores = list(scores.values())
    expected = len(all_scores) / sum(1.0 / score for score in all_scores)

    self.assertEqual(observed, expected)
def test_compute_structured_pathway_abundance_or_coverage_test_abundance_missing_required_reaction(
        self):
    """
    Check abundance for a simple structure whose required reaction is missing

    The pathway structure and key reactions are stored in and read back from
    a PathwaysDatabase. Reaction B is required and has a score of zero, so
    the expected abundance is zero.
    """
    scores = {"A": 1, "B": 0, "C": 3}

    # Store the pathway, then read back its structure and key reactions
    database = store.PathwaysDatabase()
    database.add_pathway_structure("pathway1", " A B C ")
    parsed_structure = database.get_structure_for_pathway("pathway1")
    required_reactions = database.get_key_reactions_for_pathway("pathway1")

    # Score for abundance (fourth argument False, median slot passed as 0)
    observed = modules.compute_structured_pathway_abundance_or_coverage(
        parsed_structure, required_reactions, scores, False, 0)

    # A missing required reaction gives an abundance of zero
    self.assertEqual(observed, 0)
def test_compute_pathways_coverage_structured(self):
    """
    Test the compute_pathways_coverage function
    Test PathwaysDatabase add
    Test PathwaysAndReactions store
    Test Pathways store
    Test with structured pathways

    The xipe setting is switched off for the duration of the test and put
    back to its previous value afterwards so other tests are not affected.
    """
    # Set xipe to off, remembering the prior value so it can be restored
    xipe_toggle_original = config.xipe_toggle
    config.xipe_toggle = "off"
    try:
        # Create the database structure
        pathways_database_store = store.PathwaysDatabase()
        pathways_database_store.add_pathway_structure("pathway1",
                                                      " A B C D ")
        pathways_database_store.add_pathway_structure("pathway2",
                                                      " A B C D E F ")

        # Have all test data be from the same bug
        bug = "bug"
        pathways_and_reactions_store = store.PathwaysAndReactions()
        # Just a note that a value of 1 has a chi2cdf value of 0
        # Also values ~10 or less have small chi2cdf values
        pathways_and_reactions_store.add(bug, "A", "pathway1", 11)
        pathways_and_reactions_store.add(bug, "B", "pathway1", 12)
        pathways_and_reactions_store.add(bug, "C", "pathway1", 13)
        pathways_and_reactions_store.add(bug, "D", "pathway1", 14)
        pathways_and_reactions_store.add(bug, "A", "pathway2", 19)
        pathways_and_reactions_store.add(bug, "B", "pathway2", 20)
        pathways_and_reactions_store.add(bug, "C", "pathway2", 30)
        pathways_and_reactions_store.add(bug, "D", "pathway2", 40)
        pathways_and_reactions_store.add(bug, "E", "pathway2", 50)
        pathways_and_reactions_store.add(bug, "F", "pathway2", 60)

        # Get the coverage result
        # The median is the median of all of the reactions of all of the
        # pathways for this bug
        median_score_value = 19.5

        # boost the pathway values (the lowest value in each pathway is
        # raised to the next lowest), then take the harmonic mean of the
        # chi2cdf values
        pathway1_values_boosted = [12, 12, 13, 14]
        coverage_pathway1 = len(pathway1_values_boosted) / sum(
            1.0 / chi2cdf.chi2cdf(v, median_score_value)
            for v in pathway1_values_boosted)

        pathway2_values_boosted = [20, 20, 30, 40, 50, 60]
        coverage_pathway2 = len(pathway2_values_boosted) / sum(
            1.0 / chi2cdf.chi2cdf(v, median_score_value)
            for v in pathway2_values_boosted)

        # Find the actual result
        pathways_coverage_store_result = modules.compute_pathways_coverage(
            pathways_and_reactions_store, pathways_database_store)

        # Test the pathways coverage match those expected
        self.assertEqual(
            pathways_coverage_store_result.get_score_for_bug(
                bug, "pathway1"), coverage_pathway1)
        self.assertEqual(
            pathways_coverage_store_result.get_score_for_bug(
                bug, "pathway2"), coverage_pathway2)
    finally:
        # Restore the global setting even when an assertion fails
        config.xipe_toggle = xipe_toggle_original
def test_pathways_abundance_with_names(self):
    """
    Test the pathways abundance computation (xipe and minpath are off)
    Test the pathways print function
    Test the pathways mapping to names
    Test the unmapped and unintegrated values are printed

    The minpath and xipe settings are restored, and the temp output files
    are removed, even when the computation raises or the comparison fails.
    """
    # update the max decimals to allow for rounding
    config.output_max_decimals = 7

    # Load in the pathways databases
    reactions_database = store.ReactionsDatabase(
        config.pathways_database_part1)
    pathways_database = store.PathwaysDatabase(
        config.pathways_database_part2, reactions_database)

    # Load in the gene scores from the file
    # This file has the gene names included
    gene_scores = store.GeneScores()
    gene_scores.add_from_file(
        cfg.larger_gene_families_uniref50_with_names_file)

    # Remember the current settings so they can be put back afterwards
    minpath_toggle_original = config.minpath_toggle
    xipe_toggle_original = config.xipe_toggle

    # Track the temp files so cleanup only touches files that were created
    abundance_file = None
    coverage_file = None
    try:
        # Turn off xipe and minpath
        config.minpath_toggle = "off"
        config.xipe_toggle = "off"

        pathways_and_reactions_store = modules.identify_reactions_and_pathways(
            gene_scores, reactions_database, pathways_database)

        # set the locations to write as temp files
        file_out, abundance_file = tempfile.mkstemp()
        os.close(file_out)
        config.pathabundance_file = abundance_file

        file_out, coverage_file = tempfile.mkstemp()
        os.close(file_out)
        config.pathcoverage_file = coverage_file

        unaligned_reads_count = 10
        abundance_file, coverage_file = modules.compute_pathways_abundance_and_coverage(
            gene_scores, reactions_database, pathways_and_reactions_store,
            pathways_database, unaligned_reads_count)

        # check the output is as expected
        self.assertTrue(
            filecmp.cmp(abundance_file,
                        cfg.demo_pathabundance_file,
                        shallow=False))
    finally:
        # Reset xipe and minpath
        config.minpath_toggle = minpath_toggle_original
        config.xipe_toggle = xipe_toggle_original

        # Remove whichever temp files were created
        for temp_file in (abundance_file, coverage_file):
            if temp_file is not None:
                utils.remove_temp_file(temp_file)
def test_PathwaysDatabase_is_structured_structure(self):
    """
    Pathways database class: a structured pathways file is flagged as such

    Loads the pathways file named by cfg.pathways_file and checks that the
    resulting database reports itself as structured.
    """
    database = store.PathwaysDatabase(cfg.pathways_file)
    reported_structured = database.is_structured()

    self.assertTrue(reported_structured)
def test_compute_pathways_coverage_unstructured(self):
    """
    Test the compute_pathways_coverage function
    Test PathwaysDatabase add
    Test PathwaysAndReactions store
    Test Pathways store
    Test with unstructured pathways

    The xipe setting is switched off for the duration of the test and put
    back to its previous value afterwards so other tests are not affected.
    """
    # Set xipe to off, remembering the prior value so it can be restored
    xipe_toggle_original = config.xipe_toggle
    config.xipe_toggle = "off"
    try:
        # Create the database structure
        pathways_database_store = store.PathwaysDatabase()
        pathways_database_store.add_pathway("pathway1",
                                            ["A", "B", "C", "D"])
        pathways_database_store.add_pathway(
            "pathway2", ["A", "B", "C", "D", "E", "F"])

        # Have all test data be from the same bug
        bug = "bug"
        pathways_and_reactions_store = store.PathwaysAndReactions()
        pathways_and_reactions_store.add(bug, "A", "pathway1", 1)
        # Note B is not recorded for pathway1 which will result in a zero value
        pathways_and_reactions_store.add(bug, "C", "pathway1", 3)
        pathways_and_reactions_store.add(bug, "D", "pathway1", 40)
        pathways_and_reactions_store.add(bug, "A", "pathway2", 10)
        pathways_and_reactions_store.add(bug, "B", "pathway2", 20)
        # Note C is not recorded for pathway2 which will result in a zero value
        pathways_and_reactions_store.add(bug, "D", "pathway2", 40)
        pathways_and_reactions_store.add(bug, "E", "pathway2", 50)
        # Note F is not recorded for pathway2 which will result in a zero value

        # The coverage for each pathway is the number of reactions greater
        # than the median divided by the total reactions in the pathway
        # The median for this set is 20
        pathway1_values = [0, 1, 3, 40]
        count_greater_than_median = 1
        pathway1_coverage = count_greater_than_median / float(
            len(pathway1_values))

        pathway2_values = [0, 0, 10, 20, 40, 50]
        count_greater_than_median = 2
        pathway2_coverage = count_greater_than_median / float(
            len(pathway2_values))

        pathways_coverage_store_result = modules.compute_pathways_coverage(
            pathways_and_reactions_store, pathways_database_store)

        # Test the pathways coverage match those expected
        self.assertEqual(
            pathways_coverage_store_result.get_score_for_bug(
                bug, "pathway1"), pathway1_coverage)
        self.assertEqual(
            pathways_coverage_store_result.get_score_for_bug(
                bug, "pathway2"), pathway2_coverage)
    finally:
        # Restore the global setting even when an assertion fails
        config.xipe_toggle = xipe_toggle_original
def test_compute_pathways_abundance_unstructured(self):
    """
    Check compute_pathways_abundance on unstructured pathways

    Exercises PathwaysDatabase add, the PathwaysAndReactions store and the
    resulting pathways store. The expected abundance of each pathway is the
    average of the largest half of its reaction values; reactions that were
    never recorded count as zero.
    """
    bug = "bug"

    # Create the database structure
    database = store.PathwaysDatabase()
    database.add_pathway("pathway1", ["A", "B", "C", "D"])
    database.add_pathway("pathway2", ["A", "B", "C", "D", "E", "F"])

    # All of the data comes from a single bug.
    # B is left out of pathway1, and C and F are left out of pathway2, so
    # each of those will result in a zero value.
    recorded_scores = [
        ("A", "pathway1", 1),
        ("C", "pathway1", 3),
        ("D", "pathway1", 4),
        ("A", "pathway2", 10),
        ("B", "pathway2", 20),
        ("D", "pathway2", 40),
        ("E", "pathway2", 50),
    ]
    reactions_store = store.PathwaysAndReactions()
    for reaction, pathway, score in recorded_scores:
        reactions_store.add(bug, reaction, pathway, score)

    def mean_of_largest_half(ordered_values):
        # the values are listed smallest first, so the largest half is the tail
        largest_half = ordered_values[int(len(ordered_values) / 2):]
        return sum(largest_half) / len(largest_half)

    # For unstructured pathways a missing reaction does not zero the abundance
    expected_pathway1 = mean_of_largest_half([0, 1, 3, 4])
    expected_pathway2 = mean_of_largest_half([0, 0, 10, 20, 40, 50])

    abundance_result, _ = modules.compute_pathways_abundance(
        reactions_store, database)

    # The computed abundances must match those expected
    self.assertEqual(abundance_result.get_score_for_bug(bug, "pathway1"),
                     expected_pathway1)
    self.assertEqual(abundance_result.get_score_for_bug(bug, "pathway2"),
                     expected_pathway2)
def test_compute_pathways_abundance_structured(self):
    """
    Check compute_pathways_abundance on structured pathways

    Exercises PathwaysDatabase add, the PathwaysAndReactions store and the
    resulting pathways store. The expected abundance of each pathway is the
    harmonic mean of its reaction values after the lowest value has been
    boosted.
    """
    bug = "bug"

    # Create the database structure
    database = store.PathwaysDatabase()
    database.add_pathway_structure("pathway1", " A B C D ")
    database.add_pathway_structure("pathway2", " A B C D E F ")

    # All of the data comes from a single bug.
    # Just a note that a value of 1 has a chi2cdf value of 0
    # Also values ~10 or less have small chi2cdf values
    recorded_scores = [
        ("A", "pathway1", 11),
        ("B", "pathway1", 12),
        ("C", "pathway1", 13),
        ("D", "pathway1", 14),
        ("A", "pathway2", 19),
        ("B", "pathway2", 20),
        ("C", "pathway2", 30),
        ("D", "pathway2", 40),
        ("E", "pathway2", 50),
        ("F", "pathway2", 60),
    ]
    reactions_store = store.PathwaysAndReactions()
    for reaction, pathway, score in recorded_scores:
        reactions_store.add(bug, reaction, pathway, score)

    def harmonic_mean(values):
        return len(values) / sum(1.0 / value for value in values)

    # The lowest value in each pathway is boosted before averaging
    expected_pathway1 = harmonic_mean([12, 12, 13, 14])
    expected_pathway2 = harmonic_mean([20, 20, 30, 40, 50, 60])

    # Find the actual result
    abundance_result, _ = modules.compute_pathways_abundance(
        reactions_store, database)

    # The computed abundances must match those expected
    self.assertEqual(abundance_result.get_score_for_bug(bug, "pathway1"),
                     expected_pathway1)
    self.assertEqual(abundance_result.get_score_for_bug(bug, "pathway2"),
                     expected_pathway2)
def test_compute_pathways_abundance_unstructured_reactions_list(self):
    """
    Check the reactions list returned by compute_pathways_abundance

    Exercises PathwaysDatabase add, the PathwaysAndReactions store and the
    resulting pathways store with unstructured pathways, and verifies the
    list of reactions included in pathways for each of two bugs.
    """
    # Create the database structure
    database = store.PathwaysDatabase()
    database.add_pathway("pathway1", ["A", "B", "C", "D"])
    database.add_pathway("pathway2", ["A", "B", "C", "D", "E", "F"])
    database.add_pathway("pathway3", ["A", "B", "G"])

    # The test data comes from two bugs
    recorded_scores = [
        ("bug1", "A", "pathway1", 1),
        ("bug1", "B", "pathway1", 2),
        ("bug1", "B", "pathway2", 2),
        ("bug1", "C", "pathway2", 3),
        ("bug2", "A", "pathway1", 1),
        ("bug2", "D", "pathway1", 3),
        ("bug2", "B", "pathway1", 2),
        ("bug2", "B", "pathway2", 2),
    ]
    reactions_store = store.PathwaysAndReactions()
    for bug, reaction, pathway, score in recorded_scores:
        reactions_store.add(bug, reaction, pathway, score)

    expected_reactions = {
        "bug1": ["A", "B", "C"],
        "bug2": ["A", "B", "D"],
    }

    _, reactions_present = modules.compute_pathways_abundance(
        reactions_store, database)

    # The reactions must match those expected, ignoring order
    for bug in ("bug1", "bug2"):
        self.assertEqual(sorted(expected_reactions[bug]),
                         sorted(reactions_present[bug]))
def main():
    """
    Select reads from a sam file for the species and pathways requested

    Steps, in order:
      1. Load the reactions and pathways databases named in config.
      2. Read the tab-delimited selection file (species, pathway) and collect
         the gene families requested for each species. A pathway of "ALL"
         selects every gene family for that species.
      3. Scan the sam file and record the reads that hit a requested gene
         family (and, with add_unintegrated, reads that hit gene families
         not in any pathway).
      4. With add_markers, run MetaPhlAn on the reads of the selected species
         to find the reads that map to species markers.
      5. With percent below 100, keep only enough reads to reach the
         requested percent of each reaction total.
      6. With add_markers, add marker reads until each species has at least
         min_markers, spreading them over the markers.
      7. Write the selected reads once each, followed by half-length
         "trimmable" copies when add_trimmable is set.

    Exits through sys.exit if the output file can not be opened.
    """
    args = parse_arguments(sys)

    # read in the gene families to reactions database
    print("Reading gene families to reactions database")
    reactions_database = store.ReactionsDatabase(
        config.pathways_database_part1)

    # read in the reactions to pathways database
    print("Reading reactions to pathways database")
    pathways_database = store.PathwaysDatabase(config.pathways_database_part2,
                                               reactions_database)

    def genes_with_prefix(reaction):
        # keep only the genes for the reaction that start with the
        # gene families prefix requested
        return [
            gene for gene in reactions_database.find_genes(reaction)
            if gene.startswith(args.gene_families)
        ]

    # read in the species and pathways selected
    print("Finding gene families for each species for the pathways selected")
    genefamilies = {}
    with open(args.input_selection) as selection_file:
        for line in selection_file:
            # remove starting and ending spaces
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            # a failed two-value unpack raises ValueError (not IndexError)
            try:
                species, pathway = line.split("\t")
            except ValueError:
                print("Warning: Skipping selection line because of format: " +
                      line)
                continue

            if pathway.upper() == "ALL":
                genefamilies[species] = "ALL"
                continue

            requested = genefamilies.setdefault(species, set())
            if requested == "ALL":
                # everything is already selected for this species
                continue

            # get the reactions and then gene families for the pathway
            for reaction in pathways_database.find_reactions(pathway):
                requested.update(genes_with_prefix(reaction))

    # get the set of gene families in any pathway
    all_genefamilies_in_pathways = set()
    for reaction in pathways_database.reaction_list():
        all_genefamilies_in_pathways.update(genes_with_prefix(reaction))

    # open the output file (early, so a bad path fails before the long scan)
    try:
        file_handle = open(args.output, "w")
    except EnvironmentError:
        sys.exit("ERROR: Unable to open output file: " + args.output)

    # find the reads for the species and pathways selected
    print("Reading sam file")
    selected_reads = set()
    reads_to_gene_familes = {}
    reaction_totals = {}

    # open a fasta file to write the species specific sequences for input to metaphlan
    species_fasta_file_handle = None
    if args.add_markers:
        species_fasta_file = args.output + ".species_specific_reads.fasta"
        species_fasta_file_handle = open(species_fasta_file, "w")

    selected_reads_to_species = {}
    try:
        for species, gene_family, read_name, sequence, quality_scores in read_sam(
                args.input_sam, args.gene_families):
            # record the reads that align to species/gene families requested
            requested_genes_for_species = genefamilies.get(species, [])
            if ("ALL" in requested_genes_for_species
                    or gene_family in requested_genes_for_species
                    or (args.add_unintegrated and
                        gene_family not in all_genefamilies_in_pathways)):
                selected_reads.add(read_name)
                selected_reads_to_species[
                    species] = selected_reads_to_species.get(species, 0) + 1
                reads_to_gene_familes.setdefault(read_name,
                                                 set()).add(gene_family)

                # add to the reaction totals count
                for reaction in reactions_database.find_reactions(
                        gene_family):
                    reaction_totals[reaction] = reaction_totals.get(
                        reaction, 0) + 1.0

            # if this is a species in the list, and we are adding markers,
            # write this to the fasta file for input to metaphlan
            if species_fasta_file_handle is not None and species in genefamilies:
                write_sequence(species_fasta_file_handle, read_name, sequence,
                               quality_scores, "fasta")
    finally:
        # close the species fasta file handle, also when the scan fails
        if species_fasta_file_handle is not None:
            species_fasta_file_handle.close()

    print("Total reads per species based on gene families")
    for species, count in selected_reads_to_species.items():
        print(species + "\t" + str(count))

    print("Total reads found: " + str(len(selected_reads)))

    # get the markers reads from the species fasta file
    marker_reads_to_add = set()
    read_to_species_marker = {}
    read_to_marker_name = {}
    all_reads_mapping_to_markers = set()
    if args.add_markers:
        print("Finding reads mapped to markers with MetaPhlAn")
        print("Running MetaPhlAn")
        metaphlan_marker_file = species_fasta_file + ".marker_alignments.tsv"
        metaphlan_bowtie2_file = species_fasta_file + ".bowtie2.tsv"
        try:
            # remove the bowtie2 output file if it exists to prevent metaphlan error
            os.remove(metaphlan_bowtie2_file)
        except EnvironmentError:
            pass

        # the captured output is not needed, only the files written
        subprocess.check_output([
            "metaphlan", "--input_type", "fasta", species_fasta_file, "-t",
            "reads_map", "-o", metaphlan_marker_file, "--bowtie2out",
            metaphlan_bowtie2_file
        ])

        # read through the file to identify the marker reads to add
        with open(metaphlan_marker_file) as marker_file:
            for line in marker_file:
                if line.startswith("#"):
                    continue
                read_name, taxon = line.rstrip().split("\t")
                species = taxon.split("|")[-1]
                if species in genefamilies:
                    marker_reads_to_add.add(read_name)
                    read_to_species_marker.setdefault(species,
                                                      set()).add(read_name)
                all_reads_mapping_to_markers.add(read_name)

        # read through the file to identify the name of the marker the reads map to
        with open(metaphlan_bowtie2_file) as bowtie2_file:
            for line in bowtie2_file:
                # skip lines that are not exactly two tab-delimited columns
                try:
                    read_name, marker_name = line.rstrip().split("\t")
                except ValueError:
                    continue
                if read_name in all_reads_mapping_to_markers:
                    read_to_marker_name[read_name] = marker_name

        print("Found a total of " + str(len(marker_reads_to_add)) +
              " reads to species markers")

    if args.percent < 100:
        print("Filtering reads by percent requested: " + str(args.percent))
        filtered_reads = set()
        current_reaction_totals = dict.fromkeys(reaction_totals, 0)
        # go through the reads, adding until there is enough in the reaction list
        # to hit the percent requested
        for read_name in selected_reads:
            # check if this read is needed to increase the reaction percents
            add_read = False
            for gene in reads_to_gene_familes[read_name]:
                for reaction in reactions_database.find_reactions(gene):
                    if (current_reaction_totals[reaction] /
                            reaction_totals[reaction]) * 100 < args.percent:
                        add_read = True

            if add_read:
                filtered_reads.add(read_name)
                # update the current reaction counts
                for gene in reads_to_gene_familes[read_name]:
                    for reaction in reactions_database.find_reactions(gene):
                        current_reaction_totals[reaction] += 1

        # update the selected reads to those that are filtered
        selected_reads = filtered_reads
        print("Total reads after filtering: " + str(len(selected_reads)))

    # check to make sure at least 200 marker reads (or command line setting) for each species are present
    # make sure the reads are spread over the markers for the sample
    if args.add_markers:
        # for each species, check the number of marker reads present
        print(
            "Counting markers in set to determine if reads need to be added to meet min markers"
        )
        for species, reads_for_species in read_to_species_marker.items():
            # count how many of the reads are in the selected reads list
            overlap = selected_reads.intersection(reads_for_species)
            total_overlap = len(overlap)
            if total_overlap >= args.min_markers:
                continue

            # add in more reads to hit the min markers setting for this species
            max_reads_to_add = list(
                reads_for_species.difference(selected_reads))

            # group the reads based on the species markers
            to_add_by_markers = {}
            for read_name in max_reads_to_add:
                marker_for_read = read_to_marker_name[read_name]
                to_add_by_markers.setdefault(marker_for_read,
                                             set()).add(read_name)

            selected_reads_by_markers = {}
            for read_name in selected_reads:
                try:
                    marker_for_read = read_to_marker_name[read_name]
                except KeyError:
                    # ignore reads that do not map to markers
                    continue
                selected_reads_by_markers.setdefault(marker_for_read,
                                                     set()).add(read_name)

            # count the total reads to add
            total_to_add = args.min_markers - total_overlap
            total_added = 0

            # get a list of all of the markers for the species
            all_species_markers = set(to_add_by_markers.keys())
            all_species_markers.update(selected_reads_by_markers.keys())

            # start with the markers not already included, adding reads to each
            for marker_name in all_species_markers.difference(
                    selected_reads_by_markers.keys()):
                total_markers_in_set = len(to_add_by_markers[marker_name])
                end_index = min(args.min_reads_per_marker,
                                total_markers_in_set)

                # add at most min reads per markers for this marker set to the set of selected reads
                to_add = list(to_add_by_markers[marker_name])[:end_index]
                selected_reads.update(to_add)
                selected_reads_by_markers.setdefault(marker_name,
                                                     set()).update(to_add)

                # remove the added reads from the set of reads to add
                to_add_by_markers[marker_name] = to_add_by_markers[
                    marker_name].difference(to_add)

                # track how many reads have been added so far
                total_added += end_index
                print("Adding " + str(end_index) +
                      " total reads to fill empty marker for species " +
                      species)

            # next add more reads for those sets that already have markers, starting with the smallest
            # set of markers to the largest
            marker_counts = {
                marker: len(reads)
                for marker, reads in selected_reads_by_markers.items()
            }
            for marker_name in sorted(marker_counts, key=marker_counts.get):
                # ignore markers that are not included in the to add list (only in the selected reads)
                if marker_name not in to_add_by_markers:
                    continue
                total_markers_in_set = len(to_add_by_markers[marker_name])
                end_index = min(args.min_reads_per_marker,
                                total_markers_in_set)

                # add only so many reads to get to min reads per markers for this set
                end_index = end_index - len(
                    selected_reads_by_markers[marker_name])

                # add at most min reads per marker reads for this marker set to the set of selected reads
                # (only count and report when reads were actually added)
                if end_index > 0:
                    to_add = list(to_add_by_markers[marker_name])[:end_index]
                    selected_reads.update(to_add)
                    selected_reads_by_markers[marker_name].update(to_add)

                    # remove the added reads from the set of reads to add
                    to_add_by_markers[marker_name] = to_add_by_markers[
                        marker_name].difference(to_add)

                    total_added += end_index
                    print(
                        "Adding " + str(end_index) +
                        " total reads to fill slightly full marker for species "
                        + species)

            # add more reads to get to the min reads added value
            total_to_add = total_to_add - total_added if total_added < total_to_add else 0
            try:
                selected_reads.update(max_reads_to_add[:total_to_add])
                total_added += total_to_add
                print("Added " + str(total_to_add) + " reads for species " +
                      species + " to meet min markers")
            except (TypeError, IndexError):
                continue

            print("Total reads added for species " + species + " :" +
                  str(total_added))

        print("Total reads after counting markers: " +
              str(len(selected_reads)))

    # determine how many trimmable reads to write
    total_trimmable = 0
    trimmable = []
    if args.add_trimmable:
        total_trimmable = int(random.uniform(50, 100))
        print("Adding " + str(total_trimmable) + " total trimmable reads")

    print("Writing output file")
    # the with block closes (and flushes) the output file when writing ends
    with file_handle:
        for species, gene_family, read_name, sequence, quality_scores in read_sam(
                args.input_sam, args.gene_families):
            # write the read sequences requested once
            if read_name in selected_reads:
                write_sequence(file_handle, read_name, sequence,
                               quality_scores, args.output_format)
                selected_reads.remove(read_name)

                if len(trimmable) < total_trimmable:
                    trimmable.append([read_name, sequence, quality_scores])

        # write the trimmable reads (the first half of each sequence)
        for read_name, sequence, quality_scores in trimmable:
            new_length = int(len(sequence) / 2.0)
            write_sequence(file_handle, "trimmable_" + read_name,
                           sequence[0:new_length],
                           quality_scores[0:new_length], args.output_format)

    print("Output file written: " + args.output)
def test_compute_pathways_abundance_structured_reactions_list(self):
    """
    Check the reactions list returned by compute_pathways_abundance

    Exercises PathwaysDatabase add, the PathwaysAndReactions store and the
    resulting pathways store with structured pathways and gap fill, and
    verifies the list of reactions included in pathways for each of three
    bugs.
    """
    # Create the database structure
    database = store.PathwaysDatabase()
    database.add_pathway_structure("pathway1", " A B C D ")
    database.add_pathway_structure("pathway2", " A B C D E F ")
    database.add_pathway_structure("pathway3", " A B G")

    # The test data comes from three bugs
    recorded_scores = [
        ("bug1", "A", "pathway1", 1),
        ("bug1", "B", "pathway1", 2),
        ("bug1", "C", "pathway1", 3),
        ("bug2", "A", "pathway1", 1),
        ("bug2", "B", "pathway1", 3),
        ("bug2", "D", "pathway2", 2),
        ("bug2", "B", "pathway2", 2),
        ("bug3", "A", "pathway3", 1),
        ("bug3", "B", "pathway3", 3),
        ("bug3", "G", "pathway3", 2),
        ("bug3", "B", "pathway2", 2),
    ]
    reactions_store = store.PathwaysAndReactions()
    for bug, reaction, pathway, score in recorded_scores:
        reactions_store.add(bug, reaction, pathway, score)

    expected_reactions = {
        # pathway1 is present because D is filled in, though D does not
        # have abundance so it is not included in the list
        "bug1": ["A", "B", "C"],
        # the pathways for this bug are missing too many reactions to
        # have abundance
        "bug2": [],
        # one pathway for this bug includes all reactions
        "bug3": ["A", "B", "G"],
    }

    _, reactions_present = modules.compute_pathways_abundance(
        reactions_store, database)

    # The reactions must match those expected, ignoring order
    for bug in ("bug1", "bug2", "bug3"):
        self.assertEqual(sorted(expected_reactions[bug]),
                         sorted(reactions_present[bug]))