Python PathwaysDatabase примеры, humann.store.PathwaysDatabase Python примеры использования

Пример #1

0

Показать файл

    def test_compute_structured_pathway_abundance_or_coverage_test_abundance_with_OR(
            self):
        """
        Check the structured pathway abundance for a structure containing an OR

        Stores the structure " A B C ( E , F )" in a PathwaysDatabase, reads
        back the structure and key reactions, and compares the computed
        abundance with the harmonic mean of the scores in which the OR group
        contributes the larger of its two scores.
        """

        # Register the pathway and read back what the database stored for it
        database = store.PathwaysDatabase()
        database.add_pathway_structure("pathway1", " A B C ( E , F )")
        pathway_structure = database.get_structure_for_pathway("pathway1")
        pathway_key_reactions = database.get_key_reactions_for_pathway(
            "pathway1")

        reaction_scores = {"A": 1, "B": 2, "C": 3, "E": 4, "F": 5}

        # Compute the abundance
        abundance = modules.compute_structured_pathway_abundance_or_coverage(
            pathway_structure, pathway_key_reactions, reaction_scores, False,
            0)

        # Collapse the OR group to its larger score, then take the harmonic
        # mean of the remaining scores
        e_score = reaction_scores.pop("E")
        f_score = reaction_scores.pop("F")
        reaction_scores["E_or_F"] = max(e_score, f_score)
        reciprocal_total = sum(
            1.0 / score for score in reaction_scores.values())
        expected_abundance = len(reaction_scores) / reciprocal_total

        self.assertEqual(abundance, expected_abundance)

Пример #2

0

Показать файл

Файл: advanced_tests_store.py Проект: parthoBTK/humann

    def test_PathwaysDatabase_add_pathway_structure_test_key_reactions_not_included_in_reactions_database(
            self):
        """
        Pathways database class: Check the key reactions of an added structure

        The structure has two starting branches that merge, and carries
        reactions with one, two and three leading "-" indicators.
        The reactions database passed to add_pathway_structure only holds
        "A", "---B", "--Z" and "-F".
        The expected key reactions are "L", "A", "---B", "--Z", "D" and "-F",
        so "---C" and "-E" must not be reported.
        """

        # The reactions database only covers a subset of the pathway reactions
        reactions_database_store = store.ReactionsDatabase()
        reactions_database_store.add_reactions({
            "A": ["gene1", "gene2"],
            "---B": ["gene3"],
            "--Z": ["gene4"],
            "-F": ["gene5"],
        })

        # Add the structure together with the reactions database
        pathways_database_store = store.PathwaysDatabase()
        pathways_database_store.add_pathway_structure(
            "pathway1", "( (  L A ---B ) , ( --Z ---C D ) )  -E -F",
            reactions_database_store)

        key_reactions = pathways_database_store.get_key_reactions_for_pathway(
            "pathway1")

        self.assertEqual(["L", "A", "---B", "--Z", "D", "-F"], key_reactions)

Пример #3

0

Показать файл

    def test_compute_structured_pathway_abundance_or_coverage_test_coverage_missing_optional_reaction(
            self):
        """
        Check the structured pathway coverage when an optional reaction is missing

        Stores the structure " A -B C " in a PathwaysDatabase, gives "B" a
        score of zero, and expects the coverage to equal the harmonic mean of
        the chi2cdf values of the other two reactions.
        """

        # Register the pathway
        database = store.PathwaysDatabase()
        database.add_pathway_structure("pathway1", " A -B C ")

        reaction_scores = {"A": 1, "B": 0, "C": 3}
        median = 1

        # Compute the coverage from the stored structure and key reactions
        coverage = modules.compute_structured_pathway_abundance_or_coverage(
            database.get_structure_for_pathway("pathway1"),
            database.get_key_reactions_for_pathway("pathway1"),
            reaction_scores, True, median)

        # The expected coverage leaves "B" out of the harmonic mean
        required_chi2cdf_values = [
            chi2cdf.chi2cdf(score, median)
            for reaction, score in reaction_scores.items() if reaction != "B"
        ]
        expected_coverage = modules.harmonic_mean(required_chi2cdf_values)

        self.assertEqual(coverage, expected_coverage)

Пример #4

0

Показать файл

    def test_compute_structured_pathway_abundance_or_coverage_test_coverage_missing_required_reaction(
            self):
        """
        Check the structured pathway coverage when a required reaction is missing

        Stores the structure " A B C " in a PathwaysDatabase, gives "B" a
        score of zero, and expects the resulting coverage to be zero.
        """

        # Register the pathway and read back what the database stored for it
        database = store.PathwaysDatabase()
        database.add_pathway_structure("pathway1", " A B C ")
        pathway_structure = database.get_structure_for_pathway("pathway1")
        pathway_key_reactions = database.get_key_reactions_for_pathway(
            "pathway1")

        scores = {"A": 1, "B": 0, "C": 3}
        median = 1

        # Compute the coverage
        coverage = modules.compute_structured_pathway_abundance_or_coverage(
            pathway_structure, pathway_key_reactions, scores, True, median)

        # A required reaction with no score is expected to zero the coverage
        self.assertEqual(coverage, 0)

Пример #5

0

Показать файл

    def test_compute_structured_pathway_abundance_or_coverage_test_abundance(
            self):
        """
        Check the structured pathway abundance for a simple structure

        Stores the structure " A B C " in a PathwaysDatabase, reads back the
        structure and key reactions, and expects the abundance to equal the
        harmonic mean of the three reaction scores.
        """

        # Register the pathway and read back what the database stored for it
        database = store.PathwaysDatabase()
        database.add_pathway_structure("pathway1", " A B C ")
        pathway_structure = database.get_structure_for_pathway("pathway1")
        pathway_key_reactions = database.get_key_reactions_for_pathway(
            "pathway1")

        scores = {"A": 1, "B": 2, "C": 3}

        # Compute the abundance
        abundance = modules.compute_structured_pathway_abundance_or_coverage(
            pathway_structure, pathway_key_reactions, scores, False, 0)

        # Harmonic mean: number of scores over the sum of their reciprocals
        reciprocal_total = sum(1.0 / score for score in scores.values())
        expected_abundance = len(scores) / reciprocal_total

        self.assertEqual(abundance, expected_abundance)

Пример #6

0

Показать файл

    def test_compute_structured_pathway_abundance_or_coverage_test_abundance_missing_required_reaction(
            self):
        """
        Check the structured pathway abundance when a required reaction is missing

        Stores the structure " A B C " in a PathwaysDatabase, gives "B" a
        score of zero, and expects the resulting abundance to be zero.
        """

        # Register the pathway and read back what the database stored for it
        database = store.PathwaysDatabase()
        database.add_pathway_structure("pathway1", " A B C ")
        pathway_structure = database.get_structure_for_pathway("pathway1")
        pathway_key_reactions = database.get_key_reactions_for_pathway(
            "pathway1")

        scores = {"A": 1, "B": 0, "C": 3}

        # Compute the abundance
        abundance = modules.compute_structured_pathway_abundance_or_coverage(
            pathway_structure, pathway_key_reactions, scores, False, 0)

        # A required reaction with no score is expected to zero the abundance
        self.assertEqual(abundance, 0)

Пример #7

0

Показать файл

    def test_compute_pathways_coverage_structured(self):
        """
        Test the compute_pathways_coverage function
        Test PathwaysDatabase add
        Test PathwaysAndReactions store
        Test Pathways store
        Test with structured pathways

        The global xipe toggle is switched off for the computation and put
        back to its previous value afterwards so other tests are unaffected.
        """

        # Create the database structure
        pathways_database_store = store.PathwaysDatabase()
        pathways_database_store.add_pathway_structure("pathway1", " A B C D ")
        pathways_database_store.add_pathway_structure("pathway2",
                                                      " A B C D E F ")

        # Have all test data be from the same bug
        bug = "bug"
        pathways_and_reactions_store = store.PathwaysAndReactions()
        # Just a note that a value of 1 has a chi2cdf value of 0
        # Also values ~10 or less have small chi2cdf values
        for reaction, pathway, score in [
            ("A", "pathway1", 11),
            ("B", "pathway1", 12),
            ("C", "pathway1", 13),
            ("D", "pathway1", 14),
            ("A", "pathway2", 19),
            ("B", "pathway2", 20),
            ("C", "pathway2", 30),
            ("D", "pathway2", 40),
            ("E", "pathway2", 50),
            ("F", "pathway2", 60),
        ]:
            pathways_and_reactions_store.add(bug, reaction, pathway, score)

        # The median is the median of all of the reactions of all of the pathways for this bug
        median_score_value = 19.5

        def expected_coverage(boosted_values):
            # Harmonic mean of the chi2cdf values of the boosted scores
            return len(boosted_values) / sum(
                1.0 / chi2cdf.chi2cdf(value, median_score_value)
                for value in boosted_values)

        # The expected values use the pathway scores with the lowest one boosted
        coverage_pathway1 = expected_coverage([12, 12, 13, 14])
        coverage_pathway2 = expected_coverage([20, 20, 30, 40, 50, 60])

        # Find the actual result with xipe off, restoring the toggle even if
        # the computation raises
        xipe_toggle_original = config.xipe_toggle
        config.xipe_toggle = "off"
        try:
            pathways_coverage_store_result = modules.compute_pathways_coverage(
                pathways_and_reactions_store, pathways_database_store)
        finally:
            config.xipe_toggle = xipe_toggle_original

        # Test the pathways coverage match those expected
        self.assertEqual(
            pathways_coverage_store_result.get_score_for_bug(bug, "pathway1"),
            coverage_pathway1)
        self.assertEqual(
            pathways_coverage_store_result.get_score_for_bug(bug, "pathway2"),
            coverage_pathway2)

Пример #8

0

Показать файл

    def test_pathways_abundance_with_names(self):
        """
        Test the pathways abundance computation (xipe and minpath are off)
        Test the pathways print function
        Test the pathways mapping to names
        Test the unmapped and unintegrated values are printed

        The xipe and minpath toggles are restored and the temp output files
        are removed even when the computation or the comparison fails.
        """

        # update the max decimals to allow for rounding
        config.output_max_decimals = 7

        # Load in the pathways databases
        reactions_database = store.ReactionsDatabase(
            config.pathways_database_part1)
        pathways_database = store.PathwaysDatabase(
            config.pathways_database_part2, reactions_database)

        # Load in the gene scores from the file
        # This file has the gene names included
        gene_scores = store.GeneScores()
        gene_scores.add_from_file(
            cfg.larger_gene_families_uniref50_with_names_file)

        # Remember the toggles so they can be reset whatever happens below
        minpath_toggle_original = config.minpath_toggle
        xipe_toggle_original = config.xipe_toggle

        # Bound up front so the cleanup in the finally block can always run
        abundance_file = coverage_file = None
        try:
            # Turn off xipe and minpath
            config.minpath_toggle = "off"
            config.xipe_toggle = "off"

            pathways_and_reactions_store = modules.identify_reactions_and_pathways(
                gene_scores, reactions_database, pathways_database)

            # set the locations to write as temp files
            file_out, abundance_file = tempfile.mkstemp()
            os.close(file_out)
            config.pathabundance_file = abundance_file

            file_out, coverage_file = tempfile.mkstemp()
            os.close(file_out)
            config.pathcoverage_file = coverage_file

            unaligned_reads_count = 10
            abundance_file, coverage_file = modules.compute_pathways_abundance_and_coverage(
                gene_scores, reactions_database, pathways_and_reactions_store,
                pathways_database, unaligned_reads_count)

            # check the output is as expected
            self.assertTrue(
                filecmp.cmp(abundance_file,
                            cfg.demo_pathabundance_file,
                            shallow=False))
        finally:
            # Reset xipe and minpath
            config.minpath_toggle = minpath_toggle_original
            config.xipe_toggle = xipe_toggle_original

            # Remove whichever temp files were created before a failure
            for temp_file in (abundance_file, coverage_file):
                if temp_file is not None:
                    utils.remove_temp_file(temp_file)
Пример #9

0

Показать файл

Файл: advanced_tests_store.py Проект: parthoBTK/humann

    def test_PathwaysDatabase_is_structured_structure(self):
        """
        Pathways database class: Check a pathways file is reported as structured

        Loads cfg.pathways_file into a PathwaysDatabase and expects
        is_structured() to be true.
        """

        database = store.PathwaysDatabase(cfg.pathways_file)
        structured = database.is_structured()

        self.assertTrue(structured)

Пример #10

0

Показать файл

    def test_compute_pathways_coverage_unstructured(self):
        """
        Test the compute_pathways_coverage function
        Test PathwaysDatabase add
        Test PathwaysAndReactions store
        Test Pathways store
        Test with unstructured pathways

        The global xipe toggle is switched off for the computation and put
        back to its previous value afterwards so other tests are unaffected.
        """

        # Create the database structure
        pathways_database_store = store.PathwaysDatabase()
        pathways_database_store.add_pathway("pathway1", ["A", "B", "C", "D"])
        pathways_database_store.add_pathway("pathway2",
                                            ["A", "B", "C", "D", "E", "F"])

        # Have all test data be from the same bug
        bug = "bug"
        pathways_and_reactions_store = store.PathwaysAndReactions()
        pathways_and_reactions_store.add(bug, "A", "pathway1", 1)
        # Note B is not recorded for pathway1 which will result in a zero value
        pathways_and_reactions_store.add(bug, "C", "pathway1", 3)
        pathways_and_reactions_store.add(bug, "D", "pathway1", 40)
        pathways_and_reactions_store.add(bug, "A", "pathway2", 10)
        pathways_and_reactions_store.add(bug, "B", "pathway2", 20)
        # Note C is not recorded for pathway2 which will result in a zero value
        pathways_and_reactions_store.add(bug, "D", "pathway2", 40)
        pathways_and_reactions_store.add(bug, "E", "pathway2", 50)
        # Note F is not recorded for pathway2 which will result in a zero value

        # The coverage for each pathway is the number of reactions greater than the median
        # divided by the total reactions in the pathway
        # The median for this set is 20
        pathway1_values = [0, 1, 3, 40]
        count_greater_than_median = 1
        pathway1_coverage = count_greater_than_median / float(
            len(pathway1_values))

        pathway2_values = [0, 0, 10, 20, 40, 50]
        count_greater_than_median = 2
        pathway2_coverage = count_greater_than_median / float(
            len(pathway2_values))

        # Find the actual result with xipe off, restoring the toggle even if
        # the computation raises
        xipe_toggle_original = config.xipe_toggle
        config.xipe_toggle = "off"
        try:
            pathways_coverage_store_result = modules.compute_pathways_coverage(
                pathways_and_reactions_store, pathways_database_store)
        finally:
            config.xipe_toggle = xipe_toggle_original

        # Test the pathways coverage match those expected
        self.assertEqual(
            pathways_coverage_store_result.get_score_for_bug(bug, "pathway1"),
            pathway1_coverage)
        self.assertEqual(
            pathways_coverage_store_result.get_score_for_bug(bug, "pathway2"),
            pathway2_coverage)

Пример #11

0

Показать файл

    def test_compute_pathways_abundance_unstructured(self):
        """
        Test the compute_pathways_abundance function
        Test PathwaysDatabase add
        Test PathwaysAndReactions store
        Test Pathways store
        Test with unstructured pathways
        """

        # Create the database structure
        pathways_database_store = store.PathwaysDatabase()
        pathways_database_store.add_pathway("pathway1", ["A", "B", "C", "D"])
        pathways_database_store.add_pathway("pathway2",
                                            ["A", "B", "C", "D", "E", "F"])

        # Have all test data be from the same bug
        bug = "bug"
        pathways_and_reactions_store = store.PathwaysAndReactions()
        pathways_and_reactions_store.add(bug, "A", "pathway1", 1)
        # Note B is not recorded for pathway1 which will result in a zero value
        pathways_and_reactions_store.add(bug, "C", "pathway1", 3)
        pathways_and_reactions_store.add(bug, "D", "pathway1", 4)
        pathways_and_reactions_store.add(bug, "A", "pathway2", 10)
        pathways_and_reactions_store.add(bug, "B", "pathway2", 20)
        # Note C is not recorded for pathway2 which will result in a zero value
        pathways_and_reactions_store.add(bug, "D", "pathway2", 40)
        pathways_and_reactions_store.add(bug, "E", "pathway2", 50)
        # Note F is not recorded for pathway2 which will result in a zero value

        # The abundance for each pathway is the average of the largest half of the reaction values
        # For unstructured pathways, if the reaction is not included it does not result in a zero abundance
        # The divisions are made explicitly float (and the slice index explicitly
        # integer) so the expected means are not truncated by integer division
        pathway1_values = [0, 1, 3, 4]
        pathway1_abundance_set = pathway1_values[len(pathway1_values) // 2:]
        pathway1_abundance = sum(pathway1_abundance_set) / float(
            len(pathway1_abundance_set))

        pathway2_values = [0, 0, 10, 20, 40, 50]
        pathway2_abundance_set = pathway2_values[len(pathway2_values) // 2:]
        pathway2_abundance = sum(pathway2_abundance_set) / float(
            len(pathway2_abundance_set))

        # Only the abundance store is checked here; the second returned value is not needed
        pathways_abundance_store_result, _ = modules.compute_pathways_abundance(
            pathways_and_reactions_store, pathways_database_store)

        # Test the pathways abundance match those expected
        self.assertEqual(
            pathways_abundance_store_result.get_score_for_bug(bug, "pathway1"),
            pathway1_abundance)
        self.assertEqual(
            pathways_abundance_store_result.get_score_for_bug(bug, "pathway2"),
            pathway2_abundance)

Пример #12

0

Показать файл

    def test_compute_pathways_abundance_structured(self):
        """
        Check compute_pathways_abundance with structured pathways

        Builds a PathwaysDatabase with two structured pathways and a
        PathwaysAndReactions store holding scores for a single bug, then
        compares the score stored for each pathway with the harmonic mean of
        its scores after the lowest one has been boosted.
        """

        # Create the database structure
        database = store.PathwaysDatabase()
        database.add_pathway_structure("pathway1", " A B C D ")
        database.add_pathway_structure("pathway2", " A B C D E F ")

        # Have all test data be from the same bug
        bug = "bug"
        reaction_store = store.PathwaysAndReactions()
        # Just a note that a value of 1 has a chi2cdf value of 0
        # Also values ~10 or less have small chi2cdf values
        for reaction, pathway, score in [
            ("A", "pathway1", 11),
            ("B", "pathway1", 12),
            ("C", "pathway1", 13),
            ("D", "pathway1", 14),
            ("A", "pathway2", 19),
            ("B", "pathway2", 20),
            ("C", "pathway2", 30),
            ("D", "pathway2", 40),
            ("E", "pathway2", 50),
            ("F", "pathway2", 60),
        ]:
            reaction_store.add(bug, reaction, pathway, score)

        # The abundance for each pathway is the harmonic mean of the values
        # with the lowest value in the pathway boosted
        boosted_pathway1 = [12, 12, 13, 14]
        reciprocals_pathway1 = sum(1.0 / value for value in boosted_pathway1)
        expected_pathway1 = len(boosted_pathway1) / reciprocals_pathway1

        boosted_pathway2 = [20, 20, 30, 40, 50, 60]
        reciprocals_pathway2 = sum(1.0 / value for value in boosted_pathway2)
        expected_pathway2 = len(boosted_pathway2) / reciprocals_pathway2

        # Find the actual result; only the abundance store is checked
        abundance_store, _ = modules.compute_pathways_abundance(
            reaction_store, database)

        # Test the pathways abundance match those expected
        self.assertEqual(abundance_store.get_score_for_bug(bug, "pathway1"),
                         expected_pathway1)
        self.assertEqual(abundance_store.get_score_for_bug(bug, "pathway2"),
                         expected_pathway2)

Пример #13

0

Показать файл

    def test_compute_pathways_abundance_unstructured_reactions_list(self):
        """
        Check the reactions list returned by compute_pathways_abundance

        Builds a PathwaysDatabase with three unstructured pathways and a
        PathwaysAndReactions store holding scores for two bugs, then compares
        the reactions reported for each bug (second returned value) with the
        reactions that were added for that bug, ignoring order.
        """

        # Create the database structure
        database = store.PathwaysDatabase()
        database.add_pathway("pathway1", ["A", "B", "C", "D"])
        database.add_pathway("pathway2", ["A", "B", "C", "D", "E", "F"])
        database.add_pathway("pathway3", ["A", "B", "G"])

        # Have all test data be from two bugs
        reaction_store = store.PathwaysAndReactions()
        for bug, reaction, pathway, score in [
            ("bug1", "A", "pathway1", 1),
            ("bug1", "B", "pathway1", 2),
            ("bug1", "B", "pathway2", 2),
            ("bug1", "C", "pathway2", 3),
            ("bug2", "A", "pathway1", 1),
            ("bug2", "D", "pathway1", 3),
            ("bug2", "B", "pathway1", 2),
            ("bug2", "B", "pathway2", 2),
        ]:
            reaction_store.add(bug, reaction, pathway, score)

        expected_reactions = {
            "bug1": ["A", "B", "C"],
            "bug2": ["A", "B", "D"],
        }

        # Only the reactions list is checked; the abundance store is not needed
        _, reactions_present = modules.compute_pathways_abundance(
            reaction_store, database)

        # Test the reactions match those expected
        for bug in ("bug1", "bug2"):
            self.assertEqual(sorted(expected_reactions[bug]),
                             sorted(reactions_present[bug]))

Пример #14

0

Показать файл

def main():

    args = parse_arguments(sys)

    # read in the gene families to reactions database
    print("Reading gene families to reactions database")
    reactions_database = store.ReactionsDatabase(
        config.pathways_database_part1)

    # read in the reactions to pathways database
    print("Reading reactions to pathways database")
    pathways_database = store.PathwaysDatabase(config.pathways_database_part2,
                                               reactions_database)

    # read in the species and pathways selected
    print("Finding gene families for each species for the pathways selected")
    genefamilies = {}
    reactions = {}
    for line in open(args.input_selection):
        # remove starting and ending spaces
        line = line.strip()
        if not line.startswith("#"):
            try:
                species, pathway = line.split("\t")
            except IndexError:
                print("Warning: Skipping selection line because of format: " +
                      line)
                continue
            if not species in genefamilies:
                genefamilies[species] = set()
                reactions[species] = set()
            # get the reactions and then gene families for the pathway
            if pathway.upper() == "ALL":
                genefamilies[species] = "ALL"
                reactions[species] = "ALL"
            else:
                for reaction in pathways_database.find_reactions(pathway):
                    reactions[species].add(reaction)
                    genefamilies[species].update(
                        list(
                            filter(lambda x: x.startswith(args.gene_families),
                                   reactions_database.find_genes(reaction))))

    # get the set of gene families in any pathway
    all_genefamilies_in_pathways = set()
    for reaction in pathways_database.reaction_list():
        all_genefamilies_in_pathways.update(
            list(
                filter(lambda x: x.startswith(args.gene_families),
                       reactions_database.find_genes(reaction))))

    # open the output file
    try:
        file_handle = open(args.output, "w")
    except EnvironmentError:
        sys.exit("ERROR: Unable to open output file: " + args.output)

    # find the reads for the species and pathways selected
    print("Reading sam file")
    selected_reads = set()
    reads_to_gene_familes = {}
    reaction_totals = {}

    # open a fasta file to write the species specific sequences for input to metaphlan
    if args.add_markers:
        species_fasta_file = args.output + ".species_specific_reads.fasta"
        species_fasta_file_handle = open(species_fasta_file, "w")

    selected_reads_to_species = {}
    for species, gene_family, read_name, sequence, quality_scores in read_sam(
            args.input_sam, args.gene_families):
        # record the reads that align to species/gene families requested
        requested_genes_for_species = genefamilies.get(species, [])
        if (gene_family in requested_genes_for_species) or (
                "ALL" in requested_genes_for_species) or (
                    args.add_unintegrated
                    and not (gene_family in all_genefamilies_in_pathways)):
            selected_reads.add(read_name)
            selected_reads_to_species[species] = selected_reads_to_species.get(
                species, 0) + 1
            if not read_name in reads_to_gene_familes:
                reads_to_gene_familes[read_name] = set()

            reads_to_gene_familes[read_name].add(gene_family)

            # add to the reaction totals count
            for reaction in reactions_database.find_reactions(gene_family):
                reaction_totals[reaction] = reaction_totals.get(reaction,
                                                                0) + 1.0

        # if this is a species in the list, and we are adding markers, write this to the fasta file for input to metaphlan
        if args.add_markers and species in genefamilies.keys():
            write_sequence(species_fasta_file_handle, read_name, sequence,
                           quality_scores, "fasta")

    print("Total reads per species based on gene families")
    for species, count in selected_reads_to_species.items():
        print(species + "\t" + str(count))

    # close the species fasta file handle
    if args.add_markers:
        species_fasta_file_handle.close()

    print("Total reads found: " + str(len(selected_reads)))

    # get the markers reads from the species fasta file
    marker_reads_to_add = set()
    read_to_species_marker = {}
    read_to_marker_name = {}
    all_reads_mapping_to_markers = set()
    if args.add_markers:
        print("Finding reads mapped to markers with MetaPhlAn")

        print("Running MetaPhlAn")
        metaphlan_marker_file = species_fasta_file + ".marker_alignments.tsv"
        metaphlan_bowtie2_file = species_fasta_file + ".bowtie2.tsv"

        try:
            # remove the bowtie2 output file if it exists to prevent metaphlan error
            os.remove(metaphlan_bowtie2_file)
        except EnvironmentError:
            pass

        output = subprocess.check_output([
            "metaphlan", "--input_type", "fasta", species_fasta_file, "-t",
            "reads_map", "-o", metaphlan_marker_file, "--bowtie2out",
            metaphlan_bowtie2_file
        ])

        # read through the file to identify the maker reads to add
        for line in open(metaphlan_marker_file):
            if not line.startswith("#"):
                read_name, taxon = line.rstrip().split("\t")
                species = taxon.split("|")[-1]
                if species in genefamilies.keys():
                    marker_reads_to_add.add(read_name)
                    if not species in read_to_species_marker:
                        read_to_species_marker[species] = set()

                    read_to_species_marker[species].add(read_name)
                    all_reads_mapping_to_markers.add(read_name)

        # read through the file to identify the name of the marker the reads map to
        for line in open(metaphlan_bowtie2_file):
            try:
                read_name, marker_name = line.rstrip().split("\t")
            except IndexError:
                continue

            if read_name in all_reads_mapping_to_markers:
                read_to_marker_name[read_name] = marker_name

        print("Found a total of " + str(len(marker_reads_to_add)) +
              " reads to species markers")

    if args.percent < 100:
        print("Filtering reads by percent requested: " + str(args.percent))
        filtered_reads = set()
        current_reaction_totals = {
            reaction: 0
            for reaction in reaction_totals.keys()
        }
        # go through the reads, adding until there is enough in the reaction list
        # to hit the percent requested
        for read_name in selected_reads:
            # check if this read is needed into increase the reaction percents
            add_read = False
            for gene in reads_to_gene_familes[read_name]:
                for reaction in reactions_database.find_reactions(gene):
                    if (current_reaction_totals[reaction] /
                            reaction_totals[reaction]) * 100 < args.percent:
                        add_read = True

            if add_read:
                filtered_reads.add(read_name)
                # update the current reaction counts
                for gene in reads_to_gene_familes[read_name]:
                    for reaction in reactions_database.find_reactions(gene):
                        current_reaction_totals[reaction] += 1

        # update the selected reads to those that are filtered
        selected_reads = filtered_reads
        print("Total reads after filtering: " + str(len(selected_reads)))

    # check to make sure at least 200 marker reads (or command line setting) for each species are present
    # make sure there reads are spread over the markers for the sample
    if args.add_markers:
        # for each species, check the number of maker reads present
        print(
            "Counting markers in set to determine if reads need to be added to meet min markers"
        )
        for species, reads_for_species in read_to_species_marker.items():
            # count how many of the reads are in the selected reads list
            overlap = selected_reads.intersection(reads_for_species)
            total_overlap = len(list(overlap))
            if total_overlap < args.min_markers:
                # add in more reads to hit the min markers setting for this species
                max_reads_to_add = list(
                    reads_for_species.difference(selected_reads))
                # group the reads based on the species markers
                to_add_by_markers = {}
                for read_name in max_reads_to_add:
                    marker_for_read = read_to_marker_name[read_name]
                    if not marker_for_read in to_add_by_markers:
                        to_add_by_markers[marker_for_read] = set()
                    to_add_by_markers[marker_for_read].add(read_name)

                selected_reads_by_markers = {}
                for read_name in selected_reads:
                    try:
                        marker_for_read = read_to_marker_name[read_name]
                    except KeyError:
                        # ignore reads that do not map to markers
                        continue
                    if not marker_for_read in selected_reads_by_markers:
                        selected_reads_by_markers[marker_for_read] = set()
                    selected_reads_by_markers[marker_for_read].add(read_name)

                # count the total reads to add
                total_to_add = args.min_markers - total_overlap
                total_added = 0

                # get a list of all of the markers for the species
                all_species_markers = set(to_add_by_markers.keys())
                all_species_markers.update(selected_reads_by_markers.keys())

                # start with the markers not already included, adding one read to each
                for marker_name in all_species_markers.difference(
                        selected_reads_by_markers.keys()):
                    total_markers_in_set = len(
                        list(to_add_by_markers[marker_name]))
                    end_index = args.min_reads_per_marker if total_markers_in_set >= args.min_reads_per_marker else total_markers_in_set

                    # add at most min reads per markers for this marker set to the set of selected reads
                    to_add = list(to_add_by_markers[marker_name])[:end_index]
                    selected_reads.update(to_add)
                    if not marker_name in selected_reads_by_markers:
                        selected_reads_by_markers[marker_name] = set()
                    selected_reads_by_markers[marker_name].update(to_add)

                    # remove the added reads from the set of reads to add
                    to_add_by_markers[marker_name] = to_add_by_markers[
                        marker_name].difference(to_add)

                    # decrease the total amount to add
                    total_added += end_index

                    print("Adding " + str(end_index) +
                          " total reads to fill empty marker for species " +
                          species)

                # next add more reads for those sets that already have markers, starting with the smallest
                # set of markers to the largest
                marker_counts = {
                    marker: len(list(reads))
                    for marker, reads in selected_reads_by_markers.items()
                }
                for marker_name in sorted(marker_counts,
                                          key=marker_counts.get):
                    try:
                        total_markers_in_set = len(
                            list(to_add_by_markers[marker_name]))
                    except KeyError:
                        # ignore errors for markers that are not included in the to add list (only in the selected reads)
                        continue

                    end_index = args.min_reads_per_marker if total_markers_in_set >= args.min_reads_per_marker else total_markers_in_set

                    # add only so many reads to get to min reads per markers for this set
                    end_index = end_index - len(
                        list(selected_reads_by_markers[marker_name]))

                    # add at most min reads per marker reads for this marker set to the set of selected reads
                    if end_index > 0:
                        to_add = list(
                            to_add_by_markers[marker_name])[:end_index]
                        selected_reads.update(to_add)
                        selected_reads_by_markers[marker_name].update(to_add)

                        # remove the added reads from the set of reads to add
                        to_add_by_markers[marker_name] = to_add_by_markers[
                            marker_name].difference(to_add)

                        total_added += end_index
                        print(
                            "Adding " + str(end_index) +
                            " total reads to fill slightly full marker for species "
                            + species)

                # add more reads to get to the min reads added value
                total_to_add = total_to_add - total_added if total_added < total_to_add else 0
                try:
                    selected_reads.update(max_reads_to_add[:total_to_add])
                    total_added += total_to_add
                    print("Added " + str(total_to_add) +
                          " reads for species " + species +
                          " to meet min markers")
                except (TypeError, IndexError):
                    continue

            print("Total reads added for species " + species + " :" +
                  str(total_added))
        print("Total reads after counting markers: " +
              str(len(selected_reads)))

    # determine how many trimmable reads to write
    total_trimmable = 0
    trimmable = []
    if args.add_trimmable:
        total_trimmable = int(random.uniform(50, 100))
        print("Adding " + str(total_trimmable) + " total trimmable reads")

    print("Writing output file")
    for species, gene_family, read_name, sequence, quality_scores in read_sam(
            args.input_sam, args.gene_families):
        # write the read sequences requested once
        if read_name in selected_reads:
            write_sequence(file_handle, read_name, sequence, quality_scores,
                           args.output_format)
            selected_reads.remove(read_name)
            if len(trimmable) < total_trimmable:
                trimmable.append([read_name, sequence, quality_scores])

    # write the trimmable reads
    for read_name, sequence, quality_scores in trimmable:
        new_length = int(len(sequence) / 2.0)
        write_sequence(file_handle, "trimmable_" + read_name,
                       sequence[0:new_length], quality_scores[0:new_length],
                       args.output_format)

    print("Output file written: " + args.output)

Пример #15

0

Показать файл

    def test_compute_pathways_abundance_structured_reactions_list(self):
        """
        Check the reactions reported per bug by compute_pathways_abundance

        A PathwaysDatabase is loaded with three structured pathways and a
        PathwaysAndReactions store is filled with scores for three bugs.
        The second value returned by compute_pathways_abundance (the reactions
        in the pathways found present, keyed by bug) is then compared,
        order-insensitively, with the expected reactions for each bug.
        """

        # Register the structured pathways in the database
        pathways_database_store = store.PathwaysDatabase()
        for pathway_name, pathway_structure in (
            ("pathway1", " A B C D "),
            ("pathway2", " A B C D E F "),
            ("pathway3", " A B G"),
        ):
            pathways_database_store.add_pathway_structure(
                pathway_name, pathway_structure)

        # Rows of (bug, reaction, pathway, score), listed in insertion order
        reaction_rows = (
            ("bug1", "A", "pathway1", 1),
            ("bug1", "B", "pathway1", 2),
            ("bug1", "C", "pathway1", 3),
            ("bug2", "A", "pathway1", 1),
            ("bug2", "B", "pathway1", 3),
            ("bug2", "D", "pathway2", 2),
            ("bug2", "B", "pathway2", 2),
            ("bug3", "A", "pathway3", 1),
            ("bug3", "B", "pathway3", 3),
            ("bug3", "G", "pathway3", 2),
            ("bug3", "B", "pathway2", 2),
        )
        pathways_and_reactions_store = store.PathwaysAndReactions()
        for bug, reaction, pathway, score in reaction_rows:
            pathways_and_reactions_store.add(bug, reaction, pathway, score)

        expected_reactions_in_pathways_present = {
            # pathway1 is expected to be present because D is gap filled;
            # D itself has no abundance so it is not expected in the list
            "bug1": ["A", "B", "C"],
            # the pathways for this bug are expected to be missing too many
            # reactions to have abundance
            "bug2": [],
            # pathway3 has scores for all of its reactions for this bug
            "bug3": ["A", "B", "G"],
        }

        # Only the second returned value is checked by this test
        _abundance_store, reactions_in_pathways_present = (
            modules.compute_pathways_abundance(pathways_and_reactions_store,
                                               pathways_database_store))

        # Compare the reactions for each bug, ignoring ordering
        for bug in ("bug1", "bug2", "bug3"):
            expected_reactions = sorted(
                expected_reactions_in_pathways_present[bug])
            observed_reactions = sorted(reactions_in_pathways_present[bug])
            self.assertEqual(expected_reactions, observed_reactions)

Python PathwaysDatabase примеры использования