示例#1
0
    def test_parse_output_file(self):
        """Parse space-delimited, tab-delimited and empty FastANI output files."""
        parser = FastANI(self.cpus, force_single=True)

        # Each row is one line of the output file. The expected values below
        # pair the third column (as a float) with the ratio of the last two
        # columns (e.g. 5/10 -> 0.5, 555/1111 -> 0.5).
        rows = [['q1', 'r1', '83.1234', '5', '10'],
                ['q1', 'r2', '99.1111', '3', '10'],
                ['q2', 'r1', '1.12', '555', '1111']]
        expected = {'q1': {'r1': (83.1234, 0.5), 'r2': (99.1111, 0.3)},
                    'q2': {'r1': (1.12, 0.5)}}

        # The same rows must parse identically whichever delimiter is used.
        for file_name, delimiter in (('f1.txt', ' '), ('f2.txt', '\t')):
            path = os.path.join(self.dir_tmp, file_name)
            with open(path, 'w') as fh:
                fh.writelines(delimiter.join(row) + '\n' for row in rows)
            self.assertEqual(parser.parse_output_file(path), expected)

        # An empty output file must produce an empty result.
        path_empty = os.path.join(self.dir_tmp, 'f3.txt')
        with open(path_empty, 'w'):
            pass
        self.assertEqual(parser.parse_output_file(path_empty), {})
示例#2
0
    def run(self, genomes, no_mash, max_d, out_dir, prefix, mash_k, mash_v,
            mash_s, min_af):
        """Select reference candidates for each query genome, then run FastANI.

        When ``no_mash`` is true, every query genome is compared against every
        reference genome. Otherwise Mash is run first and each query is only
        compared against the references Mash returned for it. The FastANI
        results are handed to ``ANISummaryFile`` and ``ANIClosestFile``.
        """
        self.check_dependencies(no_mash)

        references = self._get_ref_genomes()
        # Single id -> path lookup covering both queries and references.
        paths = {**genomes, **references}
        comparisons = defaultdict(set)

        if no_mash:
            # No pre-filter: each query gets its own copy of all reference ids.
            for gid in genomes:
                comparisons[gid] = set(references.keys())
        else:
            # Pre-filter: keep only the references Mash returned per query.
            mash_dir = os.path.join(out_dir, DIR_ANI_REP_INT_MASH)
            mash = Mash(self.cpus, mash_dir, prefix)
            self.logger.info(f'Using Mash version {mash.version()}')
            hits_by_query = mash.run(genomes, references, max_d, mash_k,
                                     mash_v, mash_s)
            for gid, hits in hits_by_query.items():
                comparisons[gid].update(hits.keys())

        self.logger.info('Calculating ANI with FastANI.')
        ani_runner = FastANI(self.cpus, force_single=True)
        ani_results = ani_runner.run(comparisons, paths)

        ANISummaryFile(out_dir, prefix, ani_results)
        ANIClosestFile(out_dir, prefix, ani_results, genomes, min_af)
示例#3
0
    def test_run(self):
        """Run FastANI on known genome pairs and check the ANI/AF values (version dependent)."""
        runner = FastANI(self.cpus, force_single=True)

        # Taxonomy of the genomes used below:
        # a = d__Archaea; p__Altiarchaeota; c__Altiarchaeia; o__Altiarchaeales; f__Altiarchaeaceae; g__Altiarchaeum; s__Altiarchaeum sp001873845
        # b = d__Archaea; p__Altiarchaeota; c__Altiarchaeia; o__Altiarchaeales; f__Altiarchaeaceae; g__Altiarchaeum; s__Altiarchaeum sp002083985
        # c = d__Bacteria; p__Aquificota; c__Desulfurobacteriia; o__Desulfurobacteriales; f__Desulfurobacteriaceae; g__Thermovibrio; s__Thermovibrio ammonificans
        # x = d__Archaea; p__Altiarchaeota; c__Altiarchaeia; o__GCA-002841105; f__GCA-002841105; g__GCA-002841105; s__GCA-002841105 sp002841105
        # y = d__Bacteria; p__Aerophobota; c__Aerophobia; o__Aerophobales; f__Aerophobaceae; g__Aerophobus; s__Aerophobus profundus
        # z = d__Archaea; p__Halobacterota; c__Archaeoglobi; o__JdFR-21; f__JdFR-21; g__JdFR-21; s__JdFR-21 sp002011165

        # Query id -> set of reference ids to compare it against.
        d_compare = {'a': {'x', 'y'},
                     'b': {'x'},
                     'c': {'z'}}

        # Genome id -> (directory, file name), both relative to self.genome_root.
        genome_files = {
            'a': ('GCA/001/873/845', 'GCA_001873845.1_genomic.fna.gz'),
            'b': ('GCA/002/083/985', 'GCA_002083985.1_genomic.fna.gz'),
            'c': ('GCF/000/185/805', 'GCF_000185805.1_genomic.fna.gz'),
            'x': ('GCA/002/841/105', 'GCA_002841105.1_genomic.fna.gz'),
            'y': ('GCA/000/402/295', 'GCA_000402295.1_genomic.fna.gz'),
            'z': ('GCA/002/011/165', 'GCA_002011165.1_genomic.fna.gz'),
        }
        d_paths = {gid: os.path.join(self.genome_root, sub_dir, file_name)
                   for gid, (sub_dir, file_name) in genome_files.items()}

        observed = runner.run(d_compare, d_paths)

        expected = {
            'a': {'x': {'ani': 82.5201, 'af': 0.57},
                  'y': {'ani': 74.5154, 'af': 0.0}},
            'b': {'x': {'ani': 84.5846, 'af': 0.55}},
            'c': {'z': {'ani': 74.4978, 'af': 0.01}},
        }

        # Compare the key-sorted JSON serialisations of both structures.
        observed_json = json.dumps(observed, sort_keys=True)
        expected_json = json.dumps(expected, sort_keys=True)
        self.assertEqual(observed_json, expected_json)
示例#4
0
文件: ani_rep.py 项目: fplaza/GTDBTk
    def run(self, genomes, no_mash, max_d, out_dir, prefix, mash_k, mash_v,
            mash_s, min_af, mash_db):
        """Select reference candidates for each query genome, then run FastANI.

        Parameters
        ----------
        genomes : dict[str, str]
            Dict[genome_id] = fasta_path
        no_mash : bool
            True to skip the Mash pre-filter and compare every query genome
            against every reference genome; False to pre-filter with Mash.
        max_d : float
            Maximum Mash distance to keep [0-1].
        out_dir : str
            The directory to write the output files to.
        prefix : str
            The prefix to use when writing output files.
        mash_k : int
            Mash k-mer size [1-32].
        mash_v : float
            Maximum Mash p-value to keep [0-1].
        mash_s : int
            Maximum number of non-redundant hashes.
        min_af : float
            Alignment fraction to consider closest genome.
        mash_db : Optional[str]
            The path to read/write the pre-computed Mash reference sketch database.
        """
        self.check_dependencies(no_mash)

        self.logger.info('Loading reference genomes.')
        references = self._get_ref_genomes()
        # Single id -> path lookup covering both queries and references.
        paths = {**genomes, **references}
        comparisons = defaultdict(set)

        if no_mash:
            # No pre-filter: each query gets its own copy of all reference ids.
            for gid in genomes:
                comparisons[gid] = set(references.keys())
        else:
            # Pre-filter: keep only the references Mash returned per query.
            mash_dir = os.path.join(out_dir, DIR_ANI_REP_INT_MASH)
            mash = Mash(self.cpus, mash_dir, prefix)
            self.logger.info(f'Using Mash version {mash.version()}')
            hits_by_query = mash.run(genomes, references, max_d, mash_k,
                                     mash_v, mash_s, mash_db)
            for gid, hits in hits_by_query.items():
                comparisons[gid].update(hits.keys())

        self.logger.info(
            f'Calculating ANI with FastANI v{FastANI._get_version()}.')
        ani_runner = FastANI(self.cpus, force_single=True)
        ani_results = ani_runner.run(comparisons, paths)

        # The taxonomy is passed to both output writers along with the results.
        taxonomy = Taxonomy().read(TAXONOMY_FILE, canonical_ids=True)
        ANISummaryFile(out_dir, prefix, ani_results, taxonomy)
        ANIClosestFile(out_dir, prefix, ani_results, genomes, min_af,
                       taxonomy)