def test_parse_output_file(self):
    """Check parse_output_file on space-delimited, tab-delimited and empty files.

    The same three records are written once per delimiter and must parse
    to the same nested dict; a file with no content must parse to ``{}``.
    """
    parser = FastANI(self.cpus, force_single=True)

    # Columns as written: query id, reference id, ANI, then two integer
    # counts. The expected second tuple element matches count-1 / count-2
    # (555/1111 is expected back as 0.5, so the parser presumably rounds
    # this fraction -- confirm against parse_output_file).
    rows = [
        ['q1', 'r1', '83.1234', '5', '10'],
        ['q1', 'r2', '99.1111', '3', '10'],
        ['q2', 'r1', '1.12', '555', '1111'],
    ]
    expected = {
        'q1': {'r1': (83.1234, 0.5), 'r2': (99.1111, 0.3)},
        'q2': {'r1': (1.12, 0.5)},
    }

    # Identical content, once space-separated and once tab-separated.
    for file_name, delimiter in (('f1.txt', ' '), ('f2.txt', '\t')):
        path = os.path.join(self.dir_tmp, file_name)
        with open(path, 'w') as fh:
            fh.writelines(delimiter.join(row) + '\n' for row in rows)
        self.assertEqual(parser.parse_output_file(path), expected)

    # An empty output file must give an empty result.
    path_empty = os.path.join(self.dir_tmp, 'f3.txt')
    with open(path_empty, 'w'):
        pass
    self.assertEqual(parser.parse_output_file(path_empty), {})
def run(self, genomes, no_mash, max_d, out_dir, prefix, mash_k, mash_v,
        mash_s, min_af):
    """Run the pipeline: choose comparisons, run FastANI, build the outputs.

    Parameters
    ----------
    genomes : dict
        Query genomes keyed by genome id. Merged with the reference
        genomes into the id -> path lookup handed to FastANI.
    no_mash : bool
        When true, Mash is skipped and every query genome is compared
        against every reference genome. When false, Mash is run first and
        only its hits are compared.
    max_d, mash_k, mash_v, mash_s
        Forwarded unchanged to ``Mash.run`` (only used when Mash is run).
    out_dir : str
        Base output directory; the Mash working directory is created
        beneath it and it is passed on to the output file objects.
    prefix : str
        Passed to Mash and to the output file objects.
    min_af
        Forwarded unchanged to ``ANIClosestFile``.
    """
    self.check_dependencies(no_mash)
    ref_genomes = self._get_ref_genomes()

    # On an id collision the reference entry wins (it is unpacked last).
    d_paths = {**genomes, **ref_genomes}
    d_compare = defaultdict(set)

    if no_mash:
        # No pre-filter: each query gets its own set of all reference ids.
        for qry_gid in genomes:
            d_compare[qry_gid] = set(ref_genomes.keys())
    else:
        # Pre-filter with Mash and keep only the references it reports.
        mash_dir = os.path.join(out_dir, DIR_ANI_REP_INT_MASH)
        mash = Mash(self.cpus, mash_dir, prefix)
        self.logger.info(f'Using Mash version {mash.version()}')
        hits_by_query = mash.run(genomes, ref_genomes, max_d, mash_k,
                                 mash_v, mash_s)
        for qry_gid, ref_hits in hits_by_query.items():
            d_compare[qry_gid].update(ref_hits.keys())

    self.logger.info('Calculating ANI with FastANI.')
    ani_runner = FastANI(self.cpus, force_single=True)
    ani_results = ani_runner.run(d_compare, d_paths)

    # Instantiated for their side effects only (presumably writing the
    # summary / closest-hit files -- confirm in their constructors).
    ANISummaryFile(out_dir, prefix, ani_results)
    ANIClosestFile(out_dir, prefix, ani_results, genomes, min_af)
def test_run(self):
    """Run FastANI on six genomes and compare against pinned values.

    The expected ANI / AF numbers are version dependent.
    """
    # Taxonomy of the genomes used below:
    # a = d__Archaea; p__Altiarchaeota; c__Altiarchaeia; o__Altiarchaeales; f__Altiarchaeaceae; g__Altiarchaeum; s__Altiarchaeum sp001873845
    # b = d__Archaea; p__Altiarchaeota; c__Altiarchaeia; o__Altiarchaeales; f__Altiarchaeaceae; g__Altiarchaeum; s__Altiarchaeum sp002083985
    # c = d__Bacteria; p__Aquificota; c__Desulfurobacteriia; o__Desulfurobacteriales; f__Desulfurobacteriaceae; g__Thermovibrio; s__Thermovibrio ammonificans
    # x = d__Archaea; p__Altiarchaeota; c__Altiarchaeia; o__GCA-002841105; f__GCA-002841105; g__GCA-002841105; s__GCA-002841105 sp002841105
    # y = d__Bacteria; p__Aerophobota; c__Aerophobia; o__Aerophobales; f__Aerophobaceae; g__Aerophobus; s__Aerophobus profundus
    # z = d__Archaea; p__Halobacterota; c__Archaeoglobi; o__JdFR-21; f__JdFR-21; g__JdFR-21; s__JdFR-21 sp002011165

    # (sub-directory, file name) of each genome relative to genome_root.
    genome_files = {
        'a': ('GCA/001/873/845', 'GCA_001873845.1_genomic.fna.gz'),
        'b': ('GCA/002/083/985', 'GCA_002083985.1_genomic.fna.gz'),
        'c': ('GCF/000/185/805', 'GCF_000185805.1_genomic.fna.gz'),
        'x': ('GCA/002/841/105', 'GCA_002841105.1_genomic.fna.gz'),
        'y': ('GCA/000/402/295', 'GCA_000402295.1_genomic.fna.gz'),
        'z': ('GCA/002/011/165', 'GCA_002011165.1_genomic.fna.gz'),
    }
    d_paths = {
        gid: os.path.join(self.genome_root, sub_dir, file_name)
        for gid, (sub_dir, file_name) in genome_files.items()
    }

    # Each key is compared against every id in its value set.
    d_compare = {'a': {'x', 'y'}, 'b': {'x'}, 'c': {'z'}}

    expected = {
        'a': {
            'x': {'ani': 82.5201, 'af': 0.57},
            'y': {'ani': 74.5154, 'af': 0.0},
        },
        'b': {'x': {'ani': 84.5846, 'af': 0.55}},
        'c': {'z': {'ani': 74.4978, 'af': 0.01}},
    }

    fa = FastANI(self.cpus, force_single=True)
    result = fa.run(d_compare, d_paths)

    # Compare the key-sorted JSON serialisations of both structures.
    observed_json = json.dumps(result, sort_keys=True)
    expected_json = json.dumps(expected, sort_keys=True)
    self.assertEqual(observed_json, expected_json)
def run(self, genomes, no_mash, max_d, out_dir, prefix, mash_k, mash_v,
        mash_s, min_af, mash_db):
    """Runs the pipeline.

    Parameters
    ----------
    genomes : dict[str, str]
        Dict[genome_id] = fasta_path
    no_mash : bool
        True if Mash pre-filtering should be skipped (every query genome is
        then compared against all reference genomes), False if Mash should
        be used for pre-filtering.
    max_d : float
         maximum distance to keep [0-1]
    out_dir : str
        The directory to write the output files to.
    prefix : str
        The prefix to use when writing output files.
    mash_k : int
        k-mer size [1-32]
    mash_v : float
        maximum p-value to keep [0-1]
    mash_s : int
        maximum number of non-redundant hashes
    min_af : float
        alignment fraction to consider closest genome
    mash_db : Optional[str]
        The path to read/write the pre-computed Mash reference sketch database.
    """
    self.check_dependencies(no_mash)

    self.logger.info('Loading reference genomes.')
    ref_genomes = self._get_ref_genomes()
    d_compare = defaultdict(set)
    # Reference genomes are unpacked last, so on an id collision the
    # reference path replaces the query path.
    d_paths = {**genomes, **ref_genomes}

    # Pre-filter using Mash if specified.
    if not no_mash:
        dir_mash = os.path.join(out_dir, DIR_ANI_REP_INT_MASH)

        mash = Mash(self.cpus, dir_mash, prefix)
        self.logger.info(f'Using Mash version {mash.version()}')
        mash_results = mash.run(genomes, ref_genomes, max_d, mash_k,
                                mash_v, mash_s, mash_db)
        for qry_gid, ref_hits in mash_results.items():
            # The defaultdict supplies a fresh set per query, so the hits
            # can be added in place instead of rebuilding the set.
            d_compare[qry_gid].update(ref_hits.keys())

    # Compare against all reference genomes.
    else:
        for qry_gid in genomes:
            d_compare[qry_gid] = set(ref_genomes.keys())

    self.logger.info(
        f'Calculating ANI with FastANI v{FastANI._get_version()}.')
    fastani = FastANI(self.cpus, force_single=True)
    fastani_results = fastani.run(d_compare, d_paths)

    taxonomy = Taxonomy().read(TAXONOMY_FILE, canonical_ids=True)
    ANISummaryFile(out_dir, prefix, fastani_results, taxonomy)
    ANIClosestFile(out_dir, prefix, fastani_results, genomes, min_af,
                   taxonomy)