def test_aniblastall_concordance(self):
    """ANIblastall results concordant with JSpecies.

    Runs the ANIblastall pipeline on ``self.infiles`` inside a ``blastall``
    subdirectory of ``self.outdir``, writes the identity and difference
    tables to ``self.outdir``, and requires the largest absolute difference
    from the JSpecies ``ANIb`` table to be below
    ``self.tolerance["ANIblastall"]``.
    """
    # Fragment the inputs and run the BLAST jobs in a dedicated directory
    blastall_dir = os.path.join(self.outdir, "blastall")
    os.makedirs(blastall_dir, exist_ok=True)
    fragfiles, fraglengths = anib.fragment_fasta_files(
        self.infiles, blastall_dir, self.fragsize
    )
    cmd_builder = anib.make_blastcmd_builder("ANIblastall", blastall_dir)
    jobgraph = anib.make_job_graph(self.infiles, fragfiles, cmd_builder)
    assert_equal(0, run_mp.run_dependency_graph(jobgraph))

    # Collect the percentage identities and keep a copy on disk
    blast_results = anib.process_blast(
        blastall_dir, self.orglengths, fraglengths, mode="ANIblastall"
    )
    raw_pid = blast_results.percentage_identity
    raw_pid.to_csv(os.path.join(self.outdir, "pyani_aniblastall.tab"), sep="\t")

    # Compare against JSpecies: sort both axes and scale by 100 first
    sorted_pid = raw_pid.sort_index(axis=0).sort_index(axis=1) * 100.0
    aniblastall_diff = pd.DataFrame(
        sorted_pid.values - self.target["ANIb"].values,
        index=sorted_pid.index,
        columns=sorted_pid.columns,
    )
    diff_path = os.path.join(self.outdir, "pyani_aniblastall_diff.tab")
    aniblastall_diff.to_csv(diff_path, sep="\t")
    assert_less(aniblastall_diff.abs().values.max(), self.tolerance["ANIblastall"])
def test_aniblastall_concordance(self):
    """Check ANIblastall results are concordant with JSpecies.

    The pipeline output goes to ``self.outdir / "blastall"``; the identity
    and difference tables are written to ``self.outdir``. The test passes
    when the job graph returns 0 and the largest absolute difference from
    the JSpecies ``ANIb`` table is below ``self.tolerance["ANIblastall"]``.
    """
    # Fragment the inputs and run the BLAST jobs in a dedicated directory
    blastall_dir = self.outdir / "blastall"
    blastall_dir.mkdir(exist_ok=True)
    fragfiles, fraglengths = anib.fragment_fasta_files(
        self.infiles, blastall_dir, self.fragsize
    )
    cmd_builder = anib.make_blastcmd_builder("ANIblastall", blastall_dir)
    jobgraph = anib.make_job_graph(self.infiles, fragfiles, cmd_builder)
    self.assertEqual(0, run_mp.run_dependency_graph(jobgraph))

    # Collect the percentage identities and keep a copy on disk
    blast_results = anib.process_blast(
        blastall_dir, self.orglengths, fraglengths, mode="ANIblastall"
    )
    raw_pid = blast_results.percentage_identity
    raw_pid.to_csv(self.outdir / "pyani_aniblastall.tab", sep="\t")

    # Compare against JSpecies: sort both axes and scale by 100 first
    sorted_pid = raw_pid.sort_index(axis=0).sort_index(axis=1) * 100.0
    aniblastall_diff = pd.DataFrame(
        sorted_pid.values - self.target["ANIb"].values,
        index=sorted_pid.index,
        columns=sorted_pid.columns,
    )
    aniblastall_diff.to_csv(self.outdir / "pyani_aniblastall_diff.tab", sep="\t")
    largest_diff = aniblastall_diff.abs().values.max()
    self.assertLess(largest_diff, self.tolerance["ANIblastall"])
def test_aniblastall_concordance(
    paths_concordance_fna,
    path_concordance_jspecies,
    tolerance_anib_hi,
    fragment_length,
    tmp_path,
):
    """Check ANIblastall results are concordant with JSpecies.

    All intermediate files are written to ``tmp_path``. The test passes
    when the job graph returns 0 and every element of the difference between
    the computed identities (scaled by 100) and the JSpecies ``ANIb`` table
    is within ``tolerance_anib_hi`` of zero.
    """
    # Sequence lengths of the input genomes
    orglengths = pyani_files.get_sequence_lengths(paths_concordance_fna)

    # Fragment the inputs, then build and run the BLAST job graph
    fragfiles, fraglengths = anib.fragment_fasta_files(
        paths_concordance_fna, tmp_path, fragment_length
    )
    cmd_builder = anib.make_blastcmd_builder("ANIblastall", tmp_path)
    jobgraph = anib.make_job_graph(paths_concordance_fna, fragfiles, cmd_builder)
    exit_status = run_mp.run_dependency_graph(jobgraph)
    assert 0 == exit_status  # Jobs must run correctly

    # Parse the BLAST output into a percentage identity table
    blast_results = anib.process_blast(
        tmp_path, orglengths, fraglengths, mode="ANIblastall"
    )
    sorted_pid = blast_results.percentage_identity.sort_index(axis=0).sort_index(
        axis=1
    )

    # Element-wise comparison against the JSpecies reference
    observed = (sorted_pid * 100.0).values
    expected = parse_jspecies(path_concordance_jspecies)["ANIb"].values
    assert observed - expected == pytest.approx(0, abs=tolerance_anib_hi)
def test_aniblastall_concordance():
    """Test concordance of ANIblastall method with JSpecies output.

    Runs the ANIblastall pipeline on the FASTA files in ``INDIRNAME``,
    writes the computed, reference and difference tables to the output
    directory, and requires the largest absolute difference from the
    JSpecies ``ANIb`` table to be below ``ANIB_THRESHOLD``.
    """
    # Make/check output directory
    mode = "ANIblastall"
    outdirname = delete_and_remake_outdir(mode)

    # Get dataframes of JSpecies output
    aniblastall_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIb')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIblastall concordance:
    # Make fragments
    fragfiles, fraglengths = anib.fragment_fasta_files(
        infiles, outdirname, pyani_config.FRAGSIZE
    )

    # Build jobgraph
    jobgraph = anib.make_job_graph(
        infiles, fragfiles, anib.make_blastcmd_builder("ANIblastall", outdirname)
    )
    print("\nJobgraph:\n", jobgraph)
    print("\nJob 0:\n", jobgraph[0].script)

    # Run jobgraph with multiprocessing
    run_dependency_graph(jobgraph)
    print("Ran multiprocessing jobs")

    # Process BLAST; the pid data is the percentage_identity attribute
    aniblastall_data = anib.process_blast(
        outdirname, org_lengths, fraglengths, mode="ANIblastall"
    )
    aniblastall_pid = (
        aniblastall_data.percentage_identity.sort_index(axis=0).sort_index(axis=1)
        * 100.
    )

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same underlying array and exists in every pandas release.
    index, columns = aniblastall_pid.index, aniblastall_pid.columns
    diffmat = aniblastall_pid.values - aniblastall_jspecies.values
    aniblastall_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    aniblastall_pid.to_csv(
        os.path.join(outdirname, 'ANIblastall_pid.tab'), sep='\t'
    )
    aniblastall_jspecies.to_csv(
        os.path.join(outdirname, 'ANIblastall_jspecies.tab'), sep='\t'
    )
    aniblastall_diff.to_csv(
        os.path.join(outdirname, 'ANIblastall_diff.tab'), sep='\t'
    )
    print("ANIblastall concordance test output placed in %s" % outdirname)
    print("ANIblastall PID:\n", aniblastall_pid)
    print("ANIblastall JSpecies:\n", aniblastall_jspecies)
    print("ANIblastall diff:\n", aniblastall_diff)

    # We'd like the absolute difference reported to be < ANIB_THRESHOLD
    max_diff = aniblastall_diff.abs().values.max()
    print("Maximum difference for ANIblastall: %e" % max_diff)
    assert_less(max_diff, ANIB_THRESHOLD)
def test_anib_concordance():
    """Test concordance of ANIb method with JSpecies output.

    This may take some time. Please be patient.

    Builds BLAST databases, runs the pairwise BLASTN comparisons, writes
    the computed, reference and difference tables to the output directory,
    and requires the largest absolute difference from the JSpecies ``ANIb``
    table to be below ``ANIB_THRESHOLD``.
    """
    # Make/check output directory
    mode = "ANIb"
    outdirname = make_outdir(mode)

    # Get dataframes of JSpecies output
    anib_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIb')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIb concordance:
    # Make fragments
    fragfiles, fraglengths = anib.fragment_FASTA_files(
        infiles, outdirname, pyani_config.FRAGSIZE
    )

    # Build databases
    cmdlist = anib.generate_blastdb_commands(
        infiles, outdirname, pyani_config.MAKEBLASTDB_DEFAULT, mode="ANIb"
    )
    multiprocessing_run(cmdlist)

    # Run pairwise BLASTN
    cmdlist = anib.generate_blastn_commands(
        fragfiles, outdirname, pyani_config.BLASTN_DEFAULT, mode="ANIb"
    )
    multiprocessing_run(cmdlist, verbose=False)

    # Process BLAST; the pid data is in anib_data[1]
    anib_data = anib.process_blast(
        outdirname, org_lengths, fraglengths, mode="ANIb"
    )
    # DataFrame.sort(axis=...) and DataFrame.as_matrix() no longer exist in
    # current pandas; sort_index() and .values are the equivalents and are
    # also available in old releases.
    anib_pid = anib_data[1].sort_index(axis=0).sort_index(axis=1) * 100.
    index, columns = anib_pid.index, anib_pid.columns
    diffmat = anib_pid.values - anib_jspecies.values
    anib_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    anib_pid.to_csv(os.path.join(outdirname, 'ANIb_pid.tab'), sep='\t')
    anib_jspecies.to_csv(os.path.join(outdirname, 'ANIb_jspecies.tab'), sep='\t')
    anib_diff.to_csv(os.path.join(outdirname, 'ANIb_diff.tab'), sep='\t')

    # Single-argument print(...) calls are valid on both Python 2 and 3
    print("ANIb concordance test output placed in %s" % outdirname)
    print(anib_pid)
    print(anib_jspecies)
    print(anib_diff)

    # We'd like the absolute difference reported to be < ANIB_THRESHOLD
    max_diff = anib_diff.abs().values.max()
    print("Maximum difference for ANIb: %e" % max_diff)
    assert_less(max_diff, ANIB_THRESHOLD)
def unified_anib(indirname, User_ID):
    """Run ANIblastall on the FASTA files in a directory.

    - indirname - input directory; the output directory name is appended to
                  it by plain string concatenation, so it must end with a
                  path separator
    - User_ID   - identifier used to name the output directory
                  (``<indirname><User_ID>_out/``) and the log file

    Creates the output directory, fragments the input sequences into it,
    records the fragment lengths as ``fraglengths.json``, runs the BLAST
    job graph with multiprocessing and processes the BLAST output.

    Returns element 1 of the ``anib.process_blast`` result (presumably the
    percentage identity table - confirm against ``anib.process_blast``).

    Exits with status 1 if an input path contains whitespace. Re-raises
    ZeroDivisionError from ``anib.process_blast`` after logging it.
    """
    outdir = indirname + '{0}_out/'.format(User_ID)

    # os.mkdir() is subject to the umask, so set the permissions explicitly.
    # os.chmod avoids passing User_ID through a shell command line.
    os.mkdir(outdir)
    os.chmod(outdir, 0o777)

    logging.basicConfig(
        level=logging.DEBUG,
        filename="/home/linproject/Workspace/LIN_log/logfile_{0}".format(User_ID),
        filemode="a+",
        format="%(asctime)-15s %(levelname)-8s %(message)s",
    )

    infiles = pyani_files.get_fasta_files(indirname)
    org_lengths = pyani_files.get_sequence_lengths(infiles)
    fragsize = pyani_config.FRAGSIZE

    # os.listdir() returns bare names, so resolve them against indirname;
    # resolving against the working directory would test the wrong path.
    for fname in os.listdir(indirname):
        if ' ' in os.path.abspath(os.path.join(indirname, fname)):
            logging.error("File or directory '%s' contains whitespace", fname)
            logging.error("This will cause issues with MUMmer and BLAST")
            logging.error("(exiting)")
            sys.exit(1)

    fragfiles, fraglengths = anib.fragment_FASTA_files(infiles, outdir, fragsize)

    # Export fragment lengths as JSON, in case we re-run BLASTALL with
    # --skip_blastn
    with open(os.path.join(outdir, 'fraglengths.json'), 'w') as outfile:
        json.dump(fraglengths, outfile)

    # Which executables are we using?
    format_exe = pyani_config.FORMATDB_DEFAULT
    blast_exe = pyani_config.BLASTALL_DEFAULT

    # Run BLAST database-building and executables from a jobgraph
    logging.info("Creating job dependency graph")
    jobgraph = anib.make_job_graph(
        infiles, fragfiles, outdir, format_exe, blast_exe, 'ANIblastall'
    )
    logging.info("Running jobs with multiprocessing")
    logging.info("Running job dependency graph")
    cumval = run_mp.run_dependency_graph(jobgraph, verbose=False, logger=logging)
    if 0 < cumval:
        logging.warning(
            "At least one BLAST run failed. %s may fail.", 'ANIblastall'
        )
    else:
        logging.info("All multiprocessing jobs complete.")

    # Process pairwise BLASTN output
    logging.info("Processing pairwise %s BLAST output.", 'ANIblastall')
    try:
        data = anib.process_blast(
            outdir, org_lengths, fraglengths=fraglengths, mode='ANIblastall'
        )
    except ZeroDivisionError:
        logging.error("One or more BLAST output files has a problem.")
        if 0 < cumval:
            logging.error(
                "This is possibly due to BLASTN run failure, "
                "please investigate"
            )
        else:
            logging.error(
                "This is possibly due to ara BLASTN comparison "
                "being too distant for use."
            )
        logging.error(last_exception())
        # No result exists at this point; falling through to the return
        # would only replace this error with an UnboundLocalError.
        raise
    return data[1]
def test_blastdir_processing(self):
    """parses directory of .blast_tab output.

    Processes the stored ANIb and ANIblastall BLAST output directories and
    checks each percentage identity table against its expected dataframe,
    after sorting both by column and row labels.
    """
    orglengths = pyani_files.get_sequence_lengths(self.infnames)
    fraglengths = anib.get_fraglength_dict(self.fragfiles)

    # The axis is passed by keyword: pandas >= 2.0 rejects positional
    # arguments to sort_index().
    # ANIb
    result = anib.process_blast(
        self.anibdir, orglengths, fraglengths, mode="ANIb"
    )
    assert_frame_equal(
        result.percentage_identity.sort_index(axis=1).sort_index(),
        self.anibtgt.sort_index(axis=1).sort_index(),
    )

    # ANIblastall
    result = anib.process_blast(
        self.aniblastalldir, orglengths, fraglengths, mode="ANIblastall"
    )
    assert_frame_equal(
        result.percentage_identity.sort_index(axis=1).sort_index(),
        self.aniblastalltgt.sort_index(axis=1).sort_index(),
    )
def test_parse_blastdir(anib_output_dir):
    """Parse directory of BLAST+ output.

    Processes ``anib_output_dir.blastdir`` in ANIb mode and checks the
    percentage identity table against ``anib_output_dir.blastresult``,
    after sorting both by column and row labels.
    """
    orglengths = pyani_files.get_sequence_lengths(anib_output_dir.infiles)
    fraglengths = anib.get_fraglength_dict(anib_output_dir.fragfiles)
    result = anib.process_blast(
        anib_output_dir.blastdir, orglengths, fraglengths, mode="ANIb"
    )
    # The axis is passed by keyword: pandas >= 2.0 rejects positional
    # arguments to sort_index().
    assert_frame_equal(
        result.percentage_identity.sort_index(axis=1).sort_index(),
        anib_output_dir.blastresult.sort_index(axis=1).sort_index(),
    )
def test_anib_concordance(self):
    """ANIb results concordant with JSpecies.

    We expect ANIb results to be quite different, as the BLASTN
    algorithm changed substantially between BLAST and BLAST+

    The comparison is made separately for values below 90 and for values
    of 90 or more, each with its own tolerance (``self.tolerance["ANIb_lo"]``
    and ``self.tolerance["ANIb_hi"]``).
    """
    # Perform ANIb on the input directory contents
    outdir = os.path.join(self.outdir, "blastn")
    os.makedirs(outdir, exist_ok=True)
    fragfiles, fraglengths = anib.fragment_fasta_files(
        self.infiles, outdir, self.fragsize
    )
    jobgraph = anib.make_job_graph(
        self.infiles, fragfiles, anib.make_blastcmd_builder("ANIb", outdir)
    )
    assert_equal(0, run_mp.run_dependency_graph(jobgraph))
    results = anib.process_blast(
        outdir, self.orglengths, fraglengths, mode="ANIb"
    )
    result_pid = results.percentage_identity
    result_pid.to_csv(os.path.join(self.outdir, "pyani_anib.tab"), sep="\t")

    # Compare JSpecies output to results. We do this in two blocks,
    # masked according to whether the expected result is greater than
    # 90% identity, or less than that threshold.
    # The complete difference matrix is written to output, though
    result_pid = result_pid.sort_index(axis=0).sort_index(axis=1) * 100.0
    lo_result = result_pid.mask(result_pid >= 90).fillna(0)
    hi_result = result_pid.mask(result_pid < 90).fillna(0)
    lo_target = self.target["ANIb"].mask(self.target["ANIb"] >= 90).fillna(0)
    hi_target = self.target["ANIb"].mask(self.target["ANIb"] < 90).fillna(0)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same underlying array.
    lo_diffmat = lo_result.values - lo_target.values
    hi_diffmat = hi_result.values - hi_target.values
    diffmat = result_pid.values - self.target["ANIb"].values
    lo_diff = pd.DataFrame(
        lo_diffmat, index=result_pid.index, columns=result_pid.columns
    )
    hi_diff = pd.DataFrame(
        hi_diffmat, index=result_pid.index, columns=result_pid.columns
    )
    anib_diff = pd.DataFrame(
        diffmat, index=result_pid.index, columns=result_pid.columns
    )
    anib_diff.to_csv(os.path.join(self.outdir, "pyani_anib_diff.tab"), sep="\t")
    assert_less(lo_diff.abs().values.max(), self.tolerance["ANIb_lo"])
    assert_less(hi_diff.abs().values.max(), self.tolerance["ANIb_hi"])
def test_anib_concordance(
    paths_concordance_fna,
    path_concordance_jspecies,
    tolerance_anib_hi,
    tolerance_anib_lo,
    threshold_anib_lo_hi,
    fragment_length,
    tmp_path,
):
    """Check ANIb results are concordant with JSpecies.

    We expect ANIb results to be quite different, as the BLASTN
    algorithm changed substantially between BLAST and BLAST+ (the
    megaBLAST algorithm is now the default for BLASTN)

    Values below ``threshold_anib_lo_hi`` are compared using
    ``tolerance_anib_lo``; the remaining values use ``tolerance_anib_hi``.
    """
    # Sequence lengths of the input genomes
    orglengths = pyani_files.get_sequence_lengths(paths_concordance_fna)

    # Fragment the inputs, then build and run the BLAST job graph
    fragfiles, fraglengths = anib.fragment_fasta_files(
        paths_concordance_fna, tmp_path, fragment_length
    )
    cmd_builder = anib.make_blastcmd_builder("ANIb", tmp_path)
    jobgraph = anib.make_job_graph(paths_concordance_fna, fragfiles, cmd_builder)
    exit_status = run_mp.run_dependency_graph(jobgraph)
    assert 0 == exit_status  # Jobs must run correctly

    # Parse the BLAST output into a percentage identity table
    blast_results = anib.process_blast(
        tmp_path, orglengths, fraglengths, mode="ANIb"
    )
    observed = (
        blast_results.percentage_identity.sort_index(axis=0).sort_index(axis=1)
        * 100.0
    )

    # Split the computed and reference tables at the threshold; entries
    # belonging to the other block are zeroed out so they cancel in the
    # subtraction.
    observed_lo = observed.mask(observed >= threshold_anib_lo_hi).fillna(0).values
    observed_hi = observed.mask(observed < threshold_anib_lo_hi).fillna(0).values
    expected = parse_jspecies(path_concordance_jspecies)["ANIb"]
    expected_lo = expected.mask(expected >= threshold_anib_lo_hi).fillna(0).values
    expected_hi = expected.mask(expected < threshold_anib_lo_hi).fillna(0).values

    lo_delta = observed_lo - expected_lo
    hi_delta = observed_hi - expected_hi
    assert (lo_delta, hi_delta) == (
        pytest.approx(0, abs=tolerance_anib_lo),
        pytest.approx(0, abs=tolerance_anib_hi),
    )
def test_anib_concordance(self):
    """ANIb results concordant with JSpecies.

    We expect ANIb results to be quite different, as the BLASTN
    algorithm changed substantially between BLAST and BLAST+

    Values below 90 and values of 90 or more are compared separately,
    against ``self.tolerance["ANIb_lo"]`` and ``self.tolerance["ANIb_hi"]``
    respectively.
    """
    # Run the ANIb pipeline in its own subdirectory
    blastn_dir = os.path.join(self.outdir, "blastn")
    os.makedirs(blastn_dir, exist_ok=True)
    fragfiles, fraglengths = anib.fragment_fasta_files(
        self.infiles, blastn_dir, self.fragsize
    )
    cmd_builder = anib.make_blastcmd_builder("ANIb", blastn_dir)
    jobgraph = anib.make_job_graph(self.infiles, fragfiles, cmd_builder)
    assert_equal(0, run_mp.run_dependency_graph(jobgraph))

    # Collect the percentage identities and keep a copy on disk
    blast_results = anib.process_blast(
        blastn_dir, self.orglengths, fraglengths, mode="ANIb"
    )
    raw_pid = blast_results.percentage_identity
    raw_pid.to_csv(os.path.join(self.outdir, "pyani_anib.tab"), sep="\t")

    # Split the computed and reference tables at 90; entries belonging to
    # the other block are zeroed out so they cancel in the subtraction.
    observed = raw_pid.sort_index(axis=0).sort_index(axis=1) * 100.0
    observed_lo = observed.mask(observed >= 90).fillna(0)
    observed_hi = observed.mask(observed < 90).fillna(0)
    expected_lo = self.target["ANIb"].mask(self.target["ANIb"] >= 90).fillna(0)
    expected_hi = self.target["ANIb"].mask(self.target["ANIb"] < 90).fillna(0)

    labels = {"index": observed.index, "columns": observed.columns}
    lo_diff = pd.DataFrame(observed_lo.values - expected_lo.values, **labels)
    hi_diff = pd.DataFrame(observed_hi.values - expected_hi.values, **labels)

    # The complete (unmasked) difference matrix is what gets written out
    anib_diff = pd.DataFrame(
        observed.values - self.target["ANIb"].values, **labels
    )
    anib_diff.to_csv(os.path.join(self.outdir, "pyani_anib_diff.tab"), sep="\t")

    assert_less(lo_diff.abs().values.max(), self.tolerance["ANIb_lo"])
    assert_less(hi_diff.abs().values.max(), self.tolerance["ANIb_hi"])
def unified_anib(infiles, org_lengths):
    """Calculate ANIb for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Calculates ANI by the ANIb method, as described in Goris et al. (2007)
    Int J Syst Evol Micr 57: 81-91. doi:10.1099/ijs.0.64483-0. There are
    some minor differences depending on whether BLAST+ or legacy BLAST
    (BLASTALL) methods are used.

    All FASTA format files (selected by suffix) in the input directory are
    used to construct BLAST databases, placed in the output directory.
    Each file's contents are also split into sequence fragments of length
    args.fragsize, and the multiple FASTA file that results written to
    the output directory. These are BLASTNed, pairwise, against the
    databases.

    The BLAST output is interrogated for all fragment matches that cover
    at least 70% of the query sequence, with at least 30% nucleotide
    identity over the full length of the query sequence. This is an odd
    choice and doesn't correspond to the twilight zone limit as implied by
    Goris et al. We persist with their definition, however.

    Only these qualifying matches contribute to the total aligned length,
    and total aligned sequence identity used to calculate ANI.

    The results are processed to give matrices of aligned sequence length
    (aln_lengths.tab), similarity error counts (sim_errors.tab), ANIs
    (perc_ids.tab), and minimum aligned percentage (perc_aln.tab) of
    each genome, for each pairwise comparison. This function itself only
    returns the processed data.

    Reads the module-level ``args`` and ``logger`` objects. Re-raises
    ZeroDivisionError from ``anib.process_blast`` after logging it; in that
    case the BLAST output directory is not compressed.
    """
    logger.info("Running %s", args.method)
    blastdir = os.path.join(args.outdirname, ALIGNDIR[args.method])
    logger.info("Writing BLAST output to %s", blastdir)

    # Only the multiprocessing scheduler reports a cumulative exit status;
    # default to 0 so the error handler below can always read it.
    cumval = 0

    # Build BLAST databases and run pairwise BLASTN
    if not args.skip_blastn:
        # Make sequence fragments
        logger.info("Fragmenting input files, and writing to %s", args.outdirname)
        # Fraglengths does not get reused with BLASTN
        fragfiles, fraglengths = anib.fragment_fasta_files(
            infiles, blastdir, args.fragsize
        )
        # Export fragment lengths as JSON, in case we re-run with --skip_blastn
        with open(os.path.join(blastdir, 'fraglengths.json'), 'w') as outfile:
            json.dump(fraglengths, outfile)

        # Run BLAST database-building and executables from a jobgraph
        logger.info("Creating job dependency graph")
        jobgraph = anib.make_job_graph(
            infiles, fragfiles, anib.make_blastcmd_builder(args.method, blastdir)
        )

        if args.scheduler == 'multiprocessing':
            logger.info("Running jobs with multiprocessing")
            logger.info("Running job dependency graph")
            cumval = run_mp.run_dependency_graph(jobgraph, logger=logger)
            if 0 < cumval:
                logger.warning(
                    "At least one BLAST run failed. " + "%s may fail.",
                    args.method,
                )
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            run_sge.run_dependency_graph(jobgraph, logger=logger)
    else:
        # Import fragment lengths from JSON
        if args.method == "ANIblastall":
            # Plain 'r': the 'U' mode flag is rejected by Python >= 3.11
            with open(os.path.join(blastdir, 'fraglengths.json'), 'r') as infile:
                fraglengths = json.load(infile)
        else:
            fraglengths = None
        logger.warning("Skipping BLASTN runs (as instructed)!")

    # Process pairwise BLASTN output
    logger.info("Processing pairwise %s BLAST output.", args.method)
    try:
        data = anib.process_blast(
            blastdir, org_lengths, fraglengths=fraglengths, mode=args.method
        )
    except ZeroDivisionError:
        logger.error("One or more BLAST output files has a problem.")
        if not args.skip_blastn:
            if 0 < cumval:
                logger.error(
                    "This is possibly due to BLASTN run failure, "
                    "please investigate"
                )
            else:
                logger.error(
                    "This is possibly due to a BLASTN comparison "
                    "being too distant for use."
                )
        logger.error(last_exception())
        # No result exists at this point; falling through to the return
        # would only replace this error with an UnboundLocalError.
        raise

    if not args.nocompress:
        logger.info("Compressing/deleting %s", blastdir)
        compress_delete_outdir(blastdir)

    # Return processed BLAST data
    return data
def unified_anib(
    args: Namespace, infiles: List[Path], org_lengths: Dict[str, int]
) -> pyani_tools.ANIResults:
    """Calculate ANIb for files in input directory.

    :param args:  Namespace of command-line options
    :param infiles:  iterable of paths to each input file
    :param org_lengths:  dict of input sequence lengths keyed by sequence name
    :raises ZeroDivisionError:  re-raised from ``anib.process_blast`` after
        being logged; the BLAST output directory is not compressed then

    Calculates ANI by the ANIb method, as described in Goris et al. (2007)
    Int J Syst Evol Micr 57: 81-91. doi:10.1099/ijs.0.64483-0. There are
    some minor differences depending on whether BLAST+ or legacy BLAST
    (BLASTALL) methods are used.

    All FASTA format files (selected by suffix) in the input directory are
    used to construct BLAST databases, placed in the output directory.
    Each file's contents are also split into sequence fragments of length
    options.fragsize, and the multiple FASTA file that results written to
    the output directory. These are BLASTNed, pairwise, against the
    databases.

    The BLAST output is interrogated for all fragment matches that cover
    at least 70% of the query sequence, with at least 30% nucleotide
    identity over the full length of the query sequence. This is an odd
    choice and doesn't correspond to the twilight zone limit as implied by
    Goris et al. We persist with their definition, however.

    Only these qualifying matches contribute to the total aligned length,
    and total aligned sequence identity used to calculate ANI.

    The results are processed to give matrices of aligned sequence length
    (aln_lengths.tab), similarity error counts (sim_errors.tab), ANIs
    (perc_ids.tab), and minimum aligned percentage (perc_aln.tab) of
    each genome, for each pairwise comparison. This function itself only
    returns the processed data.
    """
    logger = logging.getLogger(__name__)
    logger.info("Running %s", args.method)
    blastdir = args.outdirname / ALIGNDIR[args.method]
    logger.info("Writing BLAST output to %s", blastdir)

    # Build BLAST databases and run pairwise BLASTN
    cumval, fraglengths = run_blast(args, logger, infiles, blastdir)

    # Process pairwise BLASTN output
    logger.info("Processing pairwise %s BLAST output.", args.method)
    try:
        data = anib.process_blast(
            blastdir, org_lengths, fraglengths=fraglengths, mode=args.method
        )
    except ZeroDivisionError:
        logger.error("One or more BLAST output files has a problem.")
        if not args.skip_blastn:
            if cumval > 0:
                logger.error(
                    "This is possibly due to BLASTN run failure, please investigate",
                    exc_info=True,
                )
            else:
                logger.error(
                    "This is possibly due to a BLASTN comparison being too distant for use.",
                    exc_info=True,
                )
        # No result exists at this point; falling through to the return
        # would only replace this error with an UnboundLocalError.
        raise

    if not args.nocompress:
        logger.info("Compressing/deleting %s", blastdir)
        compress_delete_outdir(blastdir, logger)

    # Return processed BLAST data
    return data
def unified_anib(infiles, org_lengths):
    """Calculate ANIb for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Calculates ANI by the ANIb method, as described in Goris et al. (2007)
    Int J Syst Evol Micr 57: 81-91. doi:10.1099/ijs.0.64483-0. There are
    some minor differences depending on whether BLAST+ or legacy BLAST
    (BLASTALL) methods are used.

    All FASTA format files (selected by suffix) in the input directory are
    used to construct BLAST databases, placed in the output directory.
    Each file's contents are also split into sequence fragments of length
    args.fragsize, and the multiple FASTA file that results written to
    the output directory. These are BLASTNed, pairwise, against the
    databases.

    The BLAST output is interrogated for all fragment matches that cover
    at least 70% of the query sequence, with at least 30% nucleotide
    identity over the full length of the query sequence. This is an odd
    choice and doesn't correspond to the twilight zone limit as implied by
    Goris et al. We persist with their definition, however.

    Only these qualifying matches contribute to the total aligned length,
    and total aligned sequence identity used to calculate ANI.

    The results are processed to give matrices of aligned sequence length
    (aln_lengths.tab), similarity error counts (sim_errors.tab), ANIs
    (perc_ids.tab), and minimum aligned percentage (perc_aln.tab) of
    each genome, for each pairwise comparison. This function itself only
    returns the processed data.

    Reads the module-level ``args`` and ``logger`` objects. Re-raises
    ZeroDivisionError from ``anib.process_blast`` after logging it.
    """
    logger.info("Running %s", args.method)

    # Only the multiprocessing scheduler reports a cumulative exit status;
    # default to 0 so the error handler below can always read it.
    cumval = 0

    # Build BLAST databases and run pairwise BLASTN
    if not args.skip_blastn:
        # Make sequence fragments
        logger.info("Fragmenting input files, and writing to %s", args.outdirname)
        # Fraglengths does not get reused with BLASTN
        fragfiles, fraglengths = anib.fragment_FASTA_files(
            infiles, args.outdirname, args.fragsize
        )
        # Export fragment lengths as JSON, in case we re-run BLASTALL with
        # --skip_blastn
        if args.method == "ANIblastall":
            with open(
                os.path.join(args.outdirname, 'fraglengths.json'), 'w'
            ) as outfile:
                json.dump(fraglengths, outfile)

        # Which executables are we using?
        if args.method == "ANIblastall":
            format_exe = args.formatdb_exe
            blast_exe = args.blastall_exe
        else:
            format_exe = args.makeblastdb_exe
            blast_exe = args.blastn_exe

        # Run BLAST database-building and executables from a jobgraph
        logger.info("Creating job dependency graph")
        jobgraph = anib.make_job_graph(
            infiles, fragfiles, args.outdirname, format_exe, blast_exe, args.method
        )

        if args.scheduler == 'multiprocessing':
            logger.info("Running jobs with multiprocessing")
            logger.info("Running job dependency graph")
            cumval = run_mp.run_dependency_graph(
                jobgraph, verbose=args.verbose, logger=logger
            )
            if 0 < cumval:
                logger.warning(
                    "At least one BLAST run failed. " + "%s may fail.",
                    args.method,
                )
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            run_sge.run_dependency_graph(
                jobgraph, verbose=args.verbose, logger=logger
            )
    else:
        # Import fragment lengths from JSON
        if args.method == "ANIblastall":
            # Plain 'r': the 'U' mode flag is rejected by Python >= 3.11
            with open(
                os.path.join(args.outdirname, 'fraglengths.json'), 'r'
            ) as infile:
                fraglengths = json.load(infile)
        else:
            fraglengths = None
        logger.warning("Skipping BLASTN runs (as instructed)!")

    # Process pairwise BLASTN output
    logger.info("Processing pairwise %s BLAST output.", args.method)
    try:
        data = anib.process_blast(
            args.outdirname, org_lengths, fraglengths=fraglengths, mode=args.method
        )
    except ZeroDivisionError:
        logger.error("One or more BLAST output files has a problem.")
        if not args.skip_blastn:
            if 0 < cumval:
                logger.error(
                    "This is possibly due to BLASTN run failure, "
                    "please investigate"
                )
            else:
                logger.error(
                    "This is possibly due to a BLASTN comparison "
                    "being too distant for use."
                )
        logger.error(last_exception())
        # No result exists at this point; falling through to the return
        # would only replace this error with an UnboundLocalError.
        raise
    return data