def cluster_new_reference_sequences(update_tree, args, new_ref_seqs_fasta):
    """
    Sort the new reference sequences by length then cluster them with USEARCH's cluster_fast.

    Two commands are run through `launch_write_command`: `-sortbylength` writes "usearch_sorted.fasta",
    then `-cluster_fast` writes the centroids ("uclust_<COG>.fasta") and the cluster table ("uclust_<COG>.uc").
    Every output path is built by prefixing `update_tree.Output`.

    :param update_tree: Object providing `cluster_id` (a percent identity), `Output` (output path prefix) and `COG`
    :param args: Object with an `executables` dictionary containing the key "usearch"
    :param new_ref_seqs_fasta: Path to the FASTA file containing the sequences to be clustered
    :return: None
    """
    logging.info("Clustering sequences at %s percent identity with USEARCH... " % str(update_tree.cluster_id))

    # cluster_id is a percentage whereas the identity threshold handed to USEARCH is a proportion.
    # Gluing "0." in front of the integer would turn 100 into 0.100 and 5 into 0.5, so divide instead.
    percent_id = int(update_tree.cluster_id)
    if not 0 <= percent_id <= 100:
        logging.error("Weird formatting of cluster_id: " + str(update_tree.cluster_id) + "\n")
        sys.exit(13)
    uclust_id = "%.2f" % (percent_id / 100.0)

    sorted_fasta = update_tree.Output + "usearch_sorted.fasta"
    usearch_command = [args.executables["usearch"]]
    usearch_command += ["-sortbylength", new_ref_seqs_fasta]
    usearch_command += ["-fastaout", sorted_fasta]
    usearch_command += ["--log", update_tree.Output + os.sep + "usearch_sort.log"]
    stdout, returncode = launch_write_command(usearch_command)
    if returncode != 0:
        logging.error("USEARCH did not complete successfully! Command used:\n" +
                      ' '.join(usearch_command) + "\n")
        sys.exit(13)

    uclust_prefix = update_tree.Output + "uclust_" + update_tree.COG
    uclust_command = [args.executables["usearch"]]
    uclust_command += ["-cluster_fast", sorted_fasta]
    uclust_command += ["--id", uclust_id]
    uclust_command += ["--centroids", uclust_prefix + ".fasta"]
    uclust_command += ["--uc", uclust_prefix + ".uc"]
    uclust_command += ["--log", update_tree.Output + os.sep + "usearch_cluster.log"]
    stdout, returncode = launch_write_command(uclust_command)
    if returncode != 0:
        logging.error("USEARCH did not complete successfully! Command used:\n" +
                      ' '.join(uclust_command) + "\n")
        sys.exit(13)

    logging.info("done.\n")
    return
def run_mafft(mafft_exe: str, fasta_in: str, fasta_out: str, num_threads):
    """
    Align the sequences of a FASTA file with MAFFT (`--auto`) and make sure something was written.

    The alignment is captured by redirecting stdout to `fasta_out` within the command itself,
    and stderr is sent to /dev/null. The process exits with status 7 if MAFFT returns a non-zero
    code or if no sequences could be read back from `fasta_out`.

    :param mafft_exe: Path to the executable for mafft
    :param fasta_in: An unaligned FASTA file
    :param fasta_out: The path to a file MAFFT will write aligned sequences to
    :param num_threads: Integer (or string) for the number of threads MAFFT can use
    :return: None
    """
    mafft_align_command = [
        mafft_exe,
        "--maxiterate", str(1000),
        "--thread", str(num_threads),
        "--auto",
        "--randomseed", str(12345),
        fasta_in, '1>' + fasta_out,
        "2>", "/dev/null",
    ]

    _, exit_status = launch_write_command(mafft_align_command, False)
    if exit_status != 0:
        logging.error("Multiple sequence alignment using " + mafft_exe +
                      " did not complete successfully! Command used:\n" +
                      ' '.join(mafft_align_command) + "\n")
        sys.exit(7)

    # A zero exit status is not enough: confirm the alignment file holds at least one record
    mfa = read_fasta_to_dict(fasta_out)
    if len(mfa.keys()) == 0:
        logging.error("MAFFT did not generate a proper FASTA file. " +
                      "Check the output by running:\n" +
                      ' '.join(mafft_align_command) + "\n")
        sys.exit(7)
    return
def run_hmmsearch(hmmsearch_exe: str, hmm_profile: str, query_fasta: str, output_dir: str, num_threads=2):
    """
    Function for searching a fasta file with an hmm profile

    The domain table is written to `output_dir` + <profile name> + "_to_ORFs_domtbl.txt", where the profile
    name is the basename of `hmm_profile` without its ".hmm" extension. `output_dir` is used as a plain
    string prefix, so it must already end with a path separator. Exits with status 13 if hmmsearch fails.

    :param hmmsearch_exe: Path to the executable for hmmsearch
    :param hmm_profile: Path to the HMM profile file
    :param query_fasta: Path to the FASTA file to be queried by the profile
    :param output_dir: Path to the directory for writing the outputs
    :param num_threads: Number of threads to be used by hmmsearch
    :return: A single-element list holding the path to the domain table
    """
    # Find the name of the HMM. Use it to name the output file.
    # Only a trailing ".hmm" is stripped: a regex substitution of ".hmm" would also delete
    # any character followed by "hmm" elsewhere in the name (e.g. "myhmmer.hmm" -> "mer").
    rp_marker = os.path.basename(hmm_profile)
    if rp_marker.endswith(".hmm"):
        rp_marker = rp_marker[:-len(".hmm")]
    domtbl = output_dir + rp_marker + "_to_ORFs_domtbl.txt"

    # Basic hmmsearch command
    hmmsearch_command_base = [hmmsearch_exe]
    hmmsearch_command_base += ["--cpu", str(num_threads)]
    hmmsearch_command_base.append("--noali")

    # Customize the command for this input and HMM
    final_hmmsearch_command = hmmsearch_command_base + ["--domtblout", domtbl]
    final_hmmsearch_command += [hmm_profile, query_fasta]
    stdout, ret_code = launch_write_command(final_hmmsearch_command)

    # Check to ensure the job finished properly
    if ret_code != 0:
        logging.error("hmmsearch did not complete successfully! Output:\n" + stdout + "\n" +
                      "Command used:\n" + ' '.join(final_hmmsearch_command) + "\n")
        sys.exit(13)

    return [domtbl]
def cluster_sequences(uclust_exe, fasta_input, uclust_prefix, similarity=0.60):
    """
    Wrapper function for clustering a FASTA file at some similarity using usearch's cluster_fast algorithm

    Centroids are written to `uclust_prefix` + ".fa" and the cluster table to `uclust_prefix` + ".uc".
    Exits with status 13 if the command returns a non-zero code.

    :param uclust_exe: Path to the usearch executable
    :param fasta_input: FASTA file for which contained sequences will be clustered
    :param uclust_prefix: Prefix for the output files
    :param similarity: The proportional similarity to cluster input sequences
    :return: None
    """
    logging.info("Clustering sequences with UCLUST... ")
    uclust_cmd = [uclust_exe]
    uclust_cmd += ["-cluster_fast", fasta_input]
    uclust_cmd += ["-id", str(similarity)]
    uclust_cmd += ["-sort", "length"]
    uclust_cmd += ["-centroids", uclust_prefix + ".fa"]
    uclust_cmd += ["--uc", uclust_prefix + ".uc"]

    stdout, returncode = launch_write_command(uclust_cmd)
    if returncode != 0:
        logging.error("UCLUST did not complete successfully! Command used:\n" +
                      ' '.join(uclust_cmd) + "\n")
        sys.exit(13)

    # Report completion only once the clustering has actually run and succeeded
    logging.info("done.\n")
    logging.debug(stdout)
    return
def build_hmm_profile(hmmbuild_exe, msa_in, output_hmm):
    """
    Build an HMM profile from a multiple sequence alignment by calling hmmbuild.

    The command is `<hmmbuild_exe> <output_hmm> <msa_in>`. Exits with status 7 when the
    command returns a non-zero code.

    :param hmmbuild_exe: Path to the hmmbuild executable
    :param msa_in: Path to the multiple sequence alignment used as input
    :param output_hmm: Path of the HMM profile to be written
    :return: None
    """
    logging.debug("Building HMM profile... ")

    hmm_build_command = [hmmbuild_exe]
    hmm_build_command += [output_hmm, msa_in]
    _, exit_status = launch_write_command(hmm_build_command)

    logging.debug("done.\n")

    if exit_status == 0:
        return
    logging.error("hmmbuild did not complete successfully for:\n" +
                  ' '.join(hmm_build_command) + "\n")
    sys.exit(7)
def run_prodigal(args, fasta_file, output_file, nucleotide_orfs=None):
    """
    Run Prodigal in "meta" mode (`-p meta`) on a FASTA file.

    `output_file` is passed with `-a` and, when given, `nucleotide_orfs` is passed with `-d`.
    Exits with status 3 if Prodigal returns a non-zero code.

    :param args: Object with an `executables` dictionary containing the key "prodigal"
    :param fasta_file: Path to the input FASTA file (`-i`)
    :param output_file: Path handed to Prodigal's `-a` option
    :param nucleotide_orfs: Optional path handed to Prodigal's `-d` option
    :return: None
    """
    prodigal_command = [args.executables["prodigal"]]
    prodigal_command += ["-i", fasta_file]
    prodigal_command += ["-p", "meta"]
    prodigal_command += ["-a", output_file]
    if nucleotide_orfs:
        prodigal_command += ["-d", nucleotide_orfs]

    stdout, proc_code = launch_write_command(prodigal_command)

    if proc_code != 0:
        # The message is built as one string: extra positional arguments to logging.error are treated
        # as %-format arguments and, with no placeholders in the message, break the log record.
        logging.error("Prodigal did not complete successfully!\n" +
                      "Command used:\n" + ' '.join(prodigal_command) + "\n")
        sys.exit(3)
    return
def run_papara(executable, tree_file, ref_alignment_phy, query_fasta, molecule):
    """
    Run PaPaRa with a reference tree, a reference alignment and a FASTA of query sequences.

    Exits with status 3 if the command returns a non-zero code.

    :param executable: Path to the PaPaRa executable
    :param tree_file: Path to the reference tree (`-t`)
    :param ref_alignment_phy: Path to the reference alignment (`-s`)
    :param query_fasta: Path to the FASTA file of query sequences (`-q`)
    :param molecule: Molecule type; the `-a` flag is added when this is "prot"
    :return: The stdout string returned by launch_write_command
    """
    papara_command = [
        executable,
        "-t", tree_file,
        "-s", ref_alignment_phy,
        "-q", query_fasta,
    ]
    if molecule == "prot":
        papara_command += ["-a"]

    stdout, exit_status = launch_write_command(papara_command)
    if exit_status == 0:
        return stdout

    logging.error("PaPaRa did not complete successfully!\n" +
                  "Command used:\n" + ' '.join(papara_command) + "\n")
    sys.exit(3)
def run_odseq(odseq_exe, fasta_in, outliers_fa, num_threads):
    """
    Run outlier detection on a FASTA file with the provided executable.

    The command uses a fixed configuration: `-f fasta`, `-m linear`, `--boot-rep 1000`,
    `--score 5` and `--full`. Exits with status 7 if the command returns a non-zero code.

    :param odseq_exe: Path to the outlier detection executable
    :param fasta_in: Path to the input FASTA file (`-i`)
    :param outliers_fa: Path handed to the `-o` option
    :param num_threads: Number of threads (`--threads`)
    :return: None
    """
    odseq_command = [
        odseq_exe,
        "-i", fasta_in,
        "-f", "fasta",
        "-o", outliers_fa,
        "-m", "linear",
        "--boot-rep", str(1000),
        "--threads", str(num_threads),
        "--score", str(5),
        "--full",
    ]

    _, exit_status = launch_write_command(odseq_command)
    if exit_status != 0:
        logging.error("Outlier detection using " + odseq_exe +
                      " did not complete successfully! Command used:\n" +
                      ' '.join(odseq_command) + "\n")
        sys.exit(7)
    return
def profile_aligner(executables, ref_aln, ref_profile, input_fasta, output_sto, kind="functional"):
    """
    Perform a profile-based multiple sequence alignment with either cmalign or hmmalign.

    cmalign is selected when `kind` is "phylogenetic_rRNA"; any other value selects hmmalign.
    The command itself is assembled by `hmmalign_command`. Exits with status 3 on failure.

    :param executables: A dictionary containing keys "cmalign" and "hmmalign"
    :param ref_aln: Path to a FASTA or Stockholm file with the multiple alignment pattern
    :param ref_profile: Path to the HMM or CM profile for the reference gene
    :param input_fasta: Path to the FASTA containing query sequences
    :param output_sto: Name of the output Stockholm formatted file
    :param kind: The type of marker gene being analyzed [functional (default), phylogenetic, phylogenetic_rRNA]
    :return: The stdout string returned by launch_write_command
    """
    aligner_key = "cmalign" if kind == "phylogenetic_rRNA" else "hmmalign"
    malign_command = hmmalign_command(executables[aligner_key], ref_aln, ref_profile, input_fasta, output_sto)

    stdout, exit_status = launch_write_command(malign_command)
    if exit_status == 0:
        return stdout

    logging.error("Multiple alignment failed for " + input_fasta + ". Command used:\n" +
                  ' '.join(malign_command) + " output:\n" + stdout + "\n")
    sys.exit(3)
def generate_blast_database(args, fasta, molecule, prefix, multiple=True):
    """
    Build a BLAST database from a FASTA file using makeblastdb.

    When `multiple` is set, the alignment characters are first removed from `fasta` with
    `remove_dashes_from_msa`, writing `prefix` + ".fa", and that file is used as the input.
    The database is always written with the name `prefix` + ".fa".
    Exits with status 13 if the stripped file would overwrite `fasta` or if makeblastdb fails.

    :param args: Object with an `executables` dictionary containing the key "makeblastdb"
    :param fasta: File to make a BLAST database for
    :param molecule: 'prot' or 'nucl' - necessary argument for makeblastdb
    :param prefix: prefix string for the output BLAST database
    :param multiple: Flag indicating the input `fasta` is a MSA.
        Alignment information is removed prior to makeblastdb
    :return: Tuple of the stdout returned by launch_write_command and the database path (`prefix` + ".fa")
    """
    # Remove the multiple alignment information from the input FASTA before building the database
    blastdb_out = prefix + ".fa"
    if multiple:
        if blastdb_out == fasta:
            logging.error("prefix.fa is the same as " + fasta + " and would be overwritten!\n")
            sys.exit(13)
        remove_dashes_from_msa(fasta, blastdb_out)
        blastdb_in = blastdb_out
    else:
        blastdb_in = fasta

    logging.info("Making the BLAST database for " + blastdb_in + "... ")

    # Format the `makeblastdb` command
    makeblastdb_command = [args.executables["makeblastdb"]]
    makeblastdb_command += ["-in", blastdb_in]
    makeblastdb_command += ["-out", blastdb_out]
    makeblastdb_command += ["-input_type", "fasta"]
    makeblastdb_command += ["-dbtype", molecule]

    # Launch the command
    stdout, makeblastdb_pro_returncode = launch_write_command(makeblastdb_command)

    # Do not hand back the path of a database that was never built
    if makeblastdb_pro_returncode != 0:
        logging.error("makeblastdb did not complete successfully! Command used:\n" +
                      ' '.join(makeblastdb_command) + "\n")
        sys.exit(13)

    logging.info("done\n")

    return stdout, blastdb_out
def construct_tree(executables: dict, molecule: str, multiple_alignment_file: str,
                   tree_output_dir, tree_file, tree_prefix, args):
    """
    Infer a phylogenetic tree from a multiple alignment with FastTree (`args.fast`) or RAxML.

    `tree_output_dir` must not exist yet: it is created here, and the process exits with status 13
    if it is already present. With FastTree, the captured output is written to
    "FastTree_info.<tree_prefix>" inside `tree_output_dir`. Exits with status 13 if no RAxML
    substitution model can be chosen or if the tree builder returns a non-zero code.

    :param executables: Dictionary containing paths to executables, crucially FastTree and RAxML
    :param molecule: Molecule type of the sequences being used to infer the phylogeny
    :param multiple_alignment_file: Path to the multiple sequence alignment file
    :param tree_output_dir: Path to the directory where output files should be written to
    :param tree_file: Path to write the inferred phylogenetic tree
    :param tree_prefix: Prefix to be used for the outputs
    :param args: Command-line arguments parsed using ArgParse
    :return: Stylized name of the tree-building software used
    """
    # Decide on the command to build the tree
    if args.fast:
        tree_builder = "FastTree"
        if molecule in ("rrna", "dna"):
            model_flags = ["-nt", "-gtr"]
        else:
            model_flags = ["-lg", "-wag"]
        tree_build_cmd = [executables["FastTree"]] + model_flags
        tree_build_cmd += ["-out", tree_file, multiple_alignment_file]
    else:
        tree_builder = "RAxML"
        tree_build_cmd = [
            executables["raxmlHPC"],
            "-f", "a",
            "-p", "12345",
            "-x", "12345",
            "-#", str(args.bootstraps),
            "-s", multiple_alignment_file,
            "-n", tree_prefix,
            "-w", tree_output_dir,
            "-T", str(args.num_threads),
        ]
        # NOTE(review): model selection reads `args.molecule` for "prot" and "rrna" but the
        # `molecule` parameter for "dna" - confirm whether both are meant to be the same value.
        if args.raxml_model:
            tree_build_cmd += ["-m", args.raxml_model]
        elif args.molecule == "prot":
            tree_build_cmd += ["-m", "PROTGAMMAAUTO"]
        elif args.molecule == "rrna" or molecule == "dna":
            tree_build_cmd += ["-m", "GTRGAMMA"]
        else:
            logging.error("A substitution model could not be specified with the 'molecule' argument: " +
                          args.molecule)
            sys.exit(13)

    # Refuse to write into a directory left behind by a previous run
    if os.path.exists(tree_output_dir):
        logging.error(tree_output_dir + " already exists from a previous run! " +
                      "Please delete or rename it and try again.\n")
        sys.exit(13)
    os.makedirs(tree_output_dir)

    logging.info("Building phylogenetic tree with " + tree_builder + "... ")
    if not args.fast:
        stdout, returncode = launch_write_command(tree_build_cmd, False)
    else:
        stdout, returncode = launch_write_command(tree_build_cmd, True)
        info_path = tree_output_dir + os.sep + "FastTree_info." + tree_prefix
        with open(info_path, 'w') as fast_info:
            fast_info.write(stdout + "\n")
    logging.info("done.\n")

    if returncode != 0:
        logging.error(tree_builder + " did not complete successfully! " +
                      "Look in " + tree_output_dir + os.sep +
                      tree_builder + "_info." + tree_prefix + " for an error message.\n" +
                      tree_builder + " command used:\n" + ' '.join(tree_build_cmd) + "\n")
        sys.exit(13)

    return tree_builder
def raxml_evolutionary_placement(raxml_exe: str, reference_tree_file: str, multiple_alignment: str, model: str,
                                 output_dir: str, query_name: str, num_threads=2):
    """
    A wrapper for RAxML's evolutionary placement algorithm (EPA)

    1. checks to ensure the output files do not already exist, and removes them if they do
    2. ensures the output directory is an absolute path, satisfying RAxML
    3. Runs RAxML with the provided parameters
    4. Renames the files for consistency in TreeSAPP

    :param raxml_exe: Path to the RAxML executable to be used
    :param reference_tree_file: The reference tree for evolutionary placement to operate on
    :param multiple_alignment: Path to a multiple alignment file containing reference and query sequences
    :param model: The substitution model to be used by RAxML e.g. PROTGAMMALG, GTRCAT
    :param output_dir: Path to write the EPA outputs
    :param query_name: Prefix name for all of the output files
    :param num_threads: Number of threads EPA should use (default = 2)
    :return: A dictionary of files written by RAxML's EPA that are used by TreeSAPP. For example epa_files["jplace"]
    :raises AssertionError: If `model` is None, or `reference_tree_file` is not a string or is empty
    """
    epa_files = dict()

    ##
    # Start with some housekeeping - are the inputs looking alright?
    # Do the outputs already exist?
    # Is the output directory an absolute path?
    ##
    if not os.path.isabs(output_dir):
        output_dir = os.getcwd() + os.sep + output_dir
    if output_dir[-1] != os.sep:
        output_dir += os.sep

    if model is None:
        logging.error("No substitution model provided for evolutionary placement of " + query_name + ".\n")
        raise AssertionError()

    # Determine the output file names, and remove any pre-existing output files
    if not isinstance(reference_tree_file, str):
        logging.error(str(reference_tree_file) + " is not string but " + str(type(reference_tree_file)) + "\n")
        raise AssertionError()

    if len(reference_tree_file) == 0:
        logging.error("Could not find reference tree for " + query_name + " to be used by EPA.\n")
        raise AssertionError()

    # This is the final set of files that will be written by RAxML's EPA algorithm
    epa_files["stdout"] = output_dir + query_name + '_RAxML.txt'
    epa_info = output_dir + 'RAxML_info.' + query_name
    epa_files["info"] = output_dir + query_name + '.RAxML_info.txt'
    epa_labelled_tree = output_dir + 'RAxML_labelledTree.' + query_name
    epa_tree = output_dir + 'RAxML_originalLabelledTree.' + query_name
    epa_files["tree"] = output_dir + query_name + '.originalRAxML_labelledTree.txt'
    epa_classification = output_dir + 'RAxML_classification.' + query_name
    epa_files["classification"] = output_dir + query_name + '.RAxML_classification.txt'
    epa_files["jplace"] = output_dir + "RAxML_portableTree." + query_name + ".jplace"
    epa_entropy = output_dir + "RAxML_entropy." + query_name
    epa_weights = output_dir + "RAxML_classificationLikelihoodWeights." + query_name

    for raxml_file in [epa_info, epa_labelled_tree, epa_tree, epa_classification, epa_entropy, epa_weights]:
        try:
            os.remove(raxml_file)
        except OSError:
            pass

    # Set up the command to run RAxML. The trailing '>' entry redirects RAxML's output to the stdout file.
    raxml_command = [raxml_exe,
                     '-m', model,
                     '-T', str(int(num_threads)),
                     '-s', multiple_alignment,
                     '-t', reference_tree_file,
                     '-G', str(0.2),
                     "--epa-prob-threshold=" + str(0.10),
                     '-f', 'v',
                     '-n', query_name,
                     '-w', output_dir,
                     '>', epa_files["stdout"]]
    launch_write_command(raxml_command)

    # Rename the RAxML output files
    if os.path.exists(epa_info):
        copy(epa_info, epa_files["info"])
        os.remove(epa_info)
    if os.path.exists(epa_classification):
        copy(epa_classification, epa_files["classification"])
        os.remove(epa_classification)
    if os.path.exists(epa_tree):
        copy(epa_tree, epa_files["tree"])
        os.remove(epa_tree)
    else:
        # The labelled tree is the one output that must exist; without it the placement failed
        logging.error("Some files were not successfully created for " + query_name + "\n" +
                      "Check " + epa_files["stdout"] + " for an error!\n")
        sys.exit(3)

    # Remove useless files. None of these is returned in epa_files, so the cleanup is best-effort:
    # a file RAxML did not write must not abort an otherwise successful placement.
    for raxml_file in [epa_labelled_tree, epa_weights, epa_entropy]:
        try:
            os.remove(raxml_file)
        except FileNotFoundError:
            pass

    return epa_files
def mcc_calculator():
    """
    Entry point for the Matthews Correlation Coefficient (MCC) analysis of a classification tool.

    The steps performed are, in order:
      1. Parse the arguments with `get_arguments`, set up logging and validate the command.
      2. Read the annotation mapping file and reference package build parameters, then create a ConfusionTest.
      3. Load a taxonomic trie for each reference package (TreeSAPP packages or GraftM gpkgs).
      4. Run the selected tool ("treesapp", otherwise graftM) unless its classification table already exists,
         and read the classifications.
      5. Bin the query sequence names into confusion categories and map lineages onto them.
      6. Compute the MCC for taxonomic distances 0 through 7, log the table and write it to "MCC_table.tsv".

    Exits with status 5 if a marker name cannot be parsed from a gpkg path and with status 3 if
    no sequences were classified.

    :return: None
    """
    args = get_arguments()

    # NOTE(review): log_name and mcc_file join args.output with os.sep, while the two summary outputs
    # below concatenate args.output directly - presumably args.output ends with a separator; confirm.
    log_name = args.output + os.sep + "MCC_log.txt"
    mcc_file = args.output + os.sep + "MCC_table.tsv"
    summary_rank = "Phylum"
    taxa_dist_output = args.output + '.'.join(
        os.path.basename(
            args.input).split('.')[:-1]) + '_' + summary_rank + "_dist.tsv"
    classification_info_output = args.output + '.'.join(
        os.path.basename(args.input).split('.')[:-1]) + "_classifications.tsv"
    prep_logging(log_name, args.verbose)
    logging.info(
        "\n##\t\t\tBeginning Matthews Correlation Coefficient analysis\t\t\t##\n"
    )
    validate_command(args, sys.argv)

    ##
    # Read the file mapping reference package name to the database annotations
    ##
    pkg_name_dict = read_annotation_mapping_file(args.annot_map)
    marker_build_dict = file_parsers.parse_ref_build_params(args.treesapp, [])
    test_obj = ConfusionTest(pkg_name_dict.keys())
    test_obj.map_data(output_dir=args.output, tool=args.tool)
    # Start from a clean slate when requested by removing the tool's previous data directory
    if args.overwrite and os.path.exists(test_obj.data_dir):
        shutil.rmtree(test_obj.data_dir)

    ##
    # Load the taxonomic trie for each reference package
    ##
    if args.tool == "treesapp":
        for pkg_name in test_obj.ref_packages:
            refpkg = test_obj.ref_packages[pkg_name]
            marker = marker_build_dict[pkg_name].cog
            refpkg.prefix = marker
            refpkg.gather_package_files(args.pkg_path)
            test_obj.ref_packages[
                pkg_name].taxa_trie = all_possible_assignments(
                    test_obj.ref_packages[pkg_name].lineage_ids)
    else:
        for gpkg in glob(args.gpkg_dir + "*gpkg"):
            # The marker name is taken as the text before the first '.' in the gpkg's basename
            marker = str(os.path.basename(gpkg).split('.')[0])
            pkg_name = fish_refpkg_from_build_params(
                marker, marker_build_dict).denominator
            if pkg_name in pkg_name_dict:
                try:
                    # pop() raises IndexError when the glob finds no taxonomy file
                    tax_ids_file = glob(
                        os.sep.join([
                            gpkg, marker + ".gpkg.refpkg",
                            marker + "*_taxonomy.csv"
                        ])).pop()
                    test_obj.ref_packages[
                        pkg_name].taxa_trie = grab_graftm_taxa(tax_ids_file)
                except IndexError:
                    logging.warning("No GraftM taxonomy file found for " +
                                    marker + ". Is this refpkg incomplete?\n")

    ##
    # Run the specified taxonomic analysis tool and collect the classifications
    ##
    assignments = {}
    test_fa_prefix = '.'.join(os.path.basename(args.input).split('.')[:-1])
    if args.tool == "treesapp":
        ref_pkgs = ','.join(pkg_name_dict.keys())
        # NOTE(review): the table is looked for under args.output while the classifier below is told
        # to write to test_obj.data_dir - confirm these resolve to the same location.
        classification_table = os.sep.join([
            args.output, "TreeSAPP_output", "final_outputs",
            "marker_contig_map.tsv"
        ])
        # Only classify when the table from a previous run is not available
        if not os.path.isfile(classification_table):
            classify_args = [
                "-i", args.input, "-t", ref_pkgs, "-n",
                str(args.num_threads), "-m", "prot", "--output",
                test_obj.data_dir, "--trim_align", "--overwrite"
            ]
            assign(classify_args)
        classification_lines = file_parsers.read_marker_classification_table(
            classification_table)
        assignments = file_parsers.parse_assignments(classification_lines)
    else:
        # Since you are only able to analyze a single reference package at a time with GraftM, this is ran iteratively
        for gpkg in glob(args.gpkg_dir + "*gpkg"):
            marker = str(os.path.basename(gpkg).split('.')[0])
            if not marker:
                logging.error("Unable to parse marker name from gpkg '" +
                              gpkg + "'\n")
                sys.exit(5)
            pkg_name = fish_refpkg_from_build_params(
                marker, marker_build_dict).denominator
            if pkg_name not in pkg_name_dict:
                logging.warning("'" + pkg_name + "' not in " + args.annot_map +
                                " and will be skipped...\n")
                continue
            output_dir = test_obj.data_dir + pkg_name + os.sep
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            classification_table = output_dir + test_fa_prefix + os.sep + test_fa_prefix + "_read_tax.tsv"
            # Only classify when the table from a previous run is not available
            if not os.path.isfile(classification_table):
                classify_call = [
                    "graftM", "graft", "--forward", args.input,
                    "--graftm_package", gpkg, "--input_sequence_type",
                    "aminoacid", "--threads",
                    str(args.num_threads), "--output_directory", output_dir,
                    "--force"
                ]
                if args.tool == "diamond":
                    classify_call += ["--assignment_method", "diamond"]
                    classify_call += ["--search_method", "diamond"]
                launch_write_command(classify_call, False)

            assignments[marker] = file_parsers.read_graftm_classifications(
                classification_table)

    if len(assignments) == 0:
        logging.error("No sequences were classified by " + args.tool + ".\n")
        sys.exit(3)

    logging.info("Reading headers in " + args.input + "... ")
    # Drop the leading '>' from any header that still carries it
    test_seq_names = [
        seq_name[1:] if seq_name[0] == '>' else seq_name
        for seq_name in get_headers(args.input)
    ]
    logging.info("done.\n")
    test_obj.num_total_queries = len(test_seq_names)
    eggnog_re = re.compile(r"^>?(COG[A-Z0-9]+|ENOG[A-Z0-9]+)_(\d+)\..*")
    test_obj.header_regex = eggnog_re

    ##
    # Bin the test sequence names into their respective confusion categories (TP, TN, FP, FN)
    ##
    test_obj.bin_headers(test_seq_names, assignments, pkg_name_dict,
                         marker_build_dict)
    test_seq_names.clear()

    ##
    # Parse the taxonomic IDs from EggNOG headers and map taxonomic lineage information to classified sequences
    ##
    # Index of the capture group in eggnog_re that holds the digits following the underscore
    _TAXID_GROUP = 2
    test_obj.retrieve_lineages(_TAXID_GROUP)
    test_obj.map_lineages()

    test_obj.bin_true_positives_by_taxdist()
    test_obj.validate_false_positives()
    test_obj.validate_false_negatives(pkg_name_dict)

    test_obj.summarise_reference_taxa(taxa_dist_output,
                                      classification_info_output, summary_rank)
    logging.debug(test_obj.summarize_type_two_taxa(summary_rank))
    logging.debug(test_obj.true_positive_taxonomic_summary(summary_rank, True))

    ##
    # Report the MCC score across different taxonomic distances - should increase with greater allowed distance
    ##
    test_obj._MAX_TAX_DIST = 6
    logging.debug(test_obj.get_info(True))
    d = 0
    mcc_string = "Tax.dist\tMCC\tTrue.Pos\tTrue.Neg\tFalse.Pos\tFalse.Neg\n"
    # One table row per distance from 0 to 7; _MAX_TAX_DIST is overwritten on every pass
    while d < 8:
        test_obj._MAX_TAX_DIST = d
        tp, remainder = test_obj.get_true_positives_at_dist()
        num_tp = len(tp)
        # The second value returned alongside the true positives is counted with the false positives
        num_fp = len(test_obj.get_false_positives()) + len(remainder)
        num_fn = len(test_obj.get_false_negatives())
        num_tn = test_obj.get_true_negatives()
        mcc = calculate_matthews_correlation_coefficient(
            num_tp, num_fp, num_fn, num_tn)
        mcc_string += "\t".join(
            [str(x) for x in [d, mcc, num_tp, num_tn, num_fp, num_fn]]) + "\n"
        d += 1
    logging.info(mcc_string)
    with open(mcc_file, 'w') as mcc_handler:
        mcc_handler.write(mcc_string)
    return