def manage_refpkgs(sys_args):
    """
    Run the reference package management actions selected on the command line.

    The parsed arguments decide which steps run, in this order: listing (which
    returns immediately afterwards), filtering by name, building templates,
    rebuilding the reference packages and copying the reference package files
    into a TreeSAPP installation directory.

    :param sys_args: Command-line arguments handed to get_arguments()
    :return: None
    """
    args = get_arguments(sys_args)

    # makedirs() also creates any missing parent directories and, with exist_ok,
    # does not fail when the directory is already present.
    os.makedirs(args.output_dir, exist_ok=True)

    log_file = os.path.join(args.output_dir,
                            "log_refpkg_manager_" + dt.now().strftime("%Y-%m-%d") + ".txt")
    prep_logging(log_file_name=log_file, verbosity=False)

    refpkg_dirs = gather_refpkg_dirs()
    ref_packages = instantiate_refpkgs(refpkg_dirs)
    # TODO: Differentiate reference packages found in directories made by treesapp update

    if args.list:
        # Listing is a stand-alone action: nothing else is done afterwards
        list_refpkgs(ref_packages)
        return

    if args.name:
        # The return value is ignored, so this presumably filters ref_packages in place - TODO confirm
        filter_refpkgs_by_name(ref_packages, args.name)

    if args.template:
        build_templates(ref_packages, scheduler=args.scheduler, template=args.template)

    if args.update:
        rebuild_gather_reference_packages(ref_packages, args.output_dir)

    if args.copy:
        # Copy reference package pickle files
        if not args.treesapp_install:
            logging.error("TreeSAPP installation directory was not provided. Not sure where to copy the refpkgs!\n")
            return
        copy_refpkg_files_to_treesapp(ref_packages, args.treesapp_install)

    return
def main():
    """
    Replace the headers of a FASTA file according to a name map.

    Reads args.old_fasta and args.name_map, hands both to
    find_replace_headers() and writes what it returns to args.output.

    :return: None
    """
    args = get_options()
    prep_logging()

    original_records = read_fasta_to_dict(args.old_fasta)
    name_map = read_name_map(args.name_map)

    write_new_fasta(find_replace_headers(original_records, name_map), args.output)
def main():
    """
    Generate the colour-style and colour-strip files for iTOL.

    Clades are gathered at the depth returned by find_rank_depth(), optionally
    filtered (unwanted taxa, polyphyletic groups, rare groups), assigned
    colours following the order of the tree's leaves and finally written out.

    :return: None
    """
    args = get_arguments()
    prep_logging(os.path.abspath("./TreeSAPP_auto-colour_log.txt"), args.verbose)
    logging.info("\n##\t\t\tGenerating colour-style file for iTOL\t\t\t##\n")

    validate_command(args)

    colour_scheme = init_taxa_colours(args)  # type: TaxaColours
    read_tax_ids_file(colour_scheme)

    rank_depth = find_rank_depth(args)
    colour_scheme.get_clades(rank_depth)

    # Each of the following filters only runs when its argument was provided
    if args.taxa_filter:
        colour_scheme.filter_unwanted_taxa(args.taxa_filter, rank_depth)
    if args.no_poly:
        # Polyphyletic clades are left uncoloured on request
        colour_scheme.filter_polyphyletic_groups()
    if args.min_prop:
        colour_scheme.filter_rare_groups(args.min_prop)

    tree_leaf_order = linearize_tree_leaves(args.tree)
    palette = get_colours(args, colour_scheme, args.palette)

    # Taxa are ordered using the leaf order of the tree before colours are mapped to them
    ordered_taxa = order_taxa(colour_scheme.taxon_leaf_map, tree_leaf_order)
    taxon_palette = map_colours_to_taxa(ordered_taxa, palette)
    write_colours_styles(colour_scheme, taxon_palette)

    # The smallest set of monophyletic internal nodes per taxon is converted to leaf ranges
    mono_clades = colour_scheme.find_mono_clades()
    clade_ranges = convert_clades_to_ranges(mono_clades, tree_leaf_order)
    write_colour_strip(clade_ranges, taxon_palette, colour_scheme.strip_output)
def main():
    """
    Script entry point for managing reference packages.

    Logging is set up first, writing to a dated log file in the current working
    directory. The command-line arguments then select which actions run:
    listing (returns straight away), filtering by name, building templates,
    rebuilding, and copying the files into a TreeSAPP installation.

    :return: None
    """
    log_path = os.path.join(os.getcwd(),
                            "refpkgs_" + dt.now().strftime("%Y-%m-%d") + "_log.txt")
    prep_logging(log_file_name=log_path, verbosity=False)

    args = get_arguments()
    ref_packages = instantiate_refpkgs(gather_refpkg_dirs())
    # TODO: Differentiate reference packages found in directories made by treesapp update

    if args.list:
        list_refpkgs(ref_packages)
        return

    if args.name:
        filter_refpkgs_by_name(ref_packages, args.name)
    if args.template:
        build_templates(ref_packages, args.template)
    if args.update:
        rebuild_gather_reference_packages(ref_packages)

    if not args.copy:
        return

    # Copy reference package pickle files, which needs a destination directory
    if args.treesapp_install:
        copy_refpkg_files_to_treesapp(ref_packages, args.treesapp_install)
    else:
        logging.error(
            "TreeSAPP installation directory was not provided. Not sure where to copy the refpkgs!\n"
        )
    return
def main():
    """
    Derive equivalogs from a UC file and a list of reference sequence names.

    The contents of args.uc and args.seq_names are passed to get_equivalogs()
    and its result is written to args.output by write_equivalogs().

    :return: None
    """
    args = get_arguments()
    prep_logging()

    clusters = read_uc(args.uc)
    reference_names = read_seq_names_list(args.seq_names)

    write_equivalogs(get_equivalogs(clusters, reference_names), args.output)
    return
def main():
    """
    Collect accessions from the input file, fetch their records and write a FASTA file.

    Three input formats are understood ("stockholm", "fasta" and "list"); any
    other value of args.format, or an input yielding no accessions, is logged
    as an error and ends the program with exit status 11. Records for which
    fastafy() returns nothing are reported as fetch failures.

    :return: None
    """
    args = get_options()
    prep_logging("Efetch_fasta_log.txt", args.verbose)

    input_format = args.format
    if input_format == "list":
        accessions = read_accession_list(args)
    elif input_format == "stockholm":
        accessions = read_stockholm(args)
    elif input_format == "fasta":
        headers = get_headers(args.input)
        accessions = set()
        for header in headers:
            # The third element (molecule) of the header format is not needed here
            header_regex, database, _ = get_header_format(header)
            regex_match = header_regex.match(header)
            info = return_sequence_info_groups(regex_match, database, header)
            accessions.add(info.accession)
        # Accessions are held in a set, so any shortfall against the header count is duplicates
        num_duplicates = len(headers) - len(accessions)
        logging.debug(str(num_duplicates) + " duplicate accessions found in " + args.input + "\n")
    else:
        logging.error("Unrecognized file format '" + args.format + "'.\n")
        sys.exit(11)

    if not len(accessions):
        logging.error("No accessions were read from '" + args.input + "'.\n")
        sys.exit(11)

    seq_record_list = fetch_sequences(args, accessions)

    # Merge the per-record FASTA dictionaries, keeping note of records that yielded nothing
    fasta_dict = {}
    failures = []
    for record in seq_record_list.values():
        record_fasta = record.fastafy(args.seq_out)
        if not record_fasta:
            failures.append(record.get_desired_accession(args.molecule_in))
            continue
        fasta_dict.update(record_fasta)

    if failures:
        failure_ratio = str(len(failures)) + '/' + str(len(accessions))
        logging.info("Unable to fetch information from NCBI for " + failure_ratio +
                     ":\n" + '\n'.join(failures) + "\n")

    write_fasta(args, fasta_dict)
def main():
    """
    Convert a Stockholm file into a table.

    The file at args.sto_file is parsed by read_stockholm() and the result is
    written to args.tbl_out by write_sto_table().

    :return: None
    """
    args = get_arguments()
    prep_logging()

    parsed_stockholm = read_stockholm(args.sto_file)
    write_sto_table(parsed_stockholm, args.tbl_out)
    return
def _input_file_prefix(file_path):
    """
    Return the basename of file_path with its final '.'-separated field removed.

    A basename that contains no '.' yields an empty string.

    :param file_path: Path to a file
    :return: String used as the prefix of output file names
    """
    return '.'.join(os.path.basename(file_path).split('.')[:-1])


def mcc_calculator():
    """
    Run a Matthews Correlation Coefficient (MCC) analysis of a classification tool.

    The query sequences in args.input are classified with the tool named by
    args.tool ("treesapp", otherwise GraftM, optionally in "diamond" mode),
    unless the classification tables already exist. Query headers are then
    binned into confusion categories and the MCC is computed for taxonomic
    distances 0 through 7. The resulting table is logged and written to
    "MCC_table.tsv" inside args.output.

    Exits with status 5 when a marker name cannot be parsed from a GraftM
    package path and with status 3 when no sequences were classified.

    :return: None
    """
    args = get_arguments()
    summary_rank = "Phylum"

    # os.path.join() places every output inside args.output whether or not that
    # path ends with a separator; bare concatenation only did so when it ended with one.
    input_prefix = _input_file_prefix(args.input)
    log_name = os.path.join(args.output, "MCC_log.txt")
    mcc_file = os.path.join(args.output, "MCC_table.tsv")
    taxa_dist_output = os.path.join(args.output, input_prefix + '_' + summary_rank + "_dist.tsv")
    classification_info_output = os.path.join(args.output, input_prefix + "_classifications.tsv")

    prep_logging(log_name, args.verbose)
    logging.info("\n##\t\t\tBeginning Matthews Correlation Coefficient analysis\t\t\t##\n")
    validate_command(args, sys.argv)

    ##
    # Read the file mapping reference package name to the database annotations
    ##
    pkg_name_dict = read_annotation_mapping_file(args.annot_map)
    marker_build_dict = file_parsers.parse_ref_build_params(args.treesapp, [])
    test_obj = ConfusionTest(pkg_name_dict.keys())
    test_obj.map_data(output_dir=args.output, tool=args.tool)
    if args.overwrite and os.path.exists(test_obj.data_dir):
        shutil.rmtree(test_obj.data_dir)

    ##
    # Load the taxonomic trie for each reference package
    ##
    if args.tool == "treesapp":
        for pkg_name, refpkg in test_obj.ref_packages.items():
            refpkg.prefix = marker_build_dict[pkg_name].cog
            refpkg.gather_package_files(args.pkg_path)
            refpkg.taxa_trie = all_possible_assignments(refpkg.lineage_ids)
    else:
        for gpkg in glob(args.gpkg_dir + "*gpkg"):
            marker = str(os.path.basename(gpkg).split('.')[0])
            pkg_name = fish_refpkg_from_build_params(marker, marker_build_dict).denominator
            if pkg_name in pkg_name_dict:
                try:
                    # pop() raises IndexError when the glob matched nothing
                    tax_ids_file = glob(os.sep.join([gpkg,
                                                     marker + ".gpkg.refpkg",
                                                     marker + "*_taxonomy.csv"])).pop()
                    test_obj.ref_packages[pkg_name].taxa_trie = grab_graftm_taxa(tax_ids_file)
                except IndexError:
                    logging.warning("No GraftM taxonomy file found for " + marker +
                                    ". Is this refpkg incomplete?\n")

    ##
    # Run the specified taxonomic analysis tool and collect the classifications
    ##
    assignments = {}
    # Derived after validate_command(), exactly where it was computed originally
    test_fa_prefix = _input_file_prefix(args.input)
    if args.tool == "treesapp":
        ref_pkgs = ','.join(pkg_name_dict.keys())
        classification_table = os.path.join(args.output, "TreeSAPP_output",
                                            "final_outputs", "marker_contig_map.tsv")
        # An existing classification table is reused instead of classifying again
        if not os.path.isfile(classification_table):
            classify_args = ["-i", args.input,
                             "-t", ref_pkgs,
                             "-n", str(args.num_threads),
                             "-m", "prot",
                             "--output", test_obj.data_dir,
                             "--trim_align", "--overwrite"]
            assign(classify_args)
        classification_lines = file_parsers.read_marker_classification_table(classification_table)
        assignments = file_parsers.parse_assignments(classification_lines)
    else:
        # Since you are only able to analyze a single reference package at a time with GraftM,
        # this is run iteratively
        for gpkg in glob(args.gpkg_dir + "*gpkg"):
            marker = str(os.path.basename(gpkg).split('.')[0])
            if not marker:
                logging.error("Unable to parse marker name from gpkg '" + gpkg + "'\n")
                sys.exit(5)
            pkg_name = fish_refpkg_from_build_params(marker, marker_build_dict).denominator
            if pkg_name not in pkg_name_dict:
                logging.warning("'" + pkg_name + "' not in " + args.annot_map + " and will be skipped...\n")
                continue

            # NOTE(review): concatenation assumes test_obj.data_dir ends with a path separator - confirm
            output_dir = test_obj.data_dir + pkg_name + os.sep
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)

            classification_table = output_dir + test_fa_prefix + os.sep + test_fa_prefix + "_read_tax.tsv"
            if not os.path.isfile(classification_table):
                classify_call = ["graftM", "graft",
                                 "--forward", args.input,
                                 "--graftm_package", gpkg,
                                 "--input_sequence_type", "aminoacid",
                                 "--threads", str(args.num_threads),
                                 "--output_directory", output_dir,
                                 "--force"]
                if args.tool == "diamond":
                    classify_call += ["--assignment_method", "diamond"]
                    classify_call += ["--search_method", "diamond"]
                launch_write_command(classify_call, False)

            assignments[marker] = file_parsers.read_graftm_classifications(classification_table)

    if not assignments:
        logging.error("No sequences were classified by " + args.tool + ".\n")
        sys.exit(3)

    logging.info("Reading headers in " + args.input + "... ")
    # startswith() tolerates an empty header, where indexing the first character would raise
    test_seq_names = [seq_name[1:] if seq_name.startswith('>') else seq_name
                      for seq_name in get_headers(args.input)]
    logging.info("done.\n")
    test_obj.num_total_queries = len(test_seq_names)
    eggnog_re = re.compile(r"^>?(COG[A-Z0-9]+|ENOG[A-Z0-9]+)_(\d+)\..*")
    test_obj.header_regex = eggnog_re

    ##
    # Bin the test sequence names into their respective confusion categories (TP, TN, FP, FN)
    ##
    test_obj.bin_headers(test_seq_names, assignments, pkg_name_dict, marker_build_dict)
    test_seq_names.clear()

    ##
    # Parse the taxonomic IDs from EggNOG headers and map taxonomic lineage information to classified sequences
    ##
    _TAXID_GROUP = 2  # The second capture group of eggnog_re, i.e. the run of digits after the underscore
    test_obj.retrieve_lineages(_TAXID_GROUP)
    test_obj.map_lineages()

    test_obj.bin_true_positives_by_taxdist()
    test_obj.validate_false_positives()
    test_obj.validate_false_negatives(pkg_name_dict)

    test_obj.summarise_reference_taxa(taxa_dist_output, classification_info_output, summary_rank)
    logging.debug(test_obj.summarize_type_two_taxa(summary_rank))
    logging.debug(test_obj.true_positive_taxonomic_summary(summary_rank, True))

    ##
    # Report the MCC score across different taxonomic distances - should increase with greater allowed distance
    ##
    test_obj._MAX_TAX_DIST = 6
    logging.debug(test_obj.get_info(True))

    mcc_rows = ["Tax.dist\tMCC\tTrue.Pos\tTrue.Neg\tFalse.Pos\tFalse.Neg\n"]
    for d in range(8):
        test_obj._MAX_TAX_DIST = d
        tp, remainder = test_obj.get_true_positives_at_dist()
        num_tp = len(tp)
        # The second value returned alongside the true positives is added to the false positive count
        num_fp = len(test_obj.get_false_positives()) + len(remainder)
        num_fn = len(test_obj.get_false_negatives())
        num_tn = test_obj.get_true_negatives()
        mcc = calculate_matthews_correlation_coefficient(num_tp, num_fp, num_fn, num_tn)
        mcc_rows.append("\t".join(str(x) for x in (d, mcc, num_tp, num_tn, num_fp, num_fn)) + "\n")
    mcc_string = ''.join(mcc_rows)

    logging.info(mcc_string)
    with open(mcc_file, 'w') as mcc_handler:
        mcc_handler.write(mcc_string)
    return