Example #1
0
def manage_refpkgs(sys_args):
    args = get_arguments(sys_args)
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    prep_logging(log_file_name=os.path.join(args.output_dir,
                                            "log_refpkg_manager_" + dt.now().strftime("%Y-%m-%d") + ".txt"),
                 verbosity=False)

    refpkg_dirs = gather_refpkg_dirs()
    ref_packages = instantiate_refpkgs(refpkg_dirs)

    # TODO: Differentiate reference packages found in directories made by treesapp update
    if args.list:
        list_refpkgs(ref_packages)
        return
    if args.name:
        filter_refpkgs_by_name(ref_packages, args.name)

    if args.template:
        build_templates(ref_packages, scheduler=args.scheduler, template=args.template)

    if args.update:
        rebuild_gather_reference_packages(ref_packages, args.output_dir)

    if args.copy:
        # Copy reference package pickle files
        if not args.treesapp_install:
            logging.error("TreeSAPP installation directory was not provided. Not sure where to copy the refpkgs!\n")
            return
        copy_refpkg_files_to_treesapp(ref_packages, args.treesapp_install)

    return
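The helper functions above come from the surrounding TreeSAPP codebase and are not shown here. Since the return value of filter_refpkgs_by_name is ignored, it presumably prunes the collection in place; a minimal sketch under that assumption, further assuming ref_packages is a dict keyed by refpkg prefix and name is a comma-separated string of prefixes:

def filter_refpkgs_by_name(ref_packages: dict, name: str) -> None:
    # Hypothetical sketch: keep only the reference packages whose prefix
    # was requested on the command line
    targets = set(name.split(','))
    for prefix in sorted(ref_packages):
        if prefix not in targets:
            ref_packages.pop(prefix)
    return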
Example #2
0
def main():
    args = get_options()
    prep_logging()
    stunted_fa = read_fasta_to_dict(args.old_fasta)
    header_map = read_name_map(args.name_map)
    new_fa = find_replace_headers(stunted_fa, header_map)
    write_new_fasta(new_fa, args.output)
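A minimal sketch of what find_replace_headers plausibly does, assuming read_fasta_to_dict returns a header-to-sequence dict and read_name_map returns an old-to-new name dict (both assumptions, not confirmed implementations):

def find_replace_headers(fasta_dict: dict, header_map: dict) -> dict:
    # Hypothetical sketch: swap each truncated header for its full name,
    # keeping any header without a mapping unchanged
    new_fa = {}
    for header, seq in fasta_dict.items():
        new_fa[header_map.get(header, header)] = seq
    return new_fa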
Example #3
0
def main():
    args = get_arguments()

    log_file_name = os.path.abspath("./TreeSAPP_auto-colour_log.txt")
    prep_logging(log_file_name, args.verbose)
    logging.info("\n##\t\t\tGenerating colour-style file for iTOL\t\t\t##\n")
    validate_command(args)

    taxa_colours = init_taxa_colours(args)  # type: TaxaColours
    read_tax_ids_file(taxa_colours)
    target_depth = find_rank_depth(args)
    taxa_colours.get_clades(target_depth)
    if args.taxa_filter:
        taxa_colours.filter_unwanted_taxa(args.taxa_filter, target_depth)
    if args.no_poly:
        # Optionally exclude polyphyletic clades from colouring (controlled by args.no_poly)
        taxa_colours.filter_polyphyletic_groups()
    if args.min_prop:
        taxa_colours.filter_rare_groups(args.min_prop)
    leaf_order = linearize_tree_leaves(args.tree)
    colours = get_colours(args, taxa_colours, args.palette)
    # Sort the nodes by their internal node order
    taxa_order = order_taxa(taxa_colours.taxon_leaf_map, leaf_order)
    palette_taxa_map = map_colours_to_taxa(taxa_order, colours)
    write_colours_styles(taxa_colours, palette_taxa_map)
    # Find the minimum set of monophyletic internal nodes for each taxon
    taxa_clades = taxa_colours.find_mono_clades()
    taxa_ranges = convert_clades_to_ranges(taxa_clades, leaf_order)
    write_colour_strip(taxa_ranges, palette_taxa_map,
                       taxa_colours.strip_output)
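map_colours_to_taxa is not defined in this excerpt; a sketch of the pairing it likely performs, assuming (hypothetically) that taxa_order maps an integer position in the linearized tree to a taxon name and that colours is a list at least as long:

import logging
import sys

def map_colours_to_taxa(taxa_order: dict, colours: list) -> dict:
    # Hypothetical sketch: pair each taxon with a palette colour in leaf order
    if len(taxa_order) > len(colours):
        logging.error("Not enough colours for the number of taxa.\n")
        sys.exit(7)
    return {taxa_order[i]: colours[i] for i in sorted(taxa_order)}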
Example #4
0
def main():
    prep_logging(log_file_name=os.path.join(
        os.getcwd(), "refpkgs_" + dt.now().strftime("%Y-%m-%d") + "_log.txt"),
                 verbosity=False)

    args = get_arguments()

    refpkg_dirs = gather_refpkg_dirs()
    ref_packages = instantiate_refpkgs(refpkg_dirs)

    # TODO: Differentiate reference packages found in directories made by treesapp update
    if args.list:
        list_refpkgs(ref_packages)
        return
    if args.name:
        filter_refpkgs_by_name(ref_packages, args.name)

    if args.template:
        build_templates(ref_packages, args.template)

    if args.update:
        rebuild_gather_reference_packages(ref_packages)

    if args.copy:
        # Copy reference package pickle files
        if not args.treesapp_install:
            logging.error(
                "TreeSAPP installation directory was not provided. Not sure where to copy the refpkgs!\n"
            )
            return
        copy_refpkg_files_to_treesapp(ref_packages, args.treesapp_install)

    return
Example #5
0
def main():
    args = get_arguments()
    prep_logging()
    uc_dict = read_uc(args.uc)
    ref_seq_names = read_seq_names_list(args.seq_names)
    equivalogs = get_equivalogs(uc_dict, ref_seq_names)
    write_equivalogs(equivalogs, args.output)
    return
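The .uc format records cluster membership from USEARCH/VSEARCH. As an illustration only, get_equivalogs might gather, for each reference sequence, the names that clustered with it; every structural detail below is an assumption:

def get_equivalogs(uc_dict: dict, ref_seq_names: list) -> dict:
    # Hypothetical sketch: uc_dict maps a cluster representative to the list
    # of sequence names that clustered with it; an equivalog set is taken to
    # be the cluster containing a reference sequence, minus that sequence
    equivalogs = {}
    for rep, members in uc_dict.items():
        cluster = {rep}.union(members)
        for ref_name in ref_seq_names:
            if ref_name in cluster:
                equivalogs[ref_name] = sorted(cluster - {ref_name})
    return equivalogs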
Example #6
0
def main():
    args = get_options()
    log_file_name = "Efetch_fasta_log.txt"
    prep_logging(log_file_name, args.verbose)
    if args.format == "stockholm":
        accessions = read_stockholm(args)
    elif args.format == "fasta":
        accessions = set()
        input_headers = get_headers(args.input)
        for header in input_headers:
            header_format_re, header_db, header_molecule = get_header_format(
                header)
            sequence_info = header_format_re.match(header)
            seq_info_tuple = return_sequence_info_groups(
                sequence_info, header_db, header)
            accessions.add(seq_info_tuple.accession)
        logging.debug(
            str(len(input_headers) - len(accessions)) +
            " duplicate accessions found in " + args.input + "\n")
    elif args.format == "list":
        accessions = read_accession_list(args)
    else:
        logging.error("Unrecognized file format '" + args.format + "'.\n")
        sys.exit(11)

    if len(accessions) == 0:
        logging.error("No accessions were read from '" + args.input + "'.\n")
        sys.exit(11)

    seq_record_list = fetch_sequences(args, accessions)

    # Generate a fasta dictionary from each of the seq_record objects
    fasta_dict = dict()
    failures = list()
    for seq_record in seq_record_list.values():
        seq_dict = seq_record.fastafy(args.seq_out)
        if seq_dict:
            fasta_dict.update(seq_dict)
        else:
            failures.append(seq_record.get_desired_accession(args.molecule_in))

    if failures:
        logging.info("Unable to fetch information from NCBI for " +
                     str(len(failures)) + '/' + str(len(accessions)) + ":\n" +
                     '\n'.join(failures) + "\n")

    write_fasta(args, fasta_dict)
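For the "list" branch above, read_accession_list is not shown; a minimal sketch assuming the input file holds one accession per line:

def read_accession_list(args) -> set:
    # Hypothetical sketch: one accession per line, blank lines skipped
    accessions = set()
    with open(args.input) as acc_handle:
        for line in acc_handle:
            accession = line.strip()
            if accession:
                accessions.add(accession)
    return accessions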
Example #7
0
def main():
    args = get_arguments()
    prep_logging()
    sto_dict = read_stockholm(args.sto_file)
    write_sto_table(sto_dict, args.tbl_out)
    return
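write_sto_table is undefined here; a sketch of the likely conversion, assuming read_stockholm returns a name-to-aligned-sequence dict:

def write_sto_table(sto_dict: dict, tbl_out: str) -> None:
    # Hypothetical sketch: flatten a Stockholm alignment dict
    # ({seq_name: aligned_seq}) into a two-column TSV
    with open(tbl_out, 'w') as tbl_handle:
        for seq_name, aligned_seq in sorted(sto_dict.items()):
            tbl_handle.write(seq_name + "\t" + aligned_seq + "\n")
    return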
Example #8
0
def mcc_calculator():
    args = get_arguments()
    log_name = args.output + os.sep + "MCC_log.txt"
    mcc_file = args.output + os.sep + "MCC_table.tsv"
    summary_rank = "Phylum"
    taxa_dist_output = args.output + os.sep + '.'.join(
        os.path.basename(
            args.input).split('.')[:-1]) + '_' + summary_rank + "_dist.tsv"
    classification_info_output = args.output + os.sep + '.'.join(
        os.path.basename(args.input).split('.')[:-1]) + "_classifications.tsv"
    prep_logging(log_name, args.verbose)
    logging.info(
        "\n##\t\t\tBeginning Matthews Correlation Coefficient analysis\t\t\t##\n"
    )
    validate_command(args, sys.argv)

    ##
    # Read the file mapping reference package name to the database annotations
    ##
    pkg_name_dict = read_annotation_mapping_file(args.annot_map)
    marker_build_dict = file_parsers.parse_ref_build_params(args.treesapp, [])
    test_obj = ConfusionTest(pkg_name_dict.keys())
    test_obj.map_data(output_dir=args.output, tool=args.tool)
    if args.overwrite and os.path.exists(test_obj.data_dir):
        shutil.rmtree(test_obj.data_dir)

    ##
    # Load the taxonomic trie for each reference package
    ##
    if args.tool == "treesapp":
        for pkg_name in test_obj.ref_packages:
            refpkg = test_obj.ref_packages[pkg_name]
            refpkg.prefix = marker_build_dict[pkg_name].cog
            refpkg.gather_package_files(args.pkg_path)
            refpkg.taxa_trie = all_possible_assignments(refpkg.lineage_ids)
    else:
        for gpkg in glob(args.gpkg_dir + "*gpkg"):
            marker = str(os.path.basename(gpkg).split('.')[0])
            pkg_name = fish_refpkg_from_build_params(
                marker, marker_build_dict).denominator
            if pkg_name in pkg_name_dict:
                try:
                    tax_ids_file = glob(
                        os.sep.join([
                            gpkg, marker + ".gpkg.refpkg",
                            marker + "*_taxonomy.csv"
                        ])).pop()
                    test_obj.ref_packages[
                        pkg_name].taxa_trie = grab_graftm_taxa(tax_ids_file)
                except IndexError:
                    logging.warning("No GraftM taxonomy file found for " +
                                    marker + ". Is this refpkg incomplete?\n")

    ##
    # Run the specified taxonomic analysis tool and collect the classifications
    ##
    assignments = {}
    test_fa_prefix = '.'.join(os.path.basename(args.input).split('.')[:-1])
    if args.tool == "treesapp":
        ref_pkgs = ','.join(pkg_name_dict.keys())
        classification_table = os.sep.join([
            args.output, "TreeSAPP_output", "final_outputs",
            "marker_contig_map.tsv"
        ])
        if not os.path.isfile(classification_table):
            classify_args = [
                "-i", args.input, "-t", ref_pkgs, "-n",
                str(args.num_threads), "-m", "prot", "--output",
                test_obj.data_dir, "--trim_align", "--overwrite"
            ]
            assign(classify_args)
        classification_lines = file_parsers.read_marker_classification_table(
            classification_table)
        assignments = file_parsers.parse_assignments(classification_lines)
    else:
        # Since GraftM can only analyze a single reference package at a time, it is run iteratively
        for gpkg in glob(args.gpkg_dir + "*gpkg"):
            marker = str(os.path.basename(gpkg).split('.')[0])
            if not marker:
                logging.error("Unable to parse marker name from gpkg '" +
                              gpkg + "'\n")
                sys.exit(5)
            pkg_name = fish_refpkg_from_build_params(
                marker, marker_build_dict).denominator
            if pkg_name not in pkg_name_dict:
                logging.warning("'" + pkg_name + "' not in " + args.annot_map +
                                " and will be skipped...\n")
                continue
            output_dir = test_obj.data_dir + pkg_name + os.sep
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            classification_table = output_dir + test_fa_prefix + os.sep + test_fa_prefix + "_read_tax.tsv"
            if not os.path.isfile(classification_table):
                classify_call = [
                    "graftM", "graft", "--forward", args.input,
                    "--graftm_package", gpkg, "--input_sequence_type",
                    "aminoacid", "--threads",
                    str(args.num_threads), "--output_directory", output_dir,
                    "--force"
                ]
                if args.tool == "diamond":
                    classify_call += ["--assignment_method", "diamond"]
                    classify_call += ["--search_method", "diamond"]
                launch_write_command(classify_call, False)

            assignments[marker] = file_parsers.read_graftm_classifications(
                classification_table)

    if len(assignments) == 0:
        logging.error("No sequences were classified by " + args.tool + ".\n")
        sys.exit(3)

    logging.info("Reading headers in " + args.input + "... ")
    test_seq_names = [
        seq_name[1:] if seq_name[0] == '>' else seq_name
        for seq_name in get_headers(args.input)
    ]
    logging.info("done.\n")
    test_obj.num_total_queries = len(test_seq_names)
    eggnog_re = re.compile(r"^>?(COG[A-Z0-9]+|ENOG[A-Z0-9]+)_(\d+)\..*")
    test_obj.header_regex = eggnog_re

    ##
    # Bin the test sequence names into their respective confusion categories (TP, TN, FP, FN)
    ##
    test_obj.bin_headers(test_seq_names, assignments, pkg_name_dict,
                         marker_build_dict)
    test_seq_names.clear()

    ##
    # Parse the taxonomic IDs from EggNOG headers and map taxonomic lineage information to classified sequences
    ##
    _TAXID_GROUP = 2
    test_obj.retrieve_lineages(_TAXID_GROUP)
    test_obj.map_lineages()

    test_obj.bin_true_positives_by_taxdist()
    test_obj.validate_false_positives()
    test_obj.validate_false_negatives(pkg_name_dict)

    test_obj.summarise_reference_taxa(taxa_dist_output,
                                      classification_info_output, summary_rank)
    logging.debug(test_obj.summarize_type_two_taxa(summary_rank))
    logging.debug(test_obj.true_positive_taxonomic_summary(summary_rank, True))

    ##
    # Report the MCC score across different taxonomic distances - should increase with greater allowed distance
    ##
    test_obj._MAX_TAX_DIST = 6
    logging.debug(test_obj.get_info(True))
    mcc_string = "Tax.dist\tMCC\tTrue.Pos\tTrue.Neg\tFalse.Pos\tFalse.Neg\n"
    for d in range(8):
        test_obj._MAX_TAX_DIST = d
        tp, remainder = test_obj.get_true_positives_at_dist()
        num_tp = len(tp)
        num_fp = len(test_obj.get_false_positives()) + len(remainder)
        num_fn = len(test_obj.get_false_negatives())
        num_tn = test_obj.get_true_negatives()
        mcc = calculate_matthews_correlation_coefficient(
            num_tp, num_fp, num_fn, num_tn)
        mcc_string += "\t".join(
            [str(x) for x in [d, mcc, num_tp, num_tn, num_fp, num_fn]]) + "\n"
    logging.info(mcc_string)
    with open(mcc_file, 'w') as mcc_handler:
        mcc_handler.write(mcc_string)
    return
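calculate_matthews_correlation_coefficient is not shown in this excerpt; the standard MCC formula it presumably implements is sketched below (the zero-denominator guard is an assumption):

from math import sqrt

def calculate_matthews_correlation_coefficient(num_tp: int, num_fp: int,
                                               num_fn: int, num_tn: int) -> float:
    # MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN))
    numerator = (num_tp * num_tn) - (num_fp * num_fn)
    denominator = sqrt((num_tp + num_fp) * (num_tp + num_fn) *
                       (num_tn + num_fp) * (num_tn + num_fn))
    if denominator == 0:
        # Assumed convention: report 0 when any confusion-matrix margin is empty
        return 0.0
    return numerator / denominator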