def plot_tree(newick_in_fn, out_plot_fn, attribute_name):
    tree = pro.load_nhx_tree(newick_in_fn)

    ts = ete3.TreeStyle()
    ts.show_leaf_name = False

    def my_layout(node):
        name = getattr(node, attribute_name)

        try:
            kmer_full = locale.format("%d", int(node.kmers_full), grouping=True)
        except AttributeError:
            kmer_full = None

        try:
            kmer_reduced = locale.format("%d", int(node.kmers_reduced), grouping=True)
        except AttributeError:
            kmer_reduced = None

        if kmer_full is None:
            if kmer_reduced is None:
                t = name
            else:
                t = "{} [red. {}]".format(name, kmer_reduced)
        else:
            if kmer_reduced is None:
                t = "{} [full {}]".format(name, kmer_full)
            else:
                t = "{} [full {} & red. {}]".format(name, kmer_full, kmer_reduced)

        f = ete3.TextFace(t, tight_text=True)
        ete3.add_face_to_node(f, node, column=0, position="branch-right")

    ts.layout_fn = my_layout
    tree.render(out_plot_fn, tree_style=ts)
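
# Illustration only (not part of the original module): a hypothetical call to
# plot_tree(). The file names are placeholders; rendering assumes ete3's graphical
# backend (Qt) is available, and ete3 picks the output format from the extension
# of the target file (.pdf, .svg, .png).
#
# plot_tree("index/tree.nw", "tree.pdf", attribute_name="name")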
def main():
    parser = argparse.ArgumentParser(description='Verify a Newick/NHX tree')

    parser.add_argument(
        'tree',
        metavar='<tree.nw>',
        type=str,
        nargs='+',
        help='phylogenetic tree (in Newick/NHX)',
    )

    args = parser.parse_args()
    tree_fns = args.tree

    ok = True
    for tree_fn in tree_fns:
        print("Validating '{}'".format(tree_fn))
        tree = pro.load_nhx_tree(tree_fn, validate=False)
        r = pro.validate_prophyle_nhx_tree(tree, verbose=True, throw_exceptions=False, output_fo=sys.stdout)
        if r:
            print(" ...OK")
        else:
            ok = False
        print()

    sys.exit(0 if ok else 1)
def _test_tree(fn): """Test if given tree is valid for ProPhyle. Args: fn (str): Newick/NHX tree. """ tree = pro.load_nhx_tree(fn, validate=False) if not pro.validate_prophyle_nhx_tree( tree, verbose=True, throw_exceptions=False, output_fo=sys.stderr): error("The tree '{}' could not be properly parsed.".format(fn))
def _test_tree(fn): """Test if given tree is valid for ProPhyle. Args: fn (str): Newick/NHX tree. Raises: AssertionError: The tree is not valid. """ tree = pro.load_nhx_tree(fn, validate=False) assert pro.validate_prophyle_nhx_tree(tree, verbose=True, throw_exceptions=False, output_fo=sys.stderr)
def __init__(self, tree_newick_fn, index_dir, library_dir, makefile_fn):
    """Init the class.

    Args:
        tree_newick_fn (str): Tree file name.
        index_dir (str): Directory of the index.
        library_dir (str): Directory with FASTA files.
        makefile_fn (str): Output Makefile.
    """
    self.tree_newick_fn = tree_newick_fn
    tree = pro.load_nhx_tree(tree_newick_fn)
    self.tree = pro.minimal_subtree(tree)
    self.newick_dir = os.path.dirname(tree_newick_fn)
    self.index_dir = index_dir
    self.library_dir = library_dir
    self.makefile_fn = makefile_fn
    pro.makedirs(self.index_dir)
def __init__(self, tree_newick_fn, k):
    tree = pro.load_nhx_tree(tree_newick_fn)
    self.tree = pro.minimal_subtree(tree)
    self.k = k

    self.nodename_to_node = {}
    self.nodename_to_kmercount = {}
    self.nodename_to_samannot = {}
    self.nodename_to_upnodenames = collections.defaultdict(lambda: set())

    for node in self.tree.traverse("postorder"):
        nodename = node.name
        self.nodename_to_node[nodename] = node
        self.nodename_to_kmercount[nodename] = int(node.kmers_full)

        # annotations
        tags_parts = []

        try:
            tags_parts.append("gi:Z:{}".format(node.gi))
        except AttributeError:
            pass

        try:
            tags_parts.append("sn:Z:{}".format(node.sci_name))
        except AttributeError:
            pass

        try:
            tags_parts.append("ra:Z:{}".format(node.rank))
        except AttributeError:
            pass

        self.nodename_to_samannot[nodename] = "\t".join(tags_parts)

        # set of upper nodes
        while node.up:
            node = node.up
            self.nodename_to_upnodenames[nodename].add(node.name)
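
# Illustration only (not part of the original module): a self-contained sketch using a
# toy in-memory tree carrying the same NHX attributes (kmers_full, gi, sci_name, rank)
# that the constructor above reads from a file, run through the same optional-tag logic
# that builds the SAM annotation string. Node names and attribute values are made up.
def _samannot_example_sketch():
    import ete3

    t = ete3.Tree("(leaf1:1,leaf2:1)root:0;", format=1)
    leaf = t & "leaf1"
    leaf.add_features(kmers_full=1500, gi="12345", sci_name="Escherichia coli", rank="species")

    tags_parts = []
    for attr, tag in (("gi", "gi"), ("sci_name", "sn"), ("rank", "ra")):
        try:
            tags_parts.append("{}:Z:{}".format(tag, getattr(leaf, attr)))
        except AttributeError:
            pass

    # -> "gi:Z:12345\tsn:Z:Escherichia coli\tra:Z:species"
    return "\t".join(tags_parts)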
def __init__(self, tree_newick_fn):
    self.tree_newick_fn = tree_newick_fn
    self.tree = pro.load_nhx_tree(tree_newick_fn)
def prophyle_index(
    index_dir, threads, k, trees_fn, library_dir, construct_klcp, force, no_prefixes, mask_repeats, keep_tmp_files,
    sampling_rate, autocomplete
):
    """Build a ProPhyle index.

    Args:
        index_dir (str): Index directory.
        threads (int): Number of threads in k-mer propagation.
        k (int): K-mer size.
        trees_fn (list of str): Newick/NHX trees, possibly with a root spec (@root).
        library_dir (str): Library directory.
        construct_klcp (bool): Generate k-LCP.
        force (bool): Rewrite files if they already exist.
        no_prefixes (bool): Don't prepend prefixes to node names during tree merging.
        mask_repeats (bool): Mask repeats using DustMasker.
        keep_tmp_files (bool): Keep temporary files from k-mer propagation.
        sampling_rate (float): Sampling rate for subsampling the tree or None for no subsampling.
        autocomplete (bool): Autocomplete names of internal nodes and fasta paths.
    """

    assert isinstance(k, int)
    assert isinstance(threads, int)
    assert k > 1
    assert threads > 0
    assert sampling_rate is None or 0.0 <= float(sampling_rate) <= 1.0

    _compile_prophyle_bin(parallel=True)

    index_fa = os.path.join(index_dir, 'index.fa')
    index_tree_1 = os.path.join(index_dir, 'tree.preliminary.nw')
    index_tree_2 = os.path.join(index_dir, 'tree.nw')

    # recompute = recompute everything from now on
    # force==True => start to recompute everything from beginning
    recompute = force

    # make index dir
    pro.makedirs(index_dir)

    #
    # 1) Newick
    #

    #if not _is_complete(index_dir, 1) or not pro.existing_and_newer_list(trees_fn, index_tree_1):
    if not _is_complete(index_dir, 1):
        recompute = True

    if recompute:
        pro.message('[1/6] Copying/merging trees', upper=True)

        for tree_fn in trees_fn:
            tree_fn, _, root = tree_fn.partition("@")
            tree = pro.load_nhx_tree(tree_fn, validate=False)
            # postpone for autocomplete
            if not autocomplete:
                pro.validate_prophyle_nhx_tree(tree)
            if root != "":
                assert len(tree.search_nodes(name=root)) != 0, "Node '{}' does not exist in '{}'.".format(root, tree_fn)

        if len(trees_fn) != 1:
            pro.message('Merging {} trees'.format(len(trees_fn)))

        _propagation_preprocessing(
            trees_fn, index_tree_1, no_prefixes=no_prefixes, sampling_rate=sampling_rate, autocomplete=autocomplete
        )
        _test_tree(index_tree_1)
        _mark_complete(index_dir, 1)
    else:
        pro.message('[1/6] Tree already exists, skipping its creation', upper=True)

    #
    # 2) Create and run Makefile for propagation, and merge FASTA files
    #

    if not _is_complete(index_dir, 2):
        recompute = True

    if recompute:
        pro.message('[2/6] Running k-mer propagation', upper=True)
        _create_makefile(index_dir, k, library_dir, mask_repeats=mask_repeats)
        _propagate(index_dir, threads=threads)
        _propagation_postprocessing(index_dir, index_tree_1, index_tree_2)
        _test_tree(index_tree_2)
        _kmer_stats(index_dir)
        if not keep_tmp_files:
            _remove_tmp_propagation_files(index_dir)
        else:
            pro.message('Keeping temporary files')
        _mark_complete(index_dir, 2)
    else:
        pro.message('[2/6] K-mers have already been propagated, skipping propagation', upper=True)

    #
    # 3) BWT
    #

    if not _is_complete(index_dir, 3) and not _is_complete(index_dir, 4, dont_check_previous=True):
        recompute = True

    if recompute:
        pro.message('[3/6] Constructing BWT', upper=True)
        pro.rm(index_fa + '.bwt', index_fa + '.bwt.complete')
        _fa2pac(index_fa)
        _pac2bwt(index_fa)
        _mark_complete(index_dir, 3)
    else:
        pro.message('[3/6] BWT already exists, skipping its construction', upper=True)

    #
    # 4) OCC
    #

    if not _is_complete(index_dir, 4):
        recompute = True

    if recompute:
        pro.message('[4/6] Constructing OCC', upper=True)
        _bwt2bwtocc(index_fa)
        _mark_complete(index_dir, 4)
    else:
        pro.message('[4/6] OCC already exists, skipping their construction', upper=True)

    #
    # 5) SA + 6) KLCP (compute SA + KLCP in parallel)
    #

    klcp_fn = "{}.{}.klcp".format(index_fa, k)

    if construct_klcp:
        if not _is_complete(index_dir, 5):
            # SA not computed yet => compute it in parallel with KLCP
            recompute = True

        if recompute:
            pro.message('[5/6],[6/6] Constructing SA + KLCP in parallel', upper=True)
            _bwtocc2sa_klcp(index_fa, k)
            _mark_complete(index_dir, 5)
            _mark_complete(index_dir, 6)
            return

    #
    # 5) SA (compute only SA)
    #

    if not _is_complete(index_dir, 5):
        recompute = True

    if recompute:
        pro.message('[5/6] Constructing SA', upper=True)
        _bwtocc2sa(index_fa)
    else:
        pro.message('[5/6] SA already exists, skipping its construction', upper=True)

    #
    # 6) KLCP (compute only KLCP)
    #

    if construct_klcp:
        if not _is_complete(index_dir, 6):
            recompute = True

        if recompute:
            pro.message('[6/6] Constructing k-LCP', upper=True)
            _bwtocc2klcp(index_fa, k)
            _mark_complete(index_dir, 6)
        else:
            pro.message('[6/6] k-LCP already exists, skipping its construction', upper=True)
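
# Illustration only (not part of the original module): the checkpoint helpers
# _is_complete() and _mark_complete() used above are not shown in this excerpt.
# The sketch below shows one possible scheme, assuming completion is tracked with
# empty per-step marker files inside the index directory; ProPhyle's actual
# implementation and file names may differ.
import os  # repeated here only so the sketch is self-contained


def _mark_complete_sketch(index_dir, step):
    # Touch an empty marker file ".complete.<step>" in the index directory
    # (assumed naming scheme, for illustration only).
    open(os.path.join(index_dir, ".complete.{}".format(step)), "w").close()


def _is_complete_sketch(index_dir, step, dont_check_previous=False):
    # A step counts as complete only if its marker exists and, unless
    # dont_check_previous is set, all earlier steps are complete as well.
    first = step if dont_check_previous else 1
    return all(
        os.path.isfile(os.path.join(index_dir, ".complete.{}".format(s)))
        for s in range(first, step + 1)
    )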
def main():
    parser = argparse.ArgumentParser(
        description='K-mer propagation postprocessing: merging FASTA files and k-mer annotation.'
    )

    parser.add_argument('dir', metavar='<propagation.dir>', type=str, help='directory with FASTA files')

    parser.add_argument(
        'index_fasta_fn',
        type=str,
        metavar='<index.fa>',
        help='output FASTA file',
    )

    parser.add_argument(
        'in_tree_fn',
        type=str,
        metavar='<in.tree.nw>',
        help='input phylogenetic tree',
    )

    parser.add_argument(
        'counts_fn',
        type=str,
        metavar='<counts.tsv>',
        help='input k-mer counts (TSV)',
    )

    parser.add_argument(
        'out_tree_fn',
        type=str,
        metavar='<out.tree.nw>',
        help='output phylogenetic tree',
    )

    #parser.add_argument(
    #    '-D',
    #    dest='nondel',
    #    action='store_true',
    #    help='Non-deleting propagation',
    #)

    args = parser.parse_args()

    dir_fn = args.dir
    index_fasta_fn = args.index_fasta_fn
    in_tree_fn = args.in_tree_fn
    out_tree_fn = args.out_tree_fn
    tsv_fn = args.counts_fn

    suffix = "reduced.fa"
    #if args.nondel:
    #    suffix = "full.fa"
    #else:
    #    suffix = "reduced.fa"

    create_fasta(dir_fn, index_fasta_fn, suffix)

    tree = pro.load_nhx_tree(in_tree_fn)
    stats = load_kmer_stats(tsv_fn)
    enrich_tree(tree, stats)
    pro.save_nhx_tree(tree, out_tree_fn)
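
# Illustration only (not part of the original module): create_fasta(), load_kmer_stats()
# and enrich_tree() used above are not shown in this excerpt. The sketch below shows one
# possible shape of the k-mer statistics loader, assuming a TSV with one node per line
# ("node_name <TAB> kmers_full <TAB> kmers_reduced"); the real column layout may differ.
def _load_kmer_stats_sketch(tsv_fn):
    import csv

    stats = {}
    with open(tsv_fn) as f:
        for row in csv.reader(f, delimiter="\t"):
            if not row or row[0].startswith("#"):
                continue
            # Per-node counts that enrich_tree() would attach as the NHX attributes
            # kmers_full / kmers_reduced used elsewhere in this code.
            stats[row[0]] = {"kmers_full": int(row[1]), "kmers_reduced": int(row[2])}
    return stats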
def merge_trees(input_trees_fn, output_tree_fn, verbose, add_prefixes, sampling_rate, autocomplete):
    assert sampling_rate is None or 0.0 <= float(sampling_rate) <= 1.0

    t = ete3.Tree(name="merge_root")

    if len(input_trees_fn) == 1:
        if verbose:
            print("Only one tree, don't add any prefix", file=sys.stderr)
        add_prefixes = False

    for i, x in enumerate(input_trees_fn, 1):
        if verbose:
            print("Loading '{}'".format(x), file=sys.stderr)

        tree_fn, _, root_name = x.partition("@")
        tree_to_add = pro.load_nhx_tree(tree_fn, validate=False)

        # subtree extraction required
        if root_name != '':
            tree_to_add = tree_to_add & root_name

        # prepend prefixes to node names
        if add_prefixes:
            tree_to_add = add_prefix(tree_to_add, i)

        t.add_child(tree_to_add)

    if autocomplete:
        if not pro.has_attribute(t, "fastapath"):
            t = autocomplete_fastapath(t)
        t = autocomplete_internal_node_names(t)

    if sampling_rate is not None:
        sampling_rate = float(sampling_rate)

        leaves_1 = []
        for node in t.traverse("postorder"):
            if len(node.children) == 0:
                leaves_1.append(node)
        leaves_1.sort(key=lambda x: x.name)

        leaves_2 = random.sample(leaves_1, max(round(sampling_rate * len(leaves_1)), 1))
        leaves_2.sort(key=lambda x: x.name)

        leaves_to_remove = list(set(leaves_1) - set(leaves_2))
        leaves_to_remove.sort(key=lambda x: x.name)

        if verbose:
            print(
                "Removing the following leaves: {}".format(", ".join(map(lambda x: x.name, leaves_to_remove))),
                file=sys.stderr
            )

        for node in leaves_to_remove:
            while len(node.up.children) == 1:
                node = node.up
            node.detach()

        print(
            "Subsampling the tree with rate {:.4f}, {} leaves were kept (out of {})".format(
                sampling_rate, len(leaves_2), len(leaves_1)
            ),
            file=sys.stderr
        )

    if verbose:
        print("Writing to '{}'".format(output_tree_fn), file=sys.stderr)

    pro.save_nhx_tree(t, output_tree_fn)
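
# Illustration only (not part of the original module): a hypothetical invocation.
# The file names and the root node name 'Bacteria' are placeholders. Passing a tree
# as "file.nw@NODE" grafts only the subtree rooted at NODE into the merged tree.
#
# merge_trees(
#     input_trees_fn=["bacteria.nw@Bacteria", "viruses.nw"],
#     output_tree_fn="merged.nw",
#     verbose=True,
#     add_prefixes=True,
#     sampling_rate=0.1,
#     autocomplete=False,
# )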