def para(
    factor_by_capture,
    factor_by_field,
    factor_by_table,
    keep,
    keep_regex,
    default,
    min_tips,
    proportion,
    scale,
    number,
    seed,
    newick,
    zero,
    tree,
):
    """
    Paraphyletic sampling.

    The sampling algorithm starts at the root and descends to the tips. At
    each node, monophyletic subtrees are stored in a list and polyphyletic
    ones (whose leaves have multiple factors) are descended into. When a tip
    is reached, or a monophyletic child with a different factor than the
    stored subtrees is encountered, all tips in the stored trees are sampled
    and a new list is started with the new monophyletic child.

    At least one of --proportion, --scale or --number must be given. The
    sampled tree is printed as Newick when `newick` is truthy, otherwise as
    NEXUS. The `zero` argument is accepted but not used in this function.
    """
    import smot.algorithm as alg

    # A sampling size must be specified one way or another.
    if not proportion and not scale and not number:
        die("Please add either a --proportion or --scale or --number option")

    phylo = read_tree(tree)

    # Attach factor information to the tree before sampling.
    phylo.tree = factorTree(
        node=phylo.tree,
        factor_by_capture=factor_by_capture,
        factor_by_field=factor_by_field,
        factor_by_table=factor_by_table,
        default=default,
    )

    phylo.tree = alg.sampleParaphyletic(
        phylo.tree,
        keep=keep,
        keep_regex=keep_regex,
        proportion=proportion,
        scale=scale,
        number=number,
        minTips=min_tips,
        seed=seed,
    )

    # Pick the serializer once, then emit the tree on stdout.
    render = sf.newick if newick else sf.nexus
    print(render(phylo))
def prop(
    factor_by_capture,
    factor_by_field,
    factor_by_table,
    keep,
    keep_regex,
    default,
    min_tips,
    proportion,
    scale,
    number,
    seed,
    newick,
    zero,
    tree,
):
    """
    Proportional sampling.

    Randomly sample p (0 to 1, from --proportion) tips from each
    monophyletic (relative to factors) subtree. Retain at least N tips in
    each branch (--min-tips).

    At least one of --proportion, --scale or --number must be given. The
    sampled tree is printed as Newick when `newick` is truthy, otherwise as
    NEXUS. The `zero` argument is accepted but not used in this function.
    """
    import smot.algorithm as alg

    # A sampling size must be specified one way or another.
    if not proportion and not scale and not number:
        die("Please add either a --proportion or --scale or --number option")

    phylo = read_tree(tree)

    # Attach factor information to the tree before sampling.
    phylo.tree = factorTree(
        node=phylo.tree,
        factor_by_capture=factor_by_capture,
        factor_by_field=factor_by_field,
        factor_by_table=factor_by_table,
        default=default,
    )

    phylo.tree = alg.sampleProportional(
        phylo.tree,
        keep=keep,
        keep_regex=keep_regex,
        proportion=proportion,
        scale=scale,
        number=number,
        minTips=min_tips,
        seed=seed,
    )

    # Pick the serializer once, then emit the tree on stdout.
    render = sf.newick if newick else sf.nexus
    print(render(phylo))
def colorBranches(
    is_para, factor_by_capture, factor_by_field, factor_by_table, colormap, tree
):
    """
    Color the branches of a tree by factor and print the result as NEXUS.

    `is_para` selects alg.colorPara over alg.colorMono. `colormap`, when
    given, is the path to a TAB-delimited, two-column file mapping a factor
    to a hexadecimal color; colors are upper-cased and a missing leading "#"
    is added before the color is validated. Without a colormap, the scheme
    is produced by chooseColorScheme from the sorted factors in the tree.

    Exits through die() when the colormap file is not a two-column
    TAB-delimited table or when a color is not 7 characters long once the
    leading "#" is in place.
    """
    import smot.algorithm as alg

    tree = read_tree(tree)
    tree.tree = factorTree(
        node=tree.tree,
        factor_by_capture=factor_by_capture,
        factor_by_field=factor_by_field,
        factor_by_table=factor_by_table,
    )
    tree.tree = alg.setFactorCounts(tree.tree)
    factors = sorted(tree.tree.data.factorCount.keys())

    _colormap = dict()
    if colormap:
        with open(colormap, "r") as fh:
            try:
                # Unpacking raises ValueError unless every row has exactly
                # two TAB-separated columns.
                _colormap = {
                    clade.strip(): color.strip().upper()
                    for (clade, color) in [
                        line.strip().split("\t") for line in fh.readlines()
                    ]
                }
                # Only values of existing keys are replaced, so the dict is
                # safe to update while iterating over it.
                for clade, color in _colormap.items():
                    if not color.startswith("#"):
                        color = "#" + color
                        _colormap[clade] = color
                    # Validate the normalized color, so that "AA10FF" is
                    # accepted once it has become "#AA10FF".
                    if len(color) != 7:
                        die('Expected colors in hexadecimal (e.g., "#AA10FF")')
            except ValueError:
                die("Invalid color map: expected TAB-delimited, two-column file")
    else:
        _colormap = chooseColorScheme(factors)

    if is_para:
        tree.tree = alg.colorPara(tree.tree, colormap=_colormap)
    else:
        tree.tree = alg.colorMono(tree.tree, colormap=_colormap)

    print(sf.nexus(tree))
def rm_color(newick, tree):
    """
    Remove all color annotations from a tree

    The tree's color map is reset to an empty dict and the "!color" entry is
    deleted from every node's `form` where present. The result is printed as
    Newick when `newick` is truthy, otherwise as NEXUS.
    """
    import smot.algorithm as alg

    phylo = read_tree(tree)
    phylo.colmap = dict()

    def strip_color(data):
        # Nodes without any format annotations are passed through untouched.
        if not data.form:
            return data
        if "!color" in data.form:
            del data.form["!color"]
        return data

    phylo.tree = alg.treemap(phylo.tree, strip_color)

    render = sf.newick if newick else sf.nexus
    print(render(phylo))
def equal(
    factor_by_capture,
    factor_by_field,
    factor_by_table,
    keep,
    default,
    max_tips,
    zero,
    newick,
    tree,
):
    """
    Equal sampling.

    Descend from root to tip. At each node, determine if each subtree
    contains a single factor. If a subtree is not monophyletic, recurse into
    the subtree. If the subtree is monophyletic, then select up to N tips
    (from the --max-tips argument) from the subtree. The selection of tips
    is deterministic but dependent on the ordering of leaves.

    To sample a subtree, an equal number of tips is sampled from each
    descendent subtree, and so on recursively down to the tips. The
    resulting downsampled subtree captures the depth of the tree, but is not
    representative of the tree's breadth. That is, if N=6 and a tree splits
    into two subtrees, one with 3 tips and one with 300 tips, still 3 tips
    will be sampled from each branch.

    The result is printed as Newick when `newick` is truthy, otherwise as
    NEXUS. The `zero` argument is accepted but not used in this function.
    """
    import smot.algorithm as alg

    phylo = read_tree(tree)

    # Attach factor information to the tree before sampling.
    factored = factorTree(
        node=phylo.tree,
        factor_by_capture=factor_by_capture,
        factor_by_field=factor_by_field,
        factor_by_table=factor_by_table,
        default=default,
    )
    phylo.tree = factored
    phylo.tree = alg.sampleContext(phylo.tree, keep=keep, maxTips=max_tips)

    render = sf.newick if newick else sf.nexus
    print(render(phylo))
def tipsed(pattern, replacement, newick, tree):
    """
    Search and replace patterns in tip labels.

    `pattern` is compiled as a regular expression and every match in a
    non-empty node label is substituted with `replacement`. The result is
    printed as Newick when `newick` is truthy, otherwise as NEXUS.
    """
    import smot.algorithm as alg
    import re

    regex = re.compile(pattern)

    def relabel(data):
        # Nodes with a missing or empty label are returned unchanged.
        if not data.label:
            return data
        data.label = regex.sub(replacement, data.label)
        return data

    phylo = read_tree(tree)
    phylo.tree = alg.treemap(phylo.tree, relabel)

    render = sf.newick if newick else sf.nexus
    print(render(phylo))
def grep(pattern, tree, invert_match, perl, newick, file):
    """
    Prune a tree to preserve only the tips that match a pattern.

    Matching modes, in order of precedence:
      * file: `pattern` is the path to a file holding one literal pattern per
        line; a tip matches when any pattern is a substring of its label.
        Blank lines are ignored (an empty pattern would match every tip).
      * perl: `pattern` is a regular expression searched for in the label.
      * otherwise: `pattern` is a literal substring of the label.

    `invert_match` keeps the tips that do NOT match; it applies to all three
    modes. The pruned tree is printed as Newick when `newick` is truthy,
    otherwise as NEXUS.
    """
    import smot.algorithm as alg
    import re

    if file:
        with open(pattern, "r") as fh:
            patterns = [line.strip() for line in fh]
        # Drop empty patterns: "" is a substring of every label.
        patterns = [p for p in patterns if p]

        def found(label):
            return any(p in label for p in patterns)

    elif perl:
        regex = re.compile(pattern)

        def found(label):
            return regex.search(label) is not None

    else:

        def found(label):
            return pattern in label

    # Apply inversion uniformly so that it also takes effect in file mode.
    if invert_match:

        def matcher(label):
            return not found(label)

    else:
        matcher = found

    def fun_(node):
        # Internal nodes are always kept here; leaves are kept only if they
        # match.
        return [
            kid
            for kid in node.kids
            if (not kid.data.isLeaf or matcher(kid.data.label))
        ]

    tree = read_tree(tree)
    tree.tree = alg.clean(alg.treecut(tree.tree, fun_))

    if newick:
        print(sf.newick(tree))
    else:
        print(sf.nexus(tree))
def leaf(pattern, perl, tree):
    """
    Color the tips on a tree.

    smot color -p "swine" "#FFA500" -p "2020-" "#00FF00" my.tre > color.tre

    `pattern` is an iterable of (pattern, color) pairs. Pairs are applied in
    order, so a tip matched by several patterns ends up with the color of
    the last one. With `perl` the pattern is a regular expression, otherwise
    a literal substring. The tree is printed as NEXUS.
    """
    import smot.algorithm as alg
    import re

    phylo = read_tree(tree)
    tips = alg.tips(phylo.tree)

    for pat_str, col in pattern:
        # Collect the tips hit by this pattern, then record their color.
        if perl:
            regex = re.compile(pat_str)
            hits = [tip for tip in tips if re.search(regex, tip)]
        else:
            hits = [tip for tip in tips if pat_str in tip]
        for tip in hits:
            phylo.colmap[tip] = col

    print(sf.nexus(phylo))
def filter_cmd(
    # conditions
    all_match,
    some_match,
    none_match,
    larger_than,
    smaller_than,
    # actions
    remove,
    color,
    sample,
    replace,
    # factor methods
    factor_by_capture,
    factor_by_field,
    factor_by_table,
    default,
    # phylogenetic options
    patristic,
    seed,
    # boilerplate
    newick,
    tree,
):
    """
    An advanced tool for performing actions (remove, color, sample, or
    replace) on monophyletic groups that meet specified conditions
    (all-match, some-match, etc.).

    Conditions are combined with AND; a condition that is unset (falsy) is
    ignored. The first action that is set, in the order remove, color,
    sample, replace, is the one applied. If no action is set the command
    exits through die() rather than failing on an unbound name.

    The result is printed as Newick when `newick` is truthy, otherwise as
    NEXUS.
    """
    import smot.algorithm as alg
    import re

    # Fail early with a readable message when there is nothing to do.
    if not (remove or color or sample or replace):
        die("Please specify an action (remove, color, sample, or replace)")

    tree = read_tree(tree)
    tree.tree = factorTree(
        node=tree.tree,
        factor_by_capture=factor_by_capture,
        factor_by_field=factor_by_field,
        factor_by_table=factor_by_table,
        default=default,
        patristic=patristic,
    )

    def condition(node):
        tips = alg.tips(node)
        n_tips = len(tips)
        return (
            (not larger_than or n_tips > larger_than)
            and (not smaller_than or n_tips < smaller_than)
            # every pattern must match every tip
            and (
                not all_match
                or all(
                    all(re.search(pat, tip) for tip in tips) for pat in all_match
                )
            )
            # every pattern must match at least one tip
            and (
                not some_match
                or all(
                    any(re.search(pat, tip) for tip in tips) for pat in some_match
                )
            )
            # no pattern may match any tip
            and (
                not none_match
                or all(
                    all(not re.search(pat, tip) for tip in tips)
                    for pat in none_match
                )
            )
        )

    if remove:
        action = lambda x: None
    elif color:
        action = lambda x: alg.colorTree(x, color)
    elif sample:
        action = lambda x: alg.sampleProportional(
            x, proportion=sample, scale=None, minTips=3, keep_regex="", seed=seed
        )
    elif replace:

        def _fun(d):
            # Skip nodes with a missing or empty label, as tipsed does;
            # re.sub raises TypeError on a None label.
            if d.label:
                d.label = re.sub(replace[0], replace[1], d.label)
            return d

        action = lambda x: alg.treemap(x, _fun)

    tree.tree = alg.filterMono(tree.tree, condition=condition, action=action)
    tree.tree = alg.clean(tree.tree)

    if newick:
        print(sf.newick(tree))
    else:
        print(sf.nexus(tree))
def factor(
    method,
    factor_by_capture,
    factor_by_field,
    factor_by_table,
    default,
    impute,
    patristic,
    newick,
    tree,
):
    """
    Impute, annotate with, and/or tabulate factors.

    The --impute option will fill in missing factors in monophyletic
    branches. This is useful, for example, for inferring clades given a few
    references in a tree.

    There are three modes: 'table' prints a TAB-delimited table of tip names
    and factors, 'prepend' adds the factor to the beginning of the tiplabel
    (delimited with '|'), 'append' adds it to the end.

    The mode is compared case-insensitively; any value other than 'table' or
    'prepend' is handled as 'append'. Leaves without a factor receive
    `default`.
    """
    import smot.algorithm as alg

    phylo = read_tree(tree)
    phylo.tree = factorTree(
        node=phylo.tree,
        factor_by_capture=factor_by_capture,
        factor_by_field=factor_by_field,
        factor_by_table=factor_by_table,
        default=default,
        impute=impute,
        patristic=patristic,
    )

    mode = method.lower()

    # create TAB-delimited, table with columns for the tip labels and the
    # (possibly imputed) factor
    if mode == "table":

        def collect_row(rows, data):
            if data.isLeaf:
                shown = default if data.factor is None else data.factor
                rows.append(f"{data.label}\t{shown}")
            return rows

        for row in alg.treefold(phylo.tree, collect_row, []):
            print(row)
        return

    # prepend or append the factor to the tip labels and print the resulting
    # tree
    def annotate(data):
        if not data.isLeaf:
            return data
        if data.factor is None:
            data.factor = default
        if mode == "prepend":
            data.label = f"{data.factor}|{data.label}"
        else:
            data.label = f"{data.label}|{data.factor}"
        return data

    phylo.tree = alg.treemap(phylo.tree, annotate)

    render = sf.newick if newick else sf.nexus
    print(render(phylo))