def test_string_complex(self):
    """Parsing then re-serializing each sample must reproduce the input text."""
    # Each entry mixes quoting styles, bracketed annotations and/or branch
    # lengths; the expected output is always the input itself.
    round_trip_samples = (
        """("that's cool"[&!color=#000000]:0.3);""",
        """(A:3);""",
        """('that"s !@#$%^&)(*&^[]cool'[&!color=#000000]:0.3);""",
    )
    for text in round_trip_samples:
        self.assertEqual(newick(sp.p_tree.parse(text)), text)
def test_sampleParaphyletic(self):
    """Exercise alg.sampleParaphyletic on trees with and without factors."""

    def unfactored(text):
        # bare parsed tree, no factor assignment
        return sp.p_tree.parse(text).tree

    def factored(text):
        # tree whose factors come from factorByField(..., field=2)
        return alg.factorByField(sp.p_tree.parse(text).tree, field=2)

    fork = "(X1|H,(X2|H,(X3|H,(X4|H,((Y1|H,(Y2|H,(Y3|H,(Y4|H,Y5|H)))),X6|S)))));"
    fork_factored = factored(fork)

    # proportional request with a kept factor and a minimum tip count
    sampled = alg.sampleParaphyletic(
        fork_factored, proportion=0.3, keep=["S"], minTips=2, seed=42
    )
    self.assertEqual(newick(sampled), "(X1|H,(X4|H,((Y2|H,Y3|H),X6|S)));")

    # fixed number of tips from the unfactored fork
    sampled = alg.sampleParaphyletic(unfactored(fork), number=2, seed=46)
    self.assertEqual(newick(sampled), "(X2|H,Y2|H);")

    nine = "(Y|x,(U|x,(I|x,(((A|y,B|y),C|y),(D|z,(E|z,F|z))))));"

    # a single tip from the unfactored tree
    sampled = alg.sampleParaphyletic(unfactored(nine), number=1, seed=43)
    self.assertEqual(newick(sampled), "(A|y);")

    # the same requests against the factored tree
    sampled = alg.sampleParaphyletic(factored(nine), number=1, seed=43)
    self.assertEqual(newick(sampled), "(I|x,(B|y,F|z));")

    sampled = alg.sampleParaphyletic(factored(nine), number=2, seed=43)
    self.assertEqual(newick(sampled), "(U|x,(I|x,((A|y,C|y),(E|z,F|z))));")
def para(
    factor_by_capture,
    factor_by_field,
    factor_by_table,
    keep,
    keep_regex,
    default,
    min_tips,
    proportion,
    scale,
    number,
    seed,
    newick,
    zero,
    tree,
):
    """
    Paraphyletic sampling.

    The sampling algorithm starts at the root and descends to the tips. At
    each node, monophyletic subtrees are stored in a list and polyphyletic
    ones (whose leaves have multiple factors) are descended into. When a tip
    is reached, or a monophyletic child with a different factor than the
    stored subtrees is encountered, sampling is done over all tips in the
    stored trees and a new list is started with the new monophyletic child.
    """
    import smot.algorithm as alg

    # One of the three sampling-size options must be truthy.
    if not proportion and not scale and not number:
        die("Please add either a --proportion or --scale or --number option")

    parsed = read_tree(tree)

    # Attach factors to the tree before sampling.
    parsed.tree = factorTree(
        node=parsed.tree,
        factor_by_capture=factor_by_capture,
        factor_by_field=factor_by_field,
        factor_by_table=factor_by_table,
        default=default,
    )

    # NOTE: `zero` is accepted by this function but not used in its body.
    sampling_options = dict(
        keep=keep,
        keep_regex=keep_regex,
        proportion=proportion,
        scale=scale,
        number=number,
        minTips=min_tips,
        seed=seed,
    )
    parsed.tree = alg.sampleParaphyletic(parsed.tree, **sampling_options)

    # Output format follows the `newick` flag; nexus otherwise.
    serialize = sf.newick if newick else sf.nexus
    print(serialize(parsed))
def prop(
    factor_by_capture,
    factor_by_field,
    factor_by_table,
    keep,
    keep_regex,
    default,
    min_tips,
    proportion,
    scale,
    number,
    seed,
    newick,
    zero,
    tree,
):
    """
    Proportional sampling.

    Randomly sample p (0 to 1, from --proportion) tips from each monophyletic
    (relative to factors) subtree. Retain at least N tips in each branch
    (--min-tips).
    """
    import smot.algorithm as alg

    # Refuse to run unless a sampling-size option was given.
    if not proportion and not scale and not number:
        die("Please add either a --proportion or --scale or --number option")

    parsed = read_tree(tree)

    # Factor assignment happens first; the sampler receives the factored tree.
    factored_root = factorTree(
        node=parsed.tree,
        factor_by_capture=factor_by_capture,
        factor_by_field=factor_by_field,
        factor_by_table=factor_by_table,
        default=default,
    )
    parsed.tree = factored_root

    # NOTE: `zero` is accepted by this function but not used in its body.
    parsed.tree = alg.sampleProportional(
        parsed.tree,
        keep=keep,
        keep_regex=keep_regex,
        proportion=proportion,
        scale=scale,
        number=number,
        minTips=min_tips,
        seed=seed,
    )

    writer = sf.newick if newick else sf.nexus
    print(writer(parsed))
def rm_color(newick, tree):
    """
    Remove all color annotations from a tree
    """
    import smot.algorithm as alg

    parsed = read_tree(tree)

    # Reset the tree-level color map to an empty dict.
    parsed.colmap = {}

    def strip_color(data):
        # Only touch node data whose `form` is non-empty and has a "!color" key.
        if not d_has_color(data):
            return data
        del data.form["!color"]
        return data

    def d_has_color(data):
        return bool(data.form) and "!color" in data.form

    parsed.tree = alg.treemap(parsed.tree, strip_color)

    emit = sf.newick if newick else sf.nexus
    print(emit(parsed))
def equal(
    factor_by_capture,
    factor_by_field,
    factor_by_table,
    keep,
    default,
    max_tips,
    zero,
    newick,
    tree,
):
    """
    Equal sampling.

    Descend from root to tip. At each node, determine if each subtree
    contains a single factor. If a subtree is not monophyletic, recurse into
    the subtree. If the subtree is monophyletic, then select up to N tips
    (from the --max-tips argument) from the subtree. The selection of tips is
    deterministic but dependent on the ordering of leaves.

    To sample a subtree, an equal number of tips is sampled from each
    descendent subtree, and so on recursively down to the tips. The resulting
    downsampled subtree captures the depth of the tree, but is not
    representative of the tree's breadth. That is, if N=6 and a tree splits
    into two subtrees, one with 3 tips and one with 300 tips, still 3 tips
    will be sampled from each branch.
    """
    import smot.algorithm as alg

    parsed = read_tree(tree)

    # All factor-related options are forwarded unchanged to factorTree.
    factor_options = dict(
        factor_by_capture=factor_by_capture,
        factor_by_field=factor_by_field,
        factor_by_table=factor_by_table,
        default=default,
    )
    parsed.tree = factorTree(node=parsed.tree, **factor_options)

    # NOTE: `zero` is accepted by this function but not used in its body.
    parsed.tree = alg.sampleContext(parsed.tree, keep=keep, maxTips=max_tips)

    render = sf.newick if newick else sf.nexus
    print(render(parsed))
def tipsed(pattern, replacement, newick, tree):
    """
    Search and replace patterns in tip labels.
    """
    import smot.algorithm as alg
    import re

    # Compile up front so an invalid pattern fails before the tree is read.
    compiled = re.compile(pattern)

    def relabel(data):
        # Node data with an empty or missing label is returned untouched.
        current = data.label
        if not current:
            return data
        data.label = compiled.sub(replacement, current)
        return data

    parsed = read_tree(tree)
    parsed.tree = alg.treemap(parsed.tree, relabel)

    render = sf.newick if newick else sf.nexus
    print(render(parsed))
def grep(pattern, tree, invert_match, perl, newick, file):
    """
    Prune a tree to preserve only the tips that match a pattern.
    """
    import smot.algorithm as alg
    import re

    # Build a positive "does this label match?" test first, then apply
    # `invert_match` once so that inversion works in every mode (previously
    # it was ignored when `file` was set).
    if file:
        # `pattern` is a path to a file with one literal substring per line.
        with open(pattern, "r") as f:
            stripped = (line.strip() for line in f)
            # Drop blank lines: "" is a substring of every label, so a single
            # empty line would make every tip match.
            patterns = [p for p in stripped if p]

        def is_hit(label):
            return any(p in label for p in patterns)

    elif perl:
        # `pattern` is a regular expression searched anywhere in the label.
        regex = re.compile(pattern)

        def is_hit(label):
            return regex.search(label) is not None

    else:
        # `pattern` is a literal substring.
        def is_hit(label):
            return pattern in label

    if invert_match:

        def matcher(label):
            return not is_hit(label)

    else:
        matcher = is_hit

    def fun_(node):
        # Non-leaf children are always retained at this stage; leaf children
        # are retained only when their label passes the matcher.
        return [
            kid
            for kid in node.kids
            if (not kid.data.isLeaf or matcher(kid.data.label))
        ]

    tree = read_tree(tree)
    tree.tree = alg.clean(alg.treecut(tree.tree, fun_))

    if newick:
        print(sf.newick(tree))
    else:
        print(sf.nexus(tree))
def test_sampleN(self):
    """alg.sampleN with 2 on "(B,(A,C,E),D);" serializes to "(B,A);"."""
    source_tree = sp.p_tree.parse("(B,(A,C,E),D);").tree
    picked = alg.sampleN(source_tree, 2)
    self.assertEqual(newick(picked), "(B,A);")
def test_sampleProportional(self):
    """Exercise alg.sampleProportional by proportion and by number."""

    def unfactored(text):
        # bare parsed tree, no factor assignment
        return sp.p_tree.parse(text).tree

    def factored(text):
        # tree whose factors come from factorByField(..., field=2)
        return alg.factorByField(sp.p_tree.parse(text).tree, field=2)

    six = "(((A,B),C),(D,(E,F)));"
    # sampling is across root children
    sampled = alg.sampleProportional(
        unfactored(six), proportion=0.1, minTips=2, seed=43
    )
    self.assertEqual(newick(sampled), "(A,C);")

    seven = "(O|x,(((A|y,B|y),C|y),(D|z,(E|z,F|z))));"
    sampled = alg.sampleProportional(
        factored(seven), proportion=0.1, minTips=2, seed=46
    )
    self.assertEqual(newick(sampled), "(O|x,((A|y,B|y),(D|z,F|z)));")

    # --- selection by number works for unfactored trees
    unfactored_by_number = (
        # sometimes a basal strain is selected
        (1, 46, "(O|x);"),
        # sometimes it isn't (random)
        (1, 44, "(C|y);"),
        # sometimes both root branches will be sampled
        (3, 46, "(O|x,(C|y,F|z));"),
        # sometimes they won't
        (3, 40, "(C|y,(D|z,E|z));"),
    )
    for count, rng_seed, expected in unfactored_by_number:
        sampled = alg.sampleProportional(
            unfactored(seven), number=count, seed=rng_seed
        )
        self.assertEqual(newick(sampled), expected)

    # --- selection by number works for factored trees
    factored_by_number = (
        (1, 43, "(O|x,(A|y,E|z));"),
        (2, 43, "(O|x,((A|y,B|y),(D|z,F|z)));"),
    )
    for count, rng_seed, expected in factored_by_number:
        sampled = alg.sampleProportional(
            factored(seven), number=count, seed=rng_seed
        )
        self.assertEqual(newick(sampled), expected)

    # --- high numbers cleanly select everything
    # No seed is passed here; the check is performed twice.
    for _ in range(2):
        sampled = alg.sampleProportional(factored(seven), number=100)
        self.assertEqual(newick(sampled), seven)
def test_stringify(self):
    """A tree with `name|factor` style labels serializes back to its input."""
    original_text = "(B|a,(A|b,C|b,E|b),D|c);"
    parsed = sp.p_tree.parse(original_text)
    self.assertEqual(newick(parsed), original_text)
def test_nexus(self):
    """Check the NEXUS parsers piece by piece, then on whole documents.

    Covers a single tree line, a taxa block, taxa and trees sections, a
    complete small file, and a larger file that also has a figtree section.
    """
    # A single "tree ... = [&R] ..." line parses to the same tree as the
    # bare Newick string.
    self.assertEqual(
        sp.p_nexus_tree_line.parse(
            "\ttree tree_1 = [&R] (B,(A,C,E),D);\n"),
        sp.p_tree.parse("(B,(A,C,E),D);").tree,
    )
    # Taxa block: only 'B' carries a color annotation, and the expected
    # result is a dict holding just that entry.
    taxa_block = "\n".join([
        "\tdimensions ntax=3",
        "\ttaxlabels",
        "\t'A'",
        "\t'B'[&!color=#999999]",
        "\t'C'",
        "\t'D'",
        "\t'E'",
        ";",
    ])
    self.assertEqual(sp.p_taxa_block.parse(taxa_block), dict(B="#999999"))
    # The same block wrapped in "begin taxa; ... end;" is expected to parse
    # to a ("taxa", colors) pair.
    # NOTE: the last element is two adjacent literals ("end;" "") with no
    # comma between them, so they concatenate to the single string "end;".
    taxa_section = "\n".join([
        "begin taxa;",
        "\tdimensions ntax=3",
        "\ttaxlabels",
        "\t'A'",
        "\t'B'[&!color=#999999]",
        "\t'C'",
        "\t'D'",
        "\t'E'",
        ";",
        "end;" "",
    ])
    self.assertEqual(sp.p_nexus_section.parse(taxa_section), ("taxa", dict(B="#999999")))
    # Trees section: element [1] of the parse result is serialized and
    # compared with the Newick text.
    tree_section = "\n".join([
        "begin trees;",
        "\ttree tree_1 = [&R] (B,(A,C,E),D);",
        "end;",
    ])
    self.assertEqual(newick(sp.p_nexus_section.parse(tree_section)[1]), "(B,(A,C,E),D);")
    # Complete small NEXUS document with a taxa and a trees section.
    nexus_file = "\n".join([
        "#NEXUS",
        "begin taxa;",
        "\tdimensions ntax=3",
        "\ttaxlabels",
        "\t'A'",
        "\t'B'",
        "\t'C'",
        "\t'D'",
        "\t'E'",
        ";",
        "end;",
        "",
        "begin trees;",
        "\ttree tree_1 = [&R] (B,(A,C,E),D);",
        "end;",
        "",
    ])
    # Both the NEXUS-specific parser and the general p_tree parser are
    # expected to give the same tree as the bare Newick string.
    self.assertEqual(
        sp.p_nexus.parse(nexus_file).tree,
        sp.p_tree.parse("(B,(A,C,E),D);").tree)
    self.assertEqual(
        sp.p_tree.parse(nexus_file).tree,
        sp.p_tree.parse("(B,(A,C,E),D);").tree)
    # Larger document: quoted labels, branch lengths and a trailing
    # "begin figtree; ... end;" section of settings.
    # NOTE(review): two literals below ("set appearance.foregroundColour..."
    # and "set scaleAxis.isShown...") contain a line break after "set" as they
    # appear in this source; confirm that is intended.
    big_nexus_file = "\n".join([
        """#NEXUS""",
        """begin taxa;""",
        """ dimensions ntax=6;""",
        """ taxlabels""",
        """ 'X1|H'[&!color=#ff0000]""",
        """ 'X2|H'""",
        """ 'X3|H'""",
        """ 'X4|H'""",
        """ 'X5|H'""",
        """ 'X6|S'""",
        """;""",
        """end;""",
        """""",
        """begin trees;""",
        """ tree tree_1 = [&R] ('X1|H':0.3,('X2|H':0.3,('X3|H':0.3,('X4|H':0.3,('X5|H':0.3,'X6|S':0.3):0.3):0.3):0.3):0.3);""",
        """end;""",
        """""",
        """begin figtree;""",
        """ set appearance.backgroundColorAttribute="Default";""",
        """ set appearance.backgroundColour=#ffffff;""",
        """ set appearance.branchColorAttribute="User selection";""",
        """ set appearance.branchColorGradient=false;""",
        """ set appearance.branchLineWidth=1.0;""",
        """ set appearance.branchMinLineWidth=0.0;""",
        """ set appearance.branchWidthAttribute="Fixed";""",
        """ set 
appearance.foregroundColour=#000000;""",
        """ set appearance.hilightingGradient=false;""",
        """ set appearance.selectionColour=#2d3680;""",
        """ set branchLabels.colorAttribute="User selection";""",
        """ set branchLabels.displayAttribute="Branch times";""",
        """ set branchLabels.fontName="Al Bayan";""",
        """ set branchLabels.fontSize=8;""",
        """ set branchLabels.fontStyle=0;""",
        """ set branchLabels.isShown=false;""",
        """ set branchLabels.significantDigits=4;""",
        """ set layout.expansion=0;""",
        """ set layout.layoutType="RECTILINEAR";""",
        """ set layout.zoom=0;""",
        """ set legend.attribute=null;""",
        """ set legend.fontSize=10.0;""",
        """ set legend.isShown=false;""",
        """ set legend.significantDigits=4;""",
        """ set nodeBars.barWidth=4.0;""",
        """ set nodeBars.displayAttribute=null;""",
        """ set nodeBars.isShown=false;""",
        """ set nodeLabels.colorAttribute="User selection";""",
        """ set nodeLabels.displayAttribute="Node ages";""",
        """ set nodeLabels.fontName="Al Bayan";""",
        """ set nodeLabels.fontSize=8;""",
        """ set nodeLabels.fontStyle=0;""",
        """ set nodeLabels.isShown=false;""",
        """ set nodeLabels.significantDigits=4;""",
        """ set nodeShape.colourAttribute=null;""",
        """ set nodeShape.isShown=false;""",
        """ set nodeShape.minSize=10.0;""",
        """ set nodeShape.scaleType=Width;""",
        """ set nodeShape.shapeType=Circle;""",
        """ set nodeShape.size=4.0;""",
        """ set nodeShape.sizeAttribute=null;""",
        """ set polarLayout.alignTipLabels=false;""",
        """ set polarLayout.angularRange=0;""",
        """ set polarLayout.rootAngle=0;""",
        """ set polarLayout.rootLength=100;""",
        """ set polarLayout.showRoot=true;""",
        """ set radialLayout.spread=0.0;""",
        """ set rectilinearLayout.alignTipLabels=false;""",
        """ set rectilinearLayout.curvature=0;""",
        """ set rectilinearLayout.rootLength=100;""",
        """ set scale.offsetAge=0.0;""",
        """ set scale.rootAge=1.0;""",
        """ set scale.scaleFactor=1.0;""",
        """ set scale.scaleRoot=false;""",
        """ set scaleAxis.automaticScale=true;""",
        """ set scaleAxis.fontSize=8.0;""",
        """ set 
scaleAxis.isShown=false;""",
        """ set scaleAxis.lineWidth=1.0;""",
        """ set scaleAxis.majorTicks=1.0;""",
        """ set scaleAxis.origin=0.0;""",
        """ set scaleAxis.reverseAxis=false;""",
        """ set scaleAxis.showGrid=true;""",
        """ set scaleBar.automaticScale=true;""",
        """ set scaleBar.fontSize=10.0;""",
        """ set scaleBar.isShown=true;""",
        """ set scaleBar.lineWidth=1.0;""",
        """ set scaleBar.scaleRange=0.0;""",
        """ set tipLabels.colorAttribute="User selection";""",
        """ set tipLabels.displayAttribute="Names";""",
        """ set tipLabels.fontName="Al Bayan";""",
        """ set tipLabels.fontSize=8;""",
        """ set tipLabels.fontStyle=0;""",
        """ set tipLabels.isShown=true;""",
        """ set tipLabels.significantDigits=4;""",
        """ set trees.order=false;""",
        """ set trees.orderType="increasing";""",
        """ set trees.rooting=false;""",
        """ set trees.rootingType="User Selection";""",
        """ set trees.transform=false;""",
        """ set trees.transformType="cladogram";""",
        """end;""",
    ])
    # The whole document must parse to the same tree as its Newick line alone.
    self.assertEqual(
        sp.p_tree.parse(big_nexus_file).tree,
        sp.p_tree.parse(
            "('X1|H':0.3,('X2|H':0.3,('X3|H':0.3,('X4|H':0.3,('X5|H':0.3,'X6|S':0.3):0.3):0.3):0.3):0.3);"
        ).tree,
    )
def filter_cmd(
    # conditions
    all_match,
    some_match,
    none_match,
    larger_than,
    smaller_than,
    # actions
    remove,
    color,
    sample,
    replace,
    # factor methods
    factor_by_capture,
    factor_by_field,
    factor_by_table,
    default,
    # phylogenetic options
    patristic,
    seed,
    # boilerplate
    newick,
    tree,
):
    """
    An advanced tool for performing actions (remove, color, sample, or
    replace) on monophyletic groups that meet specified conditions
    (all-match, some-match, etc.).
    """
    import smot.algorithm as alg
    import re

    # Exactly one action is used, chosen in the order remove > color >
    # sample > replace. Without any of them there is nothing to do; fail with
    # a readable message instead of an unbound-variable error further down.
    if not (remove or color or sample or replace):
        raise SystemExit(
            "Please specify an action: remove, color, sample, or replace"
        )

    tree = read_tree(tree)
    tree.tree = factorTree(
        node=tree.tree,
        factor_by_capture=factor_by_capture,
        factor_by_field=factor_by_field,
        factor_by_table=factor_by_table,
        default=default,
        patristic=patristic,
    )

    def condition(node):
        # A group qualifies only if every *supplied* condition holds; a falsy
        # option (None, 0, empty) means that condition is not checked.
        tips = alg.tips(node)
        n_tips = len(tips)

        if larger_than and not n_tips > larger_than:
            return False
        if smaller_than and not n_tips < smaller_than:
            return False
        # all_match: every pattern must be found in every tip
        if all_match and not all(
            all(re.search(pat, tip) for tip in tips) for pat in all_match
        ):
            return False
        # some_match: every pattern must be found in at least one tip
        if some_match and not all(
            any(re.search(pat, tip) for tip in tips) for pat in some_match
        ):
            return False
        # none_match: no pattern may be found in any tip
        if none_match and not all(
            all(not re.search(pat, tip) for tip in tips) for pat in none_match
        ):
            return False
        return True

    if remove:
        # returning None for a group; alg.clean below tidies the tree
        action = lambda x: None
    elif color:
        action = lambda x: alg.colorTree(x, color)
    elif sample:
        # `sample` is used as the proportion; minTips is fixed at 3 here
        action = lambda x: alg.sampleProportional(
            x, proportion=sample, scale=None, minTips=3, keep_regex="", seed=seed
        )
    else:
        # `replace` is indexed as (pattern, replacement)
        def _fun(d):
            # Skip node data without a label (same guard as `tipsed`);
            # re.sub would raise on a None label.
            if d.label:
                d.label = re.sub(replace[0], replace[1], d.label)
            return d

        action = lambda x: alg.treemap(x, _fun)

    tree.tree = alg.filterMono(tree.tree, condition=condition, action=action)
    tree.tree = alg.clean(tree.tree)

    if newick:
        print(sf.newick(tree))
    else:
        print(sf.nexus(tree))
def factor(
    method,
    factor_by_capture,
    factor_by_field,
    factor_by_table,
    default,
    impute,
    patristic,
    newick,
    tree,
):
    """
    Impute, annotate with, and/or tabulate factors.

    The --impute option will fill in missing factors in monophyletic
    branches. This is useful, for example, for inferring clades given a few
    references in a tree.

    There are three modes: 'table' prints a TAB-delimited table of tip names
    and factors, 'prepend' adds the factor to the beginning of the tiplabel
    (delimited with '|'), 'append' adds it to the end.
    """
    import smot.algorithm as alg

    parsed = read_tree(tree)
    parsed.tree = factorTree(
        node=parsed.tree,
        factor_by_capture=factor_by_capture,
        factor_by_field=factor_by_field,
        factor_by_table=factor_by_table,
        default=default,
        impute=impute,
        patristic=patristic,
    )

    # Mode comparison is case-insensitive.
    mode = method.lower()

    # Table mode: one "label<TAB>factor" row per leaf, nothing else printed.
    if mode == "table":

        def collect_row(rows, data):
            if data.isLeaf:
                # leaves without a factor are shown with the default value
                shown = default if data.factor is None else data.factor
                rows.append(f"{data.label}\t{shown}")
            return rows

        for row in alg.treefold(parsed.tree, collect_row, []):
            print(row)
        return

    # Any other mode rewrites leaf labels and prints the resulting tree;
    # "prepend" puts the factor first, everything else appends it.
    def annotate(data):
        if data.isLeaf:
            if data.factor is None:
                data.factor = default
            if mode == "prepend":
                data.label = f"{data.factor}|{data.label}"
            else:
                data.label = f"{data.label}|{data.factor}"
        return data

    parsed.tree = alg.treemap(parsed.tree, annotate)

    render = sf.newick if newick else sf.nexus
    print(render(parsed))