class Test(unittest.TestCase):
    """
    Parse a diagnostic-characters fixture against a small fixture tree and
    check the records produced for three tree nodes.

    NOTE(review): C{diag_str} and C{tree_str} are fixtures defined outside
    this block; the expected counts below depend on their content.
    """

    def setUp(self):
        # File-like wrapper around the diagnostic characters fixture text.
        self.io = StringIO(diag_str)
        self.io.seek(0)
        self.tree = Tree(tree_str)
        # Tag the first three nodes with distinctive ids; testParser looks
        # the parsed records up by these values.
        self.tree.node(0).get_data().id = 10
        self.tree.node(1).get_data().id = 20
        self.tree.node(2).get_data().id = 30

    def tearDown(self):
        pass

    def testParser(self):
        """Group parsed records by tree node key and check counts/values."""
        parser = DiagCharsParser(self.tree)
        diagchars = list(parser.parse(self.io))

        # Group the records by the node they reference.
        index = defaultdict(list)
        for d in diagchars:
            index[d.tree_node_key].append(d)
        # Sort each group once, by column. list.sort is stable, so ties keep
        # their parse order exactly as the former sort-on-every-append did.
        for group in index.values():
            group.sort(key=lambda d: d.column)

        self.assertEqual(len(index[10]), 5)
        self.assertEqual(len(index[20]), 8)
        self.assertEqual(len(index[30]), 5)
        self.assertEqual(index[10][0].column, 0)
        self.assertEqual(index[10][4].aa, "IK")
        self.assertEqual(index[20][2].column, 8)
        self.assertEqual(index[20][2].aa, "D")
def _subtree(self, root):
    """
    Copy everything below a node into a fresh tree, then unlink that node
    from the original tree.

    @param root: id (in C{self.tree}) of the node that becomes the new root
    @return: the new tree rooted on that node's data
    """
    source = self.tree
    sub = Tree(
        weight=source.weight,
        rooted=source.rooted,
        name=source.name,
        data=source.dataclass,
        max_support=source.max_support,
    )
    # The new tree's root carries the data object of the split node.
    sub.node(sub.root).data = source.node(root).data

    def _copy_children(src_id, dst_id):
        """
        Depth-first copy: for every successor of C{src_id} in the old tree,
        attach a node sharing its data under C{dst_id} in the new tree,
        then descend using the id the new tree handed back.
        """
        for src_child in source.node(src_id).succ:
            clone = Node(data=source.node(src_child).data)
            dst_child = sub.add(clone, dst_id)
            _copy_children(src_child, dst_child)

    _copy_children(root, sub.root)
    self.annotater.annotate(sub)
    # Detach the copied part from the original tree.
    unlink(self.tree, root)
    return sub
def _subtree(self, root):
    """
    Build a new tree from the graph returned by C{self._subgraph(root)} and
    collapse every node of that graph in the original tree.

    @param root: id (in C{self.tree}) of the node that becomes the new root
    @return: the new tree rooted on that node's data
    """
    # Find paths to targets to build a new tree
    graph = self._subgraph(root)

    source = self.tree
    sub = Tree(
        weight=source.weight,
        rooted=source.rooted,
        name=source.name,
        data=source.dataclass,
        max_support=source.max_support,
    )
    sub.node(sub.root).data = source.node(root).data

    def _copy_children(src_id, dst_id):
        """
        Follow the graph's successors of C{src_id}; each one is attached
        under C{dst_id} in the new tree (sharing the old node's data) and
        then walked in turn with the id the new tree handed back.
        """
        for src_child in graph.successors_iter(src_id):
            clone = Node(data=source.node(src_child).data)
            dst_child = sub.add(clone, dst_id)
            _copy_children(src_child, dst_child)

    _copy_children(root, sub.root)

    # Delete nodes from old tree
    for node in graph:
        self.collapse(self.tree, node)
    return sub
class OIDImporter(object):
    """
    Import a set of OID files into the database

    The constructor only records file locations; C{merge()} performs the
    import inside a single database session.
    """

    def __init__(
        self,
        familyName,
        alignFile,
        alignColcullLog,
        alignSeqcullLog,
        treeFile,
        treeDiagCharsFile,
        codemlFile=None,
        alignFormat="fasta",
        oid_key=None,
    ):
        """
        @param familyName: name used to look up / create the Family row
        @param alignFile: alignment file, read with Bio.AlignIO
        @param alignColcullLog: column culling log (four fields per line)
        @param alignSeqcullLog: sequence culling log (two fields per line)
        @param treeFile: tree file handed to Bio.Nexus.Trees.Tree
        @param treeDiagCharsFile: diagnostic characters file
        @param codemlFile: optional codeml output; skipped when falsy
        @param alignFormat: AlignIO format name for reading and writing
        @param oid_key: stored on the instance; not used within this class
        """
        self.familyName = familyName
        self.treeFile = treeFile
        self.treeDiagCharsFile = treeDiagCharsFile
        self.alignFile = alignFile
        self.alignColcullLog = alignColcullLog
        self.alignSeqcullLog = alignSeqcullLog
        self.codemlFile = codemlFile
        self.alignFormat = alignFormat
        self.oid_key = oid_key

    def merge(self):
        """
        Import the family, alignment and tree if no family with this name
        exists yet, then import codeml results if the tree has none.

        The session is committed on success and closed in every case, so a
        failed import no longer leaves the session open.
        """
        from hpf.hddb.db import Session, Family

        self.session = Session()
        try:
            self.family = self.session.query(Family).filter(Family.name == self.familyName).first()
            if not self.family:
                runtime().debug("Creating family", self.familyName)
                self._family()
                self._alignment()
                self._tree()
            else:
                # NOTE(review): this branch reads family.alignment while the
                # codeml check below reads family.alignments[0]; confirm
                # both are defined on Family and refer to the same row.
                self.alignment = self.family.alignment
                self.tree = self.alignment.tree
                runtime().debug("Found family", self.family.id)

            if not self.family.alignments[0].tree.codeml:
                runtime().debug("Importing codeml")
                self._codeml()
            else:
                runtime().debug("Already found codeml", self.family.alignments[0].tree.codeml.id)

            # Commit the session, close, and finish
            self.session.commit()
        finally:
            self.session.close()

    def _index(self, name):
        """
        Return the digits after the last '#' in C{name}, dropping one
        leading 'N' if present (e.g. "rice#1182215" gives "1182215").

        @raise AssertionError: if what remains is not all digits
        """
        n = name.split("#")[-1]
        if n.startswith("N"):
            n = n[1:]
        assert n.isdigit(), name
        return n

    def _tree(self):
        """
        Load the tree file, store it as a Tree row with its nodes, then
        parse and store the diagnostic characters.
        """
        from Bio.Nexus.Trees import Tree as NewickTree
        from hpf.hddb.db import Tree, TreeNodeFactory, TreeFactory
        from hpf.amnh.oid import DiagCharsParser

        session = self.session

        # Read through a context manager so the handle is always closed.
        with open(self.treeFile) as handle:
            tree_str = handle.read()
        self.nexus = NewickTree(tree_str)

        # Rename all the taxa.
        for node_id in self.nexus.get_terminals():
            node = self.nexus.node(node_id)
            node.data.taxon = self._index(node.data.taxon)

        # Create the DB object
        self.tree = Tree(
            alignment_key=self.alignment.id,
            text=self.nexus.to_string(plain=False, plain_newick=True),
            filename=self.treeFile,
        )
        session.add(self.tree)
        session.flush()

        # Now add in the node references
        self.nexus.name = self.tree.id
        assert self.tree.id is not None
        runtime().debug("Added tree", self.tree)
        nodes = list(TreeNodeFactory().create(self.nexus))
        for node in nodes:
            node.ancestor_node = node.ancestor.id if node.ancestor else None
            # This should add the new object into the session
            self.tree.nodes.append(node)
            # Flushed per node; presumably so that this node's id exists by
            # the time a later node reads it as ancestor.id - TODO confirm.
            session.flush()
        runtime().debug("Appended", len(nodes), "tree nodes")
        session.flush()

        # Now import the diagnostic characters and reference the nodes.
        biotree = TreeFactory(name_func=lambda node: str(node.id)).create(self.tree.nodes, self.tree.id)
        parser = DiagCharsParser(biotree)
        runtime().debug(self.treeDiagCharsFile)
        with open(self.treeDiagCharsFile) as handle:
            diagchars = list(parser.parse(handle))
        runtime().debug("DiagChars", len(diagchars))
        for d in diagchars:
            session.add(d)
        session.flush()

    def _codeml(self):
        """
        Parse the codeml file and store each model with its sites, mapping
        site columns back to original alignment indices. Does nothing when
        no codeml file was given.
        """
        if not self.codemlFile:
            return
        assert self.family.id is not None
        assert self.tree.id is not None

        # We need to convert the columns to the original alignment indices
        mapper = CulledColumnMapper(self.alignment, self.alignment.culled_columns)
        parser = PositiveSelectionParser()
        models = list(parser.parse(self.codemlFile))
        runtime().debug("Found", len(models), "models")
        for i, model in enumerate(models):
            model.tree_key = self.tree.id
            self.session.add(model)
            # Flush so model.id is available for the sites below.
            self.session.flush()
            ps = list(model.ps)
            runtime().debug("Found", len(ps), "sites in model", model.model)
            for j, site in enumerate(ps):
                site.codeml_key = model.id
                # Indices in CodeML start at 1, convert to 0 and then map
                orig = site.column
                site.column = mapper[site.column - 1]
                runtime().debug("column", orig, "mapped to", site.column, site.probability)
                try:
                    self.session.add(site)
                except Exception:
                    # Log which model/site failed, then let the error propagate.
                    runtime().debug(i, ":", j, " failure on column", orig, "mapped to", site.column, site.probability)
                    raise
            runtime().debug("Finished with model")
            self.session.flush()
        runtime().debug("finished import codeml")

    def _alignment(self):
        """
        Read the alignment, store it with one AlignmentProtein and one
        FamilyProtein per record, then store both culling logs.
        """
        from Bio import AlignIO
        from hpf.hddb.db import Alignment, AlignmentProtein, FamilyProtein
        from hpf.hddb.db import AlignmentColcull, AlignmentSeqcull

        session = self.session

        # Read the alignment
        with open(self.alignFile) as handle:
            align = AlignIO.read(handle, self.alignFormat)
        # Rename 'id' with the correct protein key
        for record in align:
            record.id = self._index(record.id)

        # Write to a text buffer and create the DB object
        text = StringIO()
        AlignIO.write([align], text, self.alignFormat)
        self.alignment = Alignment(
            family_key=self.family.id,
            format=self.alignFormat,
            filename=self.alignFile,
            text=text.getvalue(),
        )
        # Add to session and flush
        session.add(self.alignment)
        session.flush()

        # Flip through the proteins in the alignment and add
        # the records.
        for record in align:
            protein_key = record.id
            assert protein_key != 0 and protein_key is not None, protein_key
            runtime().debug("protein: ", protein_key)
            s = AlignmentProtein(alignment_key=self.alignment.id, protein_key=protein_key, sequence=str(record.seq))
            session.add(s)
            session.flush()
            # There may exist multiple alignments, but the definition
            # of membership in the family is done here.
            fs = FamilyProtein(family_key=self.family.id, protein_key=protein_key, seed=True)
            session.merge(fs)

        # Now read the column culling log. Indices start at 0 here.
        with open(self.alignColcullLog) as handle:
            for line in handle:
                if not line.strip():
                    # A blank (e.g. trailing) line cannot be unpacked.
                    continue
                # NOTE(review): the fourth field is stored as gap_percentage
                # and the second/third fields are ignored - confirm intended.
                column, _gap, _taxa, ratio = line.split()
                col = AlignmentColcull(alignment_key=self.alignment.id, column=column, gap_percentage=ratio)
                session.merge(col)

        with open(self.alignSeqcullLog) as handle:
            # rice#1182215 0.712765957446808
            for line in handle:
                parts = line.split()
                if not parts:
                    continue
                seq, score = parts
                seq = self._index(seq)
                if not seq.isdigit():
                    # Explicit error instead of a print statement followed by
                    # `assert false`, which referenced an undefined name.
                    raise ValueError("%r SEQ: %r" % (parts, seq))
                cul = AlignmentSeqcull(alignment_key=self.alignment.id, protein_key=seq, score=score)
                # Hand the row to the session, as done for AlignmentColcull;
                # previously the object was built and then dropped.
                session.merge(cul)
        session.flush()

    def _family(self):
        """Create the Family row for C{familyName} and flush it."""
        from hpf.hddb.db import Family

        session = self.session
        self.family = Family(name=self.familyName, experiment_key=0)
        session.add(self.family)
        session.flush()
### prune taxa we don't want ###
# Result: `badtaxa` lists every taxon to drop, i.e. taxa that are not among
# the values of `oldid_newid`, plus, for each species prefix, every taxon
# except the one with the smallest branch length.
alltaxa = mytreeobj.get_taxa()
# Build the lookup once instead of rescanning oldid_newid.values() per taxon.
wanted_taxa = set(oldid_newid.values())
badtaxa = []
# species prefix (text before '_bpgseq') -> (taxon, branch length) kept so far
slowest_inparalogs = {}
for taxon in alltaxa:
    if taxon not in wanted_taxa:
        badtaxa.append(taxon)
        continue

    sp = taxon.split('_bpgseq')[0]
    # Looked up once; previously recomputed when storing the tuple.
    new_brlen = mytreeobj.node(
        mytreeobj.search_taxon(taxon)).get_data().branchlength

    if sp not in slowest_inparalogs:
        slowest_inparalogs[sp] = (taxon, new_brlen)
        continue

    (old_taxon, old_brlen) = slowest_inparalogs[sp]
    if new_brlen < old_brlen:
        # This taxon has the shorter branch: keep it, drop the previous one.
        slowest_inparalogs[sp] = (taxon, new_brlen)
        badtaxa.append(old_taxon)
    else:
        badtaxa.append(taxon)
class OIDImporter(object):
    """
    Import a set of OID files into the database

    The constructor only records file locations; C{merge()} performs the
    import inside a single database session.
    """

    def __init__(
        self,
        familyName,
        alignFile,
        alignColcullLog,
        alignSeqcullLog,
        treeFile,
        treeDiagCharsFile,
        codemlFile=None,
        alignFormat="fasta",
        oid_key=None,
    ):
        """
        @param familyName: name used to look up / create the Family row
        @param alignFile: alignment file, read with Bio.AlignIO
        @param alignColcullLog: column culling log (four fields per line)
        @param alignSeqcullLog: sequence culling log (two fields per line)
        @param treeFile: tree file handed to Bio.Nexus.Trees.Tree
        @param treeDiagCharsFile: diagnostic characters file
        @param codemlFile: optional codeml output; skipped when falsy
        @param alignFormat: AlignIO format name for reading and writing
        @param oid_key: stored on the instance; not used within this class
        """
        self.familyName = familyName
        self.treeFile = treeFile
        self.treeDiagCharsFile = treeDiagCharsFile
        self.alignFile = alignFile
        self.alignColcullLog = alignColcullLog
        self.alignSeqcullLog = alignSeqcullLog
        self.codemlFile = codemlFile
        self.alignFormat = alignFormat
        self.oid_key = oid_key

    def merge(self):
        """
        Import the family, alignment and tree if no family with this name
        exists yet, then import codeml results if the tree has none.

        The session is committed on success and closed in every case, so a
        failed import no longer leaves the session open.
        """
        from hpf.hddb.db import Session, Family

        self.session = Session()
        try:
            self.family = self.session.query(Family).filter(Family.name == self.familyName).first()
            if not self.family:
                runtime().debug("Creating family", self.familyName)
                self._family()
                self._alignment()
                self._tree()
            else:
                # NOTE(review): this branch reads family.alignment while the
                # codeml check below reads family.alignments[0]; confirm
                # both are defined on Family and refer to the same row.
                self.alignment = self.family.alignment
                self.tree = self.alignment.tree
                runtime().debug("Found family", self.family.id)

            if not self.family.alignments[0].tree.codeml:
                runtime().debug("Importing codeml")
                self._codeml()
            else:
                runtime().debug("Already found codeml", self.family.alignments[0].tree.codeml.id)

            # Commit the session, close, and finish
            self.session.commit()
        finally:
            self.session.close()

    def _index(self, name):
        """
        Return the digits after the last '#' in C{name}, dropping one
        leading 'N' if present (e.g. "rice#1182215" gives "1182215").

        @raise AssertionError: if what remains is not all digits
        """
        n = name.split("#")[-1]
        if n.startswith("N"):
            n = n[1:]
        assert n.isdigit(), name
        return n

    def _tree(self):
        """
        Load the tree file, store it as a Tree row with its nodes, then
        parse and store the diagnostic characters.
        """
        from Bio.Nexus.Trees import Tree as NewickTree
        from hpf.hddb.db import Tree, TreeNodeFactory, TreeFactory
        from hpf.amnh.oid import DiagCharsParser

        session = self.session

        # Read through a context manager so the handle is always closed.
        with open(self.treeFile) as handle:
            tree_str = handle.read()
        self.nexus = NewickTree(tree_str)

        # Rename all the taxa.
        for node_id in self.nexus.get_terminals():
            node = self.nexus.node(node_id)
            node.data.taxon = self._index(node.data.taxon)

        # Create the DB object
        self.tree = Tree(
            alignment_key=self.alignment.id,
            text=self.nexus.to_string(plain=False, plain_newick=True),
            filename=self.treeFile,
        )
        session.add(self.tree)
        session.flush()

        # Now add in the node references
        self.nexus.name = self.tree.id
        assert self.tree.id is not None
        runtime().debug("Added tree", self.tree)
        nodes = list(TreeNodeFactory().create(self.nexus))
        for node in nodes:
            node.ancestor_node = node.ancestor.id if node.ancestor else None
            # This should add the new object into the session
            self.tree.nodes.append(node)
            # Flushed per node; presumably so that this node's id exists by
            # the time a later node reads it as ancestor.id - TODO confirm.
            session.flush()
        runtime().debug("Appended", len(nodes), "tree nodes")
        session.flush()

        # Now import the diagnostic characters and reference the nodes.
        biotree = TreeFactory(name_func=lambda node: str(node.id)).create(self.tree.nodes, self.tree.id)
        parser = DiagCharsParser(biotree)
        runtime().debug(self.treeDiagCharsFile)
        with open(self.treeDiagCharsFile) as handle:
            diagchars = list(parser.parse(handle))
        runtime().debug("DiagChars", len(diagchars))
        for d in diagchars:
            session.add(d)
        session.flush()

    def _codeml(self):
        """
        Parse the codeml file and store each model with its sites, mapping
        site columns back to original alignment indices. Does nothing when
        no codeml file was given.
        """
        if not self.codemlFile:
            return
        assert self.family.id is not None
        assert self.tree.id is not None

        # We need to convert the columns to the original alignment indices
        mapper = CulledColumnMapper(self.alignment, self.alignment.culled_columns)
        parser = PositiveSelectionParser()
        models = list(parser.parse(self.codemlFile))
        runtime().debug("Found", len(models), "models")
        for i, model in enumerate(models):
            model.tree_key = self.tree.id
            self.session.add(model)
            # Flush so model.id is available for the sites below.
            self.session.flush()
            ps = list(model.ps)
            runtime().debug("Found", len(ps), "sites in model", model.model)
            for j, site in enumerate(ps):
                site.codeml_key = model.id
                # Indices in CodeML start at 1, convert to 0 and then map
                orig = site.column
                site.column = mapper[site.column - 1]
                runtime().debug("column", orig, "mapped to", site.column, site.probability)
                try:
                    self.session.add(site)
                except Exception:
                    # Log which model/site failed, then let the error propagate.
                    runtime().debug(i, ":", j, " failure on column", orig, "mapped to", site.column, site.probability)
                    raise
            runtime().debug("Finished with model")
            self.session.flush()
        runtime().debug("finished import codeml")

    def _alignment(self):
        """
        Read the alignment, store it with one AlignmentProtein and one
        FamilyProtein per record, then store both culling logs.
        """
        from Bio import AlignIO
        from hpf.hddb.db import Alignment, AlignmentProtein, FamilyProtein
        from hpf.hddb.db import AlignmentColcull, AlignmentSeqcull

        session = self.session

        # Read the alignment
        with open(self.alignFile) as handle:
            align = AlignIO.read(handle, self.alignFormat)
        # Rename 'id' with the correct protein key
        for record in align:
            record.id = self._index(record.id)

        # Write to a text buffer and create the DB object
        text = StringIO()
        AlignIO.write([align], text, self.alignFormat)
        self.alignment = Alignment(
            family_key=self.family.id,
            format=self.alignFormat,
            filename=self.alignFile,
            text=text.getvalue(),
        )
        # Add to session and flush
        session.add(self.alignment)
        session.flush()

        # Flip through the proteins in the alignment and add
        # the records.
        for record in align:
            protein_key = record.id
            assert protein_key != 0 and protein_key is not None, protein_key
            runtime().debug("protein: ", protein_key)
            s = AlignmentProtein(alignment_key=self.alignment.id, protein_key=protein_key, sequence=str(record.seq))
            session.add(s)
            session.flush()
            # There may exist multiple alignments, but the definition
            # of membership in the family is done here.
            fs = FamilyProtein(family_key=self.family.id, protein_key=protein_key, seed=True)
            session.merge(fs)

        # Now read the column culling log. Indices start at 0 here.
        with open(self.alignColcullLog) as handle:
            for line in handle:
                if not line.strip():
                    # A blank (e.g. trailing) line cannot be unpacked.
                    continue
                # NOTE(review): the fourth field is stored as gap_percentage
                # and the second/third fields are ignored - confirm intended.
                column, _gap, _taxa, ratio = line.split()
                col = AlignmentColcull(alignment_key=self.alignment.id, column=column, gap_percentage=ratio)
                session.merge(col)

        with open(self.alignSeqcullLog) as handle:
            # rice#1182215 0.712765957446808
            for line in handle:
                parts = line.split()
                if not parts:
                    continue
                seq, score = parts
                seq = self._index(seq)
                if not seq.isdigit():
                    # Explicit error instead of a print statement followed by
                    # `assert false`, which referenced an undefined name.
                    raise ValueError("%r SEQ: %r" % (parts, seq))
                cul = AlignmentSeqcull(alignment_key=self.alignment.id, protein_key=seq, score=score)
                # Hand the row to the session, as done for AlignmentColcull;
                # previously the object was built and then dropped.
                session.merge(cul)
        session.flush()

    def _family(self):
        """Create the Family row for C{familyName} and flush it."""
        from hpf.hddb.db import Family

        session = self.session
        self.family = Family(name=self.familyName, experiment_key=0)
        session.add(self.family)
        session.flush()