def get_seed_or_collateral_tree(phyprot, phylome, p):
    """Return the best-likelihood tree containing `phyprot` in `phylome`.

    The tree in which `phyprot` is itself the seed is preferred. When there
    is none, the collateral trees of the same phylome are inspected and the
    one whose seed lies closest to `phyprot` (smallest tree distance) wins.

    Parameters:
        phyprot -- protein identifier to look up
        phylome -- phylome identifier used to query `p` and to filter
                   collateral seeds
        p       -- connection object providing `get_tree(protid, phylome)`
                   and `get_collateral_seeds(protid)`

    Returns a `(tree, seedid)` tuple:
        * `(PhyloTree, "")` when the seed tree of `phyprot` exists,
        * `(PhyloTree, seedid)` when a collateral tree was selected,
        * `(None, None)` when no tree could be found at all.
    """

    def _parse_best_tree(tdata):
        # tdata maps a method name to a dict holding at least 'lk' and
        # 'tree'; keep the tree of the method with the highest 'lk'.
        bestMethod = max(tdata, key=lambda method: tdata[method]['lk'])
        return ete2.PhyloTree(tdata[bestMethod]['tree'],
                              sp_naming_function=_get_spcode)

    tdata = p.get_tree(phyprot, phylome)
    if tdata:
        return _parse_best_tree(tdata), ""

    # no tree for protid as seed: look for collateral trees of this phylome
    seedids = [entry for entry in p.get_collateral_seeds(phyprot)
               if entry[2] == phylome]
    bestSeedid = None
    bestTree = None
    # Infinity instead of a magic cap, so a distant candidate is still
    # better than no candidate at all.
    smallestDistance = float('inf')
    for seedid, seedspcode, phyid in seedids:
        if not seedid.startswith("Phy"):
            seedid = "Phy%s_%s" % (seedid, seedspcode)
        tdata = p.get_tree(seedid, phylome)
        if not tdata:
            continue
        t = _parse_best_tree(tdata)
        d = t.get_distance(seedid, phyprot)
        # strict comparison: on ties the first candidate is kept
        if d < smallestDistance:
            smallestDistance = d
            bestTree = t
            bestSeedid = seedid
    return bestTree, bestSeedid
def reconcile_tree(gene_tree_file, reconciled_file, rec_tag, pfam_id, db):
    """Reconcile the gene tree of one Pfam family against the species tree.

    Steps, as far as the code shows:
      1. Decompress `<tree_folder>/<pfam_id>.nw.gz`, resolve polytomies and
         convert the tree to phyloXML.
      2. Annotate every leaf with an NCBI taxonomy id / scientific name and
         write the result to `<rec_tag>txid.xml`.
      3. Pickle the taxids and genes that map onto the species tree to
         `<rec_tag>ids.pickle`.
      4. Run Forester's GSDI on the annotated tree and gzip its output to
         `<reconciled_file>.gz`.

    Parameters:
        gene_tree_file  -- path prefix for the temporary, decompressed
                           newick file (a random numeric suffix is added)
        reconciled_file -- path of the GSDI output (stored gzipped)
        rec_tag         -- path prefix for the side files ('ids.pickle',
                           'txid.xml', GSDI log and species-tree copies)
        pfam_id         -- family identifier (file name, messages and key
                           into `pplacer_queries`)
        db              -- handle passed through to `find_tax_id_unip`,
                           `find_tax_name` and `find_best_taxid`

    Reads the module globals `pplacer_flag`, `pplacer_queries`,
    `tree_folder`, `all_species_txids`, `merged_taxid`, `lib_path` and
    `species_tree_data_path`; adds entries to the global `best_taxid_map`.
    Returns None. `subprocess.check_call` raises on a failing shell command,
    while `os.system` failures are ignored.
    """
    # Early exit: with a previous pickle and pplacer_flag == 1, nothing has to
    # be done when every queried gene is already listed as placed.
    if (os.path.isfile(rec_tag + 'ids.pickle')) and (pplacer_flag == 1):
        # NOTE(review): the file object is never closed explicitly.
        id_information = pickle.load(open(rec_tag + 'ids.pickle', 'rb'))
        existing_genes = id_information['existing_genes']
        # NOTE(review): Sequnces and p_ids are never used afterwards.
        Sequnces = []
        p_ids = []
        new_genes = set([w['id'] for w in pplacer_queries[pfam_id]])
        if not (new_genes - set(existing_genes)):
            print "All %s Genes for family %s have already been placed in the reconciled tree." % (
                len(new_genes), pfam_id)
            print "Skip Reconciliation for %s" % pfam_id
            return
    txid_file = rec_tag + 'txid.xml'
    # (Re)build when either output is missing or when pplacer_flag == 1.
    if not (os.path.isfile(rec_tag + 'ids.pickle')) or not (
            os.path.isfile(reconciled_file + '.gz')) or (pplacer_flag == 1):
        print "Running Reconciliation for: %s" % pfam_id
        # Random suffix for the temporary decompressed tree file.
        rand_id = random.randint(1000000, 9999999)
        subprocess.check_call("gunzip -c %s/%s.nw.gz > %s.%d" %
                              (tree_folder, pfam_id, gene_tree_file, rand_id),
                              shell=True)
        tree = ete2.PhyloTree('%s.%d' % (gene_tree_file, rand_id), format=0)
        tree.resolve_polytomy()
        tree.write(format=0, outfile=txid_file + '.tmp.nw')
        if os.path.exists('%s.%d' % (gene_tree_file, rand_id)):
            subprocess.check_call("rm %s.%d" % (gene_tree_file, rand_id),
                                  shell=True)
        # newick -> phyloXML, then re-read it to attach taxonomy information
        Phylo.convert(txid_file + '.tmp.nw', 'newick', txid_file + '.tmp.xml',
                      'phyloxml')
        treexml = PhyloXMLIO.read(open(txid_file + '.tmp.xml', 'r'))
        tree = treexml[0]
        treexml.attributes.pop('schemaLocation',
                               None)  # not supported by Forester
        tree.rooted = True
        my_ids = set([])  # every taxid assigned to a leaf
        my_query_by_taxid = {}  # taxid -> list of leaf names carrying it
        for leaf in tree.clade.find_clades(terminal=True):
            # the leaf identifier is the part of the name before the first '/'
            up_name = leaf.name.split('/')[0]
            tax_id, tax_name = find_tax_id_unip(up_name, db)
            if tax_id not in all_species_txids:
                # Not in the species tree: first follow merged_taxid ...
                if tax_id in merged_taxid.keys():
                    tax_id = merged_taxid[tax_id]
                    tax_name = find_tax_name(tax_id, db)
                # ... then reuse a cached replacement or look one up.
                if tax_id in best_taxid_map.keys():
                    tax_id = best_taxid_map[tax_id]
                    tax_name = find_tax_name(tax_id, db)
                else:
                    tax_id0 = tax_id
                    tax_id, tax_name = find_best_taxid(tax_id, db)
                    # only positive results are cached
                    # (presumably non-positive means "not found" - TODO confirm)
                    if tax_id > 0:
                        best_taxid_map[tax_id0] = tax_id
            # A negative id is retried through merged_taxid by absolute value.
            if tax_id < 0:
                if (-tax_id) in merged_taxid.keys():
                    tax_id = merged_taxid[-tax_id]
                    tax_name = find_tax_name(tax_id, db)
            if tax_id in my_query_by_taxid:
                my_query_by_taxid[tax_id].append(up_name)
            else:
                my_query_by_taxid[tax_id] = [up_name]
            my_ids.add(tax_id)
            my_tax_id = PhyloXML.Id(tax_id, provider='ncbi_taxonomy')
            taxon = PhyloXML.Taxonomy(id=my_tax_id)
            taxon.scientific_name = tax_name
            leaf._set_taxonomy(taxon)
        # NOTE(review): the output handle is not closed explicitly before the
        # Java process below reads txid_file.
        PhyloXMLIO.write(treexml, open(txid_file, 'w'))
        os.system('rm ' + txid_file + '.tmp.nw')
        os.system('rm ' + txid_file + '.tmp.xml')
        print "Taxid file done for: %s" % pfam_id
        # keep only the taxids (and their genes) present in the species tree
        existing_ids = list(set(my_ids) & set(all_species_txids))
        existing_genes = [
            g for txid in my_query_by_taxid.keys()
            for g in my_query_by_taxid[txid] if txid in existing_ids
        ]
        pickle.dump(
            {
                'pfam_id': pfam_id,
                'existing_ids': existing_ids,
                'existing_genes': existing_genes
            }, open(rec_tag + 'ids.pickle', 'wb'))
        print "Pickle file done for: %s" % pfam_id
        # NOTE(review): the statements below are assumed to belong to this
        # rebuild branch, because txid_file is only produced here - confirm
        # against the original indentation.
        if os.path.exists(reconciled_file):
            os.system('rm ' + reconciled_file)
        # Forester GSDI: reconcile the annotated gene tree with the species tree
        os.system(
            "java -Xmx4g -cp %s/forester_1038.jar org.forester.application.gsdi -g %s %s/ncbi_2_fixed.xml %s"
            % (lib_path, txid_file, species_tree_data_path, reconciled_file))
        # compress the result only when GSDI actually produced an output file
        if os.path.exists(reconciled_file):
            if os.path.exists(reconciled_file + '.gz'):
                subprocess.check_call("rm %s.gz" % (reconciled_file),
                                      shell=True)
            subprocess.check_call("gzip %s" % (reconciled_file), shell=True)
        # remove GSDI side files and the intermediate taxid file
        os.system('rm ' + rec_tag + 'reconciled_species_tree_used.xml')
        os.system('rm ' + rec_tag + 'reconciled_gsdi_log.txt')
        os.system('rm ' + txid_file)
        print "Reconciliation file done for: %s" % pfam_id
print "We will run on All %s Pfam Families of your novel species." % len( pfams) pfds = get_pfds(pfams, db_mysql) tree_sizes = {} for p in pfds.keys(): tree_sizes[p] = pfds[p]['num_full'] sorted_fams = sorted(pfds.keys(), key=lambda k: pfds[k]['num_full']) print "Number of families to process:", len(sorted_fams) print "\n\n--------------Reading the species tree data------------" # ##3-Extract tree information for each Pfam family all_species_txids_pickled = species_tree_data_path + '/all_species_txids.pickle' all_species_txids = pickle.load(open(all_species_txids_pickled)) orig_sp_tree_0 = ete2.PhyloTree(species_tree_data_path + '/ncbi.nw', format=0) zero_taxids = { 'HUMAN': 9606, '9CAUD': 70702, 'BABHY': 37552, 'ARATH': 3702, '9STAP': 1077965, 'SALTM': 99287, '9MYCO': 512402, '9RETR': 31697, 'BEABA': 176275, '9EURO': 1194637, '9BACE': 871324, '9CAEN': 1181719 }
def _make_tree_figure(self, tree, fig, colors, orders, root_name, scale=None,
                      branch_vert_margin=None, fontsize=12, show_names=True,
                      name_field='seq_id', rename_function=None,
                      color_node_labels=False, label_colors=None,
                      tree_orientation=0, min_order_fraction=0.1,
                      show_root_name=False, chain=None, linked_alignment=None,
                      alignment_fontsize=11, alignment_height=50,
                      alignment_width=50, compact_alignment=False,
                      scale_factor=1, linewidth=1, show_scale=False):
    """Style a tree with ete2 and render it to `fig`.

    Parameters:
        tree      -- tree input handed to `ete2.Tree` (or `ete2.PhyloTree`
                     when `linked_alignment` is given)
        fig       -- render target passed to `t.render()`
        colors    -- with `orders`: color indexed by order value; without
                     `orders`: mapping of node name -> color
        orders    -- mapping of leaf name -> order value, or None
        root_name -- name of the node used as outgroup
        scale, branch_vert_margin, tree_orientation, show_scale
                  -- copied onto the `ete2.TreeStyle` (scale as int, margin
                     as float; both skipped when None)
        fontsize  -- font size of the name labels
        show_names -- True: label every pair of `self.pairs` that has the
                     selected chain (heavy when `chain == 'heavy'`, light
                     otherwise), using `name_field`; False: no labels;
                     otherwise an iterable of node names to label
        rename_function -- optional callable mapping a node name to the
                     text that is displayed
        color_node_labels, label_colors
                  -- when `color_node_labels` is set, labels use the branch
                     color (`label_colors` None), a per-name color
                     (`label_colors` dict) or the branch color only for
                     names contained in `label_colors` (list/tuple);
                     everything else is black
        min_order_fraction -- a node takes the color of the first order (in
                     sorted order) whose share of the node's leaves reaches
                     this fraction
        show_root_name -- also label the root node
        linked_alignment, alignment_fontsize, alignment_height,
        alignment_width, scale_factor
                  -- alignment (FASTA) linked to the tree plus features
                     attached to every node for the alignment layout
        compact_alignment -- accepted for compatibility; not used here
        linewidth -- width of horizontal and vertical branch lines

    Returns None; the figure is written by `t.render()`.
    """
    default_color = '#000000'

    # Work on a private collection so that a list passed by the caller is
    # never modified (and tuples are accepted as well). A set also makes the
    # per-node membership test constant time.
    if show_names is True:
        if chain == 'heavy':
            names = [pair.heavy[name_field] for pair in self.pairs
                     if pair.heavy is not None]
        else:
            names = [pair.light[name_field] for pair in self.pairs
                     if pair.light is not None]
    elif show_names is False:
        names = []
    else:
        names = show_names
    names_to_show = set(names)
    if show_root_name is True:
        names_to_show.add(root_name)

    if linked_alignment is not None:
        t = ete2.PhyloTree(tree, alignment=linked_alignment,
                           alg_format='fasta')
        # NOTE: replaces ete2's SequenceItem for the whole process, not just
        # for this tree.
        ete2.faces.SequenceItem = MySequenceItem
    else:
        t = ete2.Tree(tree)
    t.set_outgroup(t & root_name)

    # style the nodes
    for node in t.traverse():
        if orders is not None:
            leaves = node.get_leaf_names()
            order_count = Counter(orders[leaf] for leaf in leaves)
            # Fall back to black when no order is frequent enough; otherwise
            # the color of the previously visited node would leak into this
            # one (or be undefined for the first node).
            color = default_color
            for order in sorted(order_count):
                if float(order_count[order]) / len(leaves) >= min_order_fraction:
                    color = colors[order]
                    break
        else:
            color = colors.get(node.name, default_color)

        if linked_alignment is not None:
            node.add_feature('aln_fontsize', alignment_fontsize)
            node.add_feature('aln_height', alignment_height)
            node.add_feature('aln_width', alignment_width)
            node.add_feature('fontsize', fontsize)
            node.add_feature('format', 'seq')
            node.add_feature('scale_factor', scale_factor)

        style = ete2.NodeStyle()
        style['size'] = 0
        style['vt_line_width'] = float(linewidth)
        style['hz_line_width'] = float(linewidth)
        style['vt_line_color'] = color
        style['hz_line_color'] = color
        style['vt_line_type'] = 0
        style['hz_line_type'] = 0

        if node.name in names_to_show:
            node_color = default_color
            if color_node_labels:
                if label_colors is None:
                    node_color = color
                elif isinstance(label_colors, dict):
                    node_color = label_colors.get(node.name, default_color)
                elif isinstance(label_colors, (list, tuple)):
                    if node.name in label_colors:
                        node_color = color
            if rename_function is None:
                node_name = node.name
            else:
                node_name = rename_function(node.name)
            tf = ete2.TextFace(node_name, fsize=fontsize, fgcolor=node_color)
            node.add_face(tf, column=0)
        node.set_style(style)

    t.dist = 0
    ts = ete2.TreeStyle()
    if linked_alignment is not None:
        ts.layout_fn = self._phyloalignment_layout_function
    ts.orientation = tree_orientation
    ts.show_leaf_name = False
    if scale is not None:
        ts.scale = int(scale)
    if branch_vert_margin is not None:
        ts.branch_vertical_margin = float(branch_vert_margin)
    ts.show_scale = show_scale
    # ladderize
    t.ladderize()
    # render the tree
    t.render(fig, tree_style=ts)
raise Exception(cmd) cmd = 'rm %s/ncbi_taxonomy/syn.tab' % (lib_path) retcode = subprocess.call(cmd, shell=True) if not retcode == 0: raise Exception(cmd) cmd = 'rm %s/ncbi_taxonomy/taxa.sqlite' % (lib_path) retcode = subprocess.call(cmd, shell=True) if not retcode == 0: raise Exception(cmd) cmd = 'mv %s/ncbi_taxonomy/ncbi.nw %s/ncbi.nw' % (lib_path, tmp_folder) retcode = subprocess.call(cmd, shell=True) if not retcode == 0: raise Exception(cmd) sp_tree = ete2.PhyloTree(tmp_folder + "/ncbi.nw", format=0) sp_tree.write(format=0, outfile=tmp_folder + '/ncbi_2.nw') os.system( "java -Xmx4g -cp %s/forester_1038.jar org.forester.application.phyloxml_converter -f=nn %s/ncbi_2.nw %s/ncbi_2.xml" % (lib_path, tmp_folder, tmp_folder)) print "Add taxid information" prepare_species_tree(FILE_TREE_IN=tmp_folder + '/ncbi_2.xml', FILE_TREE_OUT=tmp_folder + '/ncbi_2_fixed.xml') print "Pickle all_species_txids" sp_tree_org = Phylo.read(tmp_folder + '/ncbi_2_fixed.xml', 'phyloxml') a1 = [
try: from rooted_phylomes import ROOTED_PHYLOMES rooters = True except: sys.stderr.write("No rooters (rooted_phylomes.py)\n") try: phyid, seed = int(sys.argv[1]), sys.argv[2] except: sys.stderr.write("No phylome_id and/or seed given\n") rooters = False def _get_spcode(protid): """Species naming function compatible with phylome_db3""" return protid.split('_')[-1] for i, nw in enumerate(sys.stdin, 1): if not i%100: sys.stderr.write(" %i \r"%i) t = ete2.PhyloTree(nw) t.set_species_naming_function(_get_spcode) try: seedNode = t.get_leaves_by_name(seed)[0] seedNode.get_farthest_oldest_node(ROOTED_PHYLOMES[phyid]) if rooters: sys.stderr.write("[WARNING] Cannot root tree %s with rooter\n"%i) except: t.set_outgroup(t.get_midpoint_outgroup()) print t.write(format=9)