예제 #1
0
def get_seed_or_collateral_tree(phyprot, phylome, p):
    """Load the best-likelihood tree for ``phyprot`` within ``phylome``.

    The tree seeded by ``phyprot`` itself is used when ``p.get_tree`` returns
    one.  Otherwise every collateral seed of ``phyprot`` belonging to
    ``phylome`` is tried, and the tree in which the seed is closest to
    ``phyprot`` (``get_distance`` on its best-likelihood tree) is kept.

    Returns a ``(tree, seedid)`` tuple.  ``tree`` is an ``ete2.PhyloTree`` or
    ``None`` when no tree data was found.  ``seedid`` is ``""`` when the seed
    tree of ``phyprot`` was used, the selected collateral seed id when a
    collateral tree was used, and ``None`` when no collateral tree was found.
    """

    def _to_phylotree(method2tree):
        # Rank reconstruction methods by likelihood, best first, and parse
        # the newick string of the winner.
        ranked = sorted(method2tree,
                        key=lambda method: method2tree[method]['lk'],
                        reverse=True)
        return ete2.PhyloTree(method2tree[ranked[0]]['tree'],
                              sp_naming_function=_get_spcode)

    chosen_seed = ""
    trees = p.get_tree(phyprot, phylome)

    if not trees:
        # No tree seeded by phyprot: fall back to collateral trees of the
        # requested phylome only.
        chosen_seed = None
        chosen_trees = None
        shortest = 999999
        candidates = [hit for hit in p.get_collateral_seeds(phyprot)
                      if hit[2] == phylome]
        for seedid, seedspcode, _phyid in candidates:
            # Bare ids are expanded to the "Phy<id>_<species>" form.
            if not seedid.startswith("Phy"):
                seedid = "Phy%s_%s" % (seedid, seedspcode)
            seed_trees = p.get_tree(seedid, phylome)
            if not seed_trees:
                continue
            distance = _to_phylotree(seed_trees).get_distance(seedid, phyprot)
            if distance < shortest:
                shortest = distance
                chosen_trees = seed_trees
                chosen_seed = seedid
        trees = chosen_trees

    if not trees:
        return None, chosen_seed

    return _to_phylotree(trees), chosen_seed
예제 #2
0
def _remove_quietly(path):
    """Delete ``path``; a file that cannot be removed is silently ignored."""
    try:
        os.remove(path)
    except OSError:
        pass


def reconcile_tree(gene_tree_file, reconciled_file, rec_tag, pfam_id, db):
    """Annotate a family's gene tree with taxonomy ids and run Forester GSDI.

    Steps performed:

    1. If ``<rec_tag>ids.pickle`` exists and ``pplacer_flag == 1``, return
       early when every gene listed in ``pplacer_queries[pfam_id]`` is already
       recorded in that pickle.
    2. Unless both the pickle and ``<reconciled_file>.gz`` exist and
       ``pplacer_flag != 1``: gunzip ``<tree_folder>/<pfam_id>.nw.gz``,
       resolve polytomies, convert to phyloXML, attach an NCBI taxonomy id and
       scientific name to every leaf, write ``<rec_tag>txid.xml`` and refresh
       the pickle.
    3. Run ``org.forester.application.gsdi`` on ``<rec_tag>txid.xml`` against
       ``<species_tree_data_path>/ncbi_2_fixed.xml``, gzip the result and
       delete the intermediate files.

    Args:
        gene_tree_file: path prefix used only for the temporary gunzipped
            tree (``<gene_tree_file>.<random int>``).
        reconciled_file: path of the GSDI output; replaced by
            ``<reconciled_file>.gz`` on success.
        rec_tag: path prefix for the pickle, the taxid XML and GSDI by-products.
        pfam_id: family identifier, used in file names and log messages.
        db: handle forwarded to ``find_tax_id_unip`` / ``find_tax_name`` /
            ``find_best_taxid``.

    Reads the module globals ``pplacer_flag``, ``pplacer_queries``,
    ``tree_folder``, ``all_species_txids``, ``merged_taxid``, ``lib_path`` and
    ``species_tree_data_path``; adds entries to ``best_taxid_map``.

    Raises:
        subprocess.CalledProcessError: if ``gunzip``, ``gzip`` or the removal
            of a stale ``.gz`` fails.
    """
    ids_pickle = rec_tag + 'ids.pickle'

    if os.path.isfile(ids_pickle) and (pplacer_flag == 1):
        with open(ids_pickle, 'rb') as handle:
            id_information = pickle.load(handle)
        existing_genes = id_information['existing_genes']
        new_genes = set(w['id'] for w in pplacer_queries[pfam_id])
        if not (new_genes - set(existing_genes)):
            print("All %s Genes for family %s have already been placed in the reconciled tree." % (
                len(new_genes), pfam_id))
            print("Skip Reconciliation for %s" % pfam_id)
            return

    txid_file = rec_tag + 'txid.xml'
    if (not os.path.isfile(ids_pickle)
            or not os.path.isfile(reconciled_file + '.gz')
            or (pplacer_flag == 1)):
        print("Running Reconciliation for: %s" % pfam_id)

        # Decompress into a uniquely named scratch file; it is removed even
        # when parsing fails so aborted runs do not leave files behind.
        rand_id = random.randint(1000000, 9999999)
        tmp_tree_file = '%s.%d' % (gene_tree_file, rand_id)
        try:
            subprocess.check_call("gunzip -c %s/%s.nw.gz > %s" %
                                  (tree_folder, pfam_id, tmp_tree_file),
                                  shell=True)
            tree = ete2.PhyloTree(tmp_tree_file, format=0)
            tree.resolve_polytomy()
            tree.write(format=0, outfile=txid_file + '.tmp.nw')
        finally:
            _remove_quietly(tmp_tree_file)

        Phylo.convert(txid_file + '.tmp.nw', 'newick', txid_file + '.tmp.xml',
                      'phyloxml')
        with open(txid_file + '.tmp.xml', 'r') as handle:
            treexml = PhyloXMLIO.read(handle)
        tree = treexml[0]
        treexml.attributes.pop('schemaLocation',
                               None)  # not supported by Forester
        tree.rooted = True
        my_ids = set()
        my_query_by_taxid = {}
        for leaf in tree.clade.find_clades(terminal=True):
            # Leaf names look like "<name>/<rest>"; only the part before the
            # first slash is used for the taxonomy lookup.
            up_name = leaf.name.split('/')[0]
            tax_id, tax_name = find_tax_id_unip(up_name, db)
            if tax_id not in all_species_txids:
                # NOTE(review): after the merged_taxid remap the id is not
                # re-checked against all_species_txids before the best-taxid
                # lookup below - confirm this is intended.
                if tax_id in merged_taxid:
                    tax_id = merged_taxid[tax_id]
                    tax_name = find_tax_name(tax_id, db)
                if tax_id in best_taxid_map:
                    tax_id = best_taxid_map[tax_id]
                    tax_name = find_tax_name(tax_id, db)
                else:
                    tax_id0 = tax_id
                    tax_id, tax_name = find_best_taxid(tax_id, db)
                    if tax_id > 0:
                        # Cache the mapping for later leaves and families.
                        best_taxid_map[tax_id0] = tax_id
            if tax_id < 0:
                if (-tax_id) in merged_taxid:
                    tax_id = merged_taxid[-tax_id]
                    tax_name = find_tax_name(tax_id, db)
            my_query_by_taxid.setdefault(tax_id, []).append(up_name)
            my_ids.add(tax_id)
            my_tax_id = PhyloXML.Id(tax_id, provider='ncbi_taxonomy')
            taxon = PhyloXML.Taxonomy(id=my_tax_id)
            taxon.scientific_name = tax_name
            leaf._set_taxonomy(taxon)
        # Close the handle explicitly so the XML is fully flushed before the
        # external Java process reads it.
        with open(txid_file, 'w') as handle:
            PhyloXMLIO.write(treexml, handle)
        _remove_quietly(txid_file + '.tmp.nw')
        _remove_quietly(txid_file + '.tmp.xml')
        print("Taxid file done for: %s" % pfam_id)

        known_ids = set(my_ids) & set(all_species_txids)
        existing_ids = list(known_ids)
        existing_genes = [
            g for txid, genes in my_query_by_taxid.items()
            if txid in known_ids for g in genes
        ]
        with open(ids_pickle, 'wb') as handle:
            pickle.dump(
                {
                    'pfam_id': pfam_id,
                    'existing_ids': existing_ids,
                    'existing_genes': existing_genes
                }, handle)
        print("Pickle file done for: %s" % pfam_id)

    # NOTE(review): when the branch above is skipped, txid_file is not
    # regenerated here although it is deleted at the end of every run -
    # confirm GSDI is expected to run in that case.
    _remove_quietly(reconciled_file)
    os.system(
        "java -Xmx4g -cp %s/forester_1038.jar org.forester.application.gsdi -g %s %s/ncbi_2_fixed.xml %s"
        % (lib_path, txid_file, species_tree_data_path, reconciled_file))
    if os.path.exists(reconciled_file):
        # gzip refuses to overwrite, so drop any stale archive first.
        if os.path.exists(reconciled_file + '.gz'):
            subprocess.check_call("rm  %s.gz" % (reconciled_file), shell=True)
        subprocess.check_call("gzip %s" % (reconciled_file), shell=True)
    _remove_quietly(rec_tag + 'reconciled_species_tree_used.xml')
    _remove_quietly(rec_tag + 'reconciled_gsdi_log.txt')
    _remove_quietly(txid_file)
    print("Reconciliation file done for: %s" % pfam_id)
예제 #3
0
            print "We will run on All %s Pfam Families of your novel species." % len(
                pfams)

    pfds = get_pfds(pfams, db_mysql)

    tree_sizes = {}
    for p in pfds.keys():
        tree_sizes[p] = pfds[p]['num_full']
    sorted_fams = sorted(pfds.keys(), key=lambda k: pfds[k]['num_full'])
    print "Number of families to process:", len(sorted_fams)

    print "\n\n--------------Reading the species tree data------------"
    # ##3-Extract tree information for each Pfam family
    all_species_txids_pickled = species_tree_data_path + '/all_species_txids.pickle'
    all_species_txids = pickle.load(open(all_species_txids_pickled))
    orig_sp_tree_0 = ete2.PhyloTree(species_tree_data_path + '/ncbi.nw',
                                    format=0)

    zero_taxids = {
        'HUMAN': 9606,
        '9CAUD': 70702,
        'BABHY': 37552,
        'ARATH': 3702,
        '9STAP': 1077965,
        'SALTM': 99287,
        '9MYCO': 512402,
        '9RETR': 31697,
        'BEABA': 176275,
        '9EURO': 1194637,
        '9BACE': 871324,
        '9CAEN': 1181719
    }
예제 #4
0
 def _make_tree_figure(self,
                       tree,
                       fig,
                       colors,
                       orders,
                       root_name,
                       scale=None,
                       branch_vert_margin=None,
                       fontsize=12,
                       show_names=True,
                       name_field='seq_id',
                       rename_function=None,
                       color_node_labels=False,
                       label_colors=None,
                       tree_orientation=0,
                       min_order_fraction=0.1,
                       show_root_name=False,
                       chain=None,
                       linked_alignment=None,
                       alignment_fontsize=11,
                       alignment_height=50,
                       alignment_width=50,
                       compact_alignment=False,
                       scale_factor=1,
                       linewidth=1,
                       show_scale=False):
     """Style an ete2 tree and render it to ``fig``.

     The tree is built from ``tree`` (as ``ete2.PhyloTree`` when
     ``linked_alignment`` is given, ``ete2.Tree`` otherwise), rooted on the
     node named ``root_name``, ladderized and rendered.

     Args:
         tree: value handed to the ete2 tree constructor.
         fig: render target handed to ``t.render``.
         colors: mapping of order -> colour when ``orders`` is given,
             otherwise of node name -> colour (missing names are black).
         orders: mapping of leaf name -> order, or ``None``.  When given, a
             node takes the colour of the lowest-sorting order that covers at
             least ``min_order_fraction`` of its leaves; if no order
             qualifies the node is drawn black.
         root_name: name of the node used as outgroup.
         scale: if not ``None``, ``int(scale)`` becomes the tree-style scale.
         branch_vert_margin: if not ``None``, the branch vertical margin.
         fontsize: font size of node labels.
         show_names: ``True`` labels every pair's sequence (heavy chain when
             ``chain == 'heavy'``, light chain otherwise, read from
             ``name_field``); ``False`` labels nothing; any other iterable
             is taken as the names to label and is not modified.
         name_field: key read from each pair's chain when ``show_names`` is
             ``True``.
         rename_function: optional callable applied to a node name to get the
             displayed label.
         color_node_labels: colour labels instead of drawing them black.
         label_colors: ``None`` (use the branch colour), a dict of
             name -> colour, or a list/tuple of names that get the branch
             colour; only consulted when ``color_node_labels`` is true.
         tree_orientation: tree-style orientation value.
         min_order_fraction: see ``orders``.
         show_root_name: also label the root node.
         chain: ``'heavy'`` selects heavy chains for default labels.
         linked_alignment: optional FASTA alignment linked to the tree; when
             given, alignment display features are attached to every node,
             ``ete2.faces.SequenceItem`` is replaced by ``MySequenceItem``
             and ``self._phyloalignment_layout_function`` is used as layout.
         alignment_fontsize, alignment_height, alignment_width, scale_factor:
             stored as node features when ``linked_alignment`` is given.
         compact_alignment: accepted but not used by this method.
         linewidth: width of horizontal and vertical branch lines.
         show_scale: whether the scale bar is drawn.
     """
     black = '#000000'

     if show_names is True:
         if chain == 'heavy':
             show_names = [
                 p.heavy[name_field] for p in self.pairs
                 if p.heavy is not None
             ]
         else:
             show_names = [
                 p.light[name_field] for p in self.pairs
                 if p.light is not None
             ]
     elif show_names is False:
         show_names = []
     else:
         # Work on a copy so adding the root name below never mutates the
         # caller's list (and tuples are accepted too).
         show_names = list(show_names)
     if show_root_name is True:
         show_names.append(root_name)
     # Constant-time membership test inside the per-node loop.
     labelled = frozenset(show_names)

     if linked_alignment is not None:
         t = ete2.PhyloTree(tree,
                            alignment=linked_alignment,
                            alg_format='fasta')
         # Module-wide swap of ete2's sequence item for the custom one.
         ete2.faces.SequenceItem = MySequenceItem
     else:
         t = ete2.Tree(tree)
     t.set_outgroup(t & root_name)

     # style the nodes
     for node in t.traverse():
         # Start from black so a node with no qualifying order neither
         # raises NameError nor inherits the previous node's colour.
         color = black
         if orders is not None:
             leaves = node.get_leaf_names()
             order_count = Counter(orders[l] for l in leaves)
             for order in sorted(order_count):
                 if float(order_count[order]) / len(
                         leaves) >= min_order_fraction:
                     color = colors[order]
                     break
         else:
             color = colors.get(node.name, black)

         if linked_alignment is not None:
             node.add_feature('aln_fontsize', alignment_fontsize)
             node.add_feature('aln_height', alignment_height)
             node.add_feature('aln_width', alignment_width)
             node.add_feature('fontsize', fontsize)
             node.add_feature('format', 'seq')
             node.add_feature('scale_factor', scale_factor)

         style = ete2.NodeStyle()
         style['size'] = 0
         style['vt_line_width'] = float(linewidth)
         style['hz_line_width'] = float(linewidth)
         style['vt_line_color'] = color
         style['hz_line_color'] = color
         style['vt_line_type'] = 0
         style['hz_line_type'] = 0

         if node.name in labelled:
             node_color = black
             if color_node_labels:
                 if label_colors is None:
                     node_color = color
                 elif isinstance(label_colors, dict):
                     node_color = label_colors.get(node.name, black)
                 elif isinstance(label_colors, (list, tuple)):
                     if node.name in label_colors:
                         node_color = color
             if rename_function is None:
                 node_name = node.name
             else:
                 node_name = rename_function(node.name)
             tf = ete2.TextFace(node_name,
                                fsize=fontsize,
                                fgcolor=node_color)
             node.add_face(tf, column=0)
         node.set_style(style)

     t.dist = 0
     ts = ete2.TreeStyle()
     if linked_alignment is not None:
         ts.layout_fn = self._phyloalignment_layout_function
     ts.orientation = tree_orientation
     # Labels are drawn through explicit TextFaces above.
     ts.show_leaf_name = False
     if scale is not None:
         ts.scale = int(scale)
     if branch_vert_margin is not None:
         ts.branch_vertical_margin = float(branch_vert_margin)
     ts.show_scale = show_scale
     # ladderize
     t.ladderize()
     # render the tree
     t.render(fig, tree_style=ts)
예제 #5
0
                    raise Exception(cmd)
                cmd = 'rm  %s/ncbi_taxonomy/syn.tab' % (lib_path)
                retcode = subprocess.call(cmd, shell=True)
                if not retcode == 0:
                    raise Exception(cmd)
                cmd = 'rm  %s/ncbi_taxonomy/taxa.sqlite' % (lib_path)
                retcode = subprocess.call(cmd, shell=True)
                if not retcode == 0:
                    raise Exception(cmd)
                cmd = 'mv  %s/ncbi_taxonomy/ncbi.nw %s/ncbi.nw' % (lib_path,
                                                                   tmp_folder)
                retcode = subprocess.call(cmd, shell=True)
                if not retcode == 0:
                    raise Exception(cmd)

                sp_tree = ete2.PhyloTree(tmp_folder + "/ncbi.nw", format=0)
                sp_tree.write(format=0, outfile=tmp_folder + '/ncbi_2.nw')

                os.system(
                    "java -Xmx4g -cp %s/forester_1038.jar org.forester.application.phyloxml_converter -f=nn %s/ncbi_2.nw %s/ncbi_2.xml"
                    % (lib_path, tmp_folder, tmp_folder))

                print "Add taxid information"
                prepare_species_tree(FILE_TREE_IN=tmp_folder + '/ncbi_2.xml',
                                     FILE_TREE_OUT=tmp_folder +
                                     '/ncbi_2_fixed.xml')

                print "Pickle all_species_txids"
                sp_tree_org = Phylo.read(tmp_folder + '/ncbi_2_fixed.xml',
                                         'phyloxml')
                a1 = [
예제 #6
0
# Optional per-phylome rooting data.  Without it (or without command-line
# arguments) every name used by the rooting loop is still defined, and the
# trees simply fall back to midpoint rooting.
try:
    from rooted_phylomes import ROOTED_PHYLOMES
    rooters = True
except ImportError:
    sys.stderr.write("No rooters (rooted_phylomes.py)\n")
    ROOTED_PHYLOMES = {}
    rooters = False

# Expected invocation: <script> <phylome_id:int> <seed_id>
try:
    phyid, seed = int(sys.argv[1]), sys.argv[2]
except (IndexError, ValueError):
    sys.stderr.write("No phylome_id and/or seed given\n")
    phyid, seed = None, None
    rooters = False

def _get_spcode(protid):
    """Species naming function compatible with phylome_db3"""
    return protid.split('_')[-1]

# Read one newick tree per stdin line, root it, and print its topology.
for i, nw in enumerate(sys.stdin, 1):
    # Progress counter on stderr every 100 trees.
    if not i % 100:
        sys.stderr.write(" %i   \r" % i)
    t = ete2.PhyloTree(nw)
    t.set_species_naming_function(_get_spcode)
    try:
        # Preferred rooting: the farthest, oldest node relative to the seed,
        # using the phylome's species-age table.  Previously this node was
        # computed and discarded, leaving the tree unrooted.
        seedNode = t.get_leaves_by_name(seed)[0]
        t.set_outgroup(
            seedNode.get_farthest_oldest_node(ROOTED_PHYLOMES[phyid]))
    except Exception:
        # Missing seed leaf, missing phylome entry, undefined rooting data or
        # an unusable outgroup: warn (only when rooters were expected to
        # work) and fall back to midpoint rooting.  `rooters` may be
        # undefined if the optional import failed, hence the globals() lookup.
        if globals().get("rooters"):
            sys.stderr.write("[WARNING] Cannot root tree %s with rooter\n" % i)
        t.set_outgroup(t.get_midpoint_outgroup())
    print(t.write(format=9))