예제 #1
0
def name_ancestors(timetreefile, to_table=False, ete3_algo=False, uniq=True):
    """Annotate the internal nodes of a time tree with NCBI taxonomy names.

    The tree is read from `timetreefile`, every leaf is given a ``taxid``
    feature looked up from its name (underscores replaced by spaces), the
    tree is annotated and renamed, and the result is printed to stdout.

    Args:
        timetreefile: newick input handed to ``PhyloTree`` (format 1,
            quoted node names).
        to_table: when true, print one ``oldname<TAB>sci_name`` row per
            internal node instead of the newick string.
        ete3_algo: when true, annotate with ``NCBITaxa.annotate_tree``;
            otherwise use ``myannotate``.
        uniq: forwarded unchanged to ``matchrename_ncbitax``.

    Leaves whose name is missing from the NCBI name translation are logged
    with a warning and deleted from the tree.
    """
    logger.info('Loading data')
    # NOTE: the `quoted_node_names` argument only exists from ete3 v3.1.1.
    timetree = PhyloTree(timetreefile, format=1, quoted_node_names=True)
    ncbi = NCBITaxa()

    spaced_names = [name.replace('_', ' ')
                    for name in timetree.get_leaf_names()]
    name2taxid = ncbi.get_name_translator(spaced_names)

    for leaf in timetree.get_leaves():
        try:
            taxid = name2taxid[leaf.name.replace('_', ' ')][0]
            leaf.add_feature('taxid', taxid)
        except KeyError:
            logger.warning('Species %r not found', leaf.name)
            leaf.delete(prevent_nondicotomic=True, preserve_branch_length=True)

    logger.info('Placing common ancestors')
    if not ete3_algo:
        myannotate(timetree, ncbi)
    else:
        ncbi.annotate_tree(timetree, 'taxid')
    matchrename_ncbitax(timetree, uniq)

    if to_table:
        for node in timetree.traverse():
            if node.is_leaf():
                continue
            # `oldname` is presumably set by matchrename_ncbitax -- confirm.
            print(node.oldname + '\t' + getattr(node, 'sci_name', ''))
    else:
        print(timetree.write(format=1, format_root_node=True))
예제 #2
0
def ultrametricer(node_order, tree_file):
    """Rewrite the branch lengths of a tree from a ranking of its nodes.

    Each name in `node_order` is given the depth ``position + 1``, every
    leaf is given the depth ``len(leaves)`` and the unnamed root the depth
    0.  Every branch length is then set to the depth of the node minus the
    depth of its parent.

    Args:
        node_order: iterable of internal node names, in the order that
            defines their depth (first name gets depth 1).
        tree_file: path to a file whose first line is a newick tree
            (parsed with ``format=1``).

    Returns:
        The modified tree as a newick string (``format=1``).

    Raises:
        KeyError: if a node of the tree has a name that is neither a leaf
            name, an entry of `node_order`, nor the empty string.
    """
    with open(tree_file) as f:
        # next(f) works on Python 2 and 3; f.next() only exists on Python 2.
        mytree = PhyloTree(next(f).strip(), format=1)

    leaves = mytree.get_leaves()
    leaf_depth = len(leaves)

    # Expected depth of every node, keyed by node name.
    distances = {name: rank for rank, name in enumerate(node_order, 1)}
    for leaf in leaves:
        distances[leaf.name] = leaf_depth
    # The root has no name.
    distances[""] = 0

    # Walk from every leaf up to the root, resetting each branch length.
    for node in leaves:
        while node.up:
            node.dist = distances[node.name] - distances[node.up.name]
            node = node.up

    return mytree.write(format=1)
# Load the two input trees from hard-coded local paths; `tphy` is later
# reconciled against `tsps` (presumably gene tree vs. species tree -- confirm).
tphy = PhyloTree("/home/xavi/Documents/scripts/ete-proves/cyps.newick")
tsps = PhyloTree(
    "/home/xavi/Documents/scripts/ete-proves/cyps_sps_22mosquits.newick")


# In[]:
# assign species names to tree
def get_species_name(node_name_string):
    """Return the species code of a leaf name.

    The code is the text before the first underscore; a name without any
    underscore is returned unchanged.
    """
    species_code, _, _ = node_name_string.partition("_")
    return species_code


# Derive each leaf's species from its name, then list the result.
tphy.set_species_naming_function(get_species_name)
for n in tphy.get_leaves():
    print("node:", n.name, "Species name:", n.species)

# In[]:
# find evolutionary events using tree reconciliation
# (reconcile() returns a tree and a list of events, unpacked here)
tree_rec, evev_rec = tphy.reconcile(tsps)

# In[]:
# Print the reconciled tree and call its show() method.
print(tree_rec)
tree_rec.show()

# In[]:
# find evolutionary events using species overlap
evev = tphy.get_descendant_evol_events()

for ev in evev:
예제 #4
0
        seq2sp_dict[seq]=sp


def get_species_name(node_name_string):
    """Return the species stored in ``seq2sp_dict`` for a sequence name.

    A name with no entry raises ``KeyError``, assuming ``seq2sp_dict`` is a
    plain dict.
    """
    return seq2sp_dict[node_name_string]
    
def put_species_name(node_name_string):
    """Return the node name unchanged, so the name itself is the species."""
    return node_name_string
    

# read the gene tree and the species tree
# (gene-tree species come from seq2sp_dict; species-tree leaves are named
# after their species)
genetree = PhyloTree(GeneTreeFilename, sp_naming_function=get_species_name)
sptree = PhyloTree(SpeciesTreeFilename, sp_naming_function=put_species_name)

# Log the name -> species assignment of every leaf of both trees.
logger.debug("Genetree")
for n in genetree.get_leaves():
    logger.debug("node: %s Species name: %s", n.name, n.species)

logger.debug("SpeciesTree")
for n in sptree.get_leaves():
    logger.debug("node: %s Species name: %s", n.name, n.species)

# Number every species-tree node in traversal order (attribute `S`).
# Internal nodes are renamed to their number; leaves keep their name and
# are recorded in sp_dict as name -> number.
# After the loop `iS` holds the total number of nodes.
iS = 0
sp_dict = {}
for n in sptree.traverse():
    n.S=iS
    iS+=1
    if not n.is_leaf():
        n.name = n.S
    else:
        sp_dict[n.name] = n.S
예제 #5
0
# libraries
from ete3 import PhyloTree

# read tree from file
phy = PhyloTree("adar_hol.01.iqt.contree.newick")

# assign species names to tree
# (species code = text before the first underscore of the node name)
phy.set_species_naming_function(lambda node: node.name.split("_")[0])
for n in phy.get_leaves():
    print("node:", n.name, "Species name:", n.species)

# root tree at its midpoint outgroup
phy_outgroup = phy.get_midpoint_outgroup()
phy.set_outgroup(phy_outgroup)

# find evolutionary events
evev = phy.get_descendant_evol_events(sos_thr=0.9)

# print the orthologs of every event typed "S"
for ev in evev:
    if ev.etype == "S":
        print(ev.orthologs)

# find evolutionary events
# NOTE(review): same call and arguments as above; the recomputation looks
# redundant -- confirm before removing.
evev = phy.get_descendant_evol_events(sos_thr=0.9)

# all events: type, then the sequences on each side of the event
for ev in evev:
    print(ev.etype, ','.join(ev.in_seqs), "<====>", ','.join(ev.out_seqs))

# all events involving either Hsap or Drer
fseqs = lambda slist: [
예제 #6
0
t = PhyloTree("(((Hsa_001,Ptr_001),(Cfa_001,Mms_001)),(Dme_001,Dme_002));")
#                              /-Hsa_001
#                    /--------|
#                   |          \-Ptr_001
#          /--------|
#         |         |          /-Cfa_001
#         |          \--------|
#---------|                    \-Mms_001
#         |
#         |          /-Dme_001
#          \--------|
#                    \-Dme_002
#
# Prints current leaf names and species codes
# print() with one pre-formatted string gives the same output on Python 2
# and Python 3 (the former print statements are a SyntaxError on Python 3).
print("Deafult mode:")
for n in t.get_leaves():
    print("node: %s Species name: %s" % (n.name, n.species))
# node: Dme_001 Species name: Dme
# node: Dme_002 Species name: Dme
# node: Hsa_001 Species name: Hsa
# node: Ptr_001 Species name: Ptr
# node: Cfa_001 Species name: Cfa
# node: Mms_001 Species name: Mms
#
# We can also use our own leaf name parsing function to obtain species
# names. All we need to do is create a python function that takes
# node's name as argument and return its corresponding species name.
def get_species_name(node_name_string):
    # Species code is the first part of leaf name (separated by an
    #  underscore character)
    spcode = node_name_string.split("_")[0]
예제 #7
0
def process_tree(treepath):
    """Extract orthology relationships between the target taxid and other species.

    The tree at `treepath` is loaded, optionally collapsed on
    target-specific clades, rooted at its midpoint, and every event typed
    "S" that involves the target taxid is reported, grouped by orthology
    type and species.

    Relies on module-level names: ``get_species``, ``conversion``,
    ``names``, ``target_taxid`` and ``collapse``.

    Args:
        treepath: path of the newick tree file; leaf names must start with
            ``<taxid>.``.

    Returns:
        A list of tab-separated lines
        ``co_orthologs, type, taxid, names[taxid], orthologs, "\\n"``,
        or ``None`` when the tree has a single leaf and cannot be rooted.
    """
    treepath = str(treepath).rstrip()
    t = PhyloTree(treepath, sp_naming_function=get_species)
    target = str(target_taxid)

    # Tag every leaf with its taxid (as str) and a sanitised display name.
    for leaf in t:
        # int() validates that the name prefix really is a numeric taxid.
        leaf.taxid = str(int(leaf.name.split(".", 1)[0]))
        try:
            good_name = "%s" % (conversion[leaf.name][0])
        except Exception:
            # Best effort: fall back to the raw name when no conversion
            # entry is available.
            good_name = leaf.name
        # Replace characters that would break newick / tabular output.
        leaf.good_name = re.sub(r"[ |\t,:)(;\n\]\[]+", "_", good_name)

    t.dist = 0

    # Collapse clades that only contain the target species.
    node2content = t.get_cached_content()
    target_species = set([target_taxid])

    def is_sp_specific(_node):
        _species = set(_leaf.species for _leaf in node2content[_node])
        return not (_species - target_species)

    if collapse == 'yes':
        for n in t.get_leaves(is_leaf_fn=is_sp_specific):
            if n.children:
                for ch in n.get_children():
                    ch.detach()
                members = list(node2content[n])
                # Stored as str, like the taxid of ordinary leaves, so the
                # comparisons against str(target_taxid) below also match
                # collapsed nodes.
                n.taxid = target
                n.name = '|'.join([_lf.name for _lf in members])
                n.good_name = "{%s}" % ('|'.join(
                    [_lf.good_name for _lf in members]))

    # Root the tree at its midpoint.
    outgroup = t.get_midpoint_outgroup()
    try:
        t.set_outgroup(outgroup)
    except Exception:
        # A single-leaf tree cannot be rooted and has nothing to report.
        if len(t) == 1:
            return
        raise

    # The topology changed, so the leaf cache must be rebuilt.
    node2content = t.get_cached_content()

    event_lines = []
    for ev in t.get_descendant_evol_events():
        if ev.etype != "S":
            continue

        source_seqs = node2content[ev.node.children[0]]
        ortho_seqs = node2content[ev.node.children[1]]

        # Make `source_seqs` the side that holds the target species; skip
        # events that do not involve it at all.
        if target not in set(leaf.taxid for leaf in source_seqs):
            if target not in set(leaf.taxid for leaf in ortho_seqs):
                continue
            source_seqs, ortho_seqs = ortho_seqs, source_seqs

        # Target-species sequences on the source side.
        co_orthologs = sorted(
            leaf.good_name for leaf in source_seqs if leaf.taxid == target)

        # Sequences on the other side, grouped by species taxid.
        orthologs = defaultdict(set)
        for leaf in ortho_seqs:
            orthologs[int(leaf.taxid)].add(leaf.good_name)

        _otype = "one-to-" if len(co_orthologs) == 1 else "many-to-"

        # .items() works on Python 2 and 3 (iteritems() is Python 2 only).
        for sp, orth in orthologs.items():
            otype = _otype + ("one" if len(orth) == 1 else "many")
            event_lines.append('\t'.join([
                ','.join(co_orthologs), otype,
                str(sp), names[sp], ','.join(sorted(orth)), '\n'
            ]))
    return event_lines
예제 #8
0
	        return pickle.load(f)

	genedict = load_obj('genedict')
	speciescolors = load_obj('colors')
	red = Color('red')
	blue = Color('blue')
	colorvec = list(red.range_to(blue, len(genedict)))
	colormap = {}
	columnmap = {}
	for i,fasta in enumerate(genedict):
		columnmap[fasta] = i
		colormap[fasta] = colorvec[i].hex
	annotated = [] 
	print speciescolors
	for fasta in genedict:
		for leaf in t.get_leaves():

			nst = NodeStyle()
			nst["size"] = 0
			nst["fgcolor"] = 'black'
			nst["hz_line_width"] = 2
			nst["vt_line_width"]= 2
			nst.show_name = True
			if leaf.name.split('/')[0] in genedict[fasta]:
				if 'HH' not in fasta and 'LOMETS' not in fasta: 
					leaf.add_face( RectFace ( 10 , 10 , colormap[fasta], colormap[fasta] ), column = columnmap[fasta] )
					if leaf not in annotated:
						try:
							face = leaf.add_face( TextFace ( text = genedict[fasta][leaf.name.split('/')[0]][2]) , column = 10  )
							annotated.append(leaf)
						except:
def process_tree(treepath):
    """Extract orthology relationships from the tree at `treepath`.

    The tree is rooted at its midpoint, optionally collapsed on
    target-specific clades, and every event typed "S" is reported, grouped
    by orthology type and species.  When ``target_taxid`` is set, only
    events involving it are kept and it is always placed on the source
    side.

    Relies on module-level names: ``get_species``, ``args`` (``conv_table``,
    ``pairs_table``), ``conversion``, ``ncbi``, ``target_taxid`` and
    ``collapse``.

    Args:
        treepath: path of the newick tree file; the taxid of a leaf is the
            part of its name before the first ".".

    Returns:
        ``event_lines`` (list of tab-separated lines), or the tuple
        ``(event_lines, pairs)`` when ``args.pairs_table`` is set, where
        ``pairs`` lists every (source, ortholog) name pair.  Empty results
        are returned when the tree cannot be rooted.
    """
    want_pairs = args.pairs_table
    use_conv = args.conv_table

    treepath = str(treepath).rstrip()
    t = PhyloTree(treepath, sp_naming_function=get_species)
    treefile = os.path.basename(treepath)
    t.dist = 0

    outgroup = t.get_midpoint_outgroup()
    try:
        t.set_outgroup(outgroup)
        t.standardize()
    except Exception:
        if len(t) == 1:
            sys.stderr.write(treefile + 'len(t) == 1' + '\n')
            return ([], []) if want_pairs else []
        sys.stderr.write(treefile + 'len(t) != 1' + '\n')
        if not want_pairs:
            return []
        # Pairs mode keeps going: fall back to rooting on the first leaf.
        t.set_outgroup(t.get_leaf_names()[0])

    # Tag every leaf with its taxid and look up the scientific name.
    names = {}
    for leaf in t:
        sp = str(leaf.name.split('.')[0])
        leaf.taxid = sp
        try:
            names[sp] = ncbi.get_taxid_translator([sp])[int(sp)]
        except Exception:
            # Best effort: unknown or non-numeric taxids get an empty name.
            names[sp] = ''

        if use_conv:
            try:
                leaf.good_name = "%s" % (conversion[leaf.name][0])
            except Exception:
                leaf.good_name = leaf.name

    def _label(leaf):
        # Name used in the output: converted name when a table was given.
        return leaf.good_name if use_conv else leaf.name

    node2content = t.get_cached_content()
    target_species = set([target_taxid])

    def is_sp_specific(_node):
        _species = set(_leaf.species for _leaf in node2content[_node])
        return not (_species - target_species)

    # Collapse clades that only contain the target taxid.
    if collapse == 'yes':
        for n in t.get_leaves(is_leaf_fn=is_sp_specific):
            if n.children:
                for ch in n.get_children():
                    ch.detach()
                members = list(node2content[n])
                # Stored as str, like the taxid of ordinary leaves, so the
                # comparisons against str(target_taxid) below also match
                # collapsed nodes.
                n.taxid = str(target_taxid)
                n.name = "{%s}" % ('|'.join([_lf.name for _lf in members]))
                if use_conv:
                    n.good_name = "{%s}" % ('|'.join(
                        [_lf.good_name for _lf in members]))

    all_ortholgs_pairs = []
    event_lines = []

    for ev in t.get_descendant_evol_events():
        if ev.etype != "S":
            continue

        # Both sides are tree nodes; iterating them yields their leaves.
        source_seqs = ev.node.children[0]
        ortho_seqs = ev.node.children[1]

        if target_taxid:
            # Make `source_seqs` the side that holds the target taxid; skip
            # events that do not involve it at all.
            target = str(target_taxid)
            if target not in set(leaf.taxid for leaf in source_seqs):
                if target not in set(leaf.taxid for leaf in ortho_seqs):
                    continue
                source_seqs, ortho_seqs = ortho_seqs, source_seqs

        co_orthologs = sorted(_label(leaf) for leaf in source_seqs)

        # Sequences on the other side, grouped by taxid.
        orthologs = defaultdict(set)
        for leaf in ortho_seqs:
            orthologs[str(leaf.name.split('.')[0])].add(_label(leaf))

        _otype = "one-to-" if len(source_seqs) == 1 else "many-to-"

        for sp, orth in orthologs.items():
            otype = _otype + ("one" if len(orth) == 1 else "many")
            event_lines.append('\t'.join([
                ','.join(co_orthologs), otype,
                str(sp), ','.join(sorted(orth)), treefile, names[sp], '\n'
            ]))

        if want_pairs:
            # Same two-level iteration as before (leaves, then each leaf
            # iterated as a node), so the order of the pairs is unchanged.
            source_seqs_names = [
                _label(leaf) for node in source_seqs for leaf in node]
            ortho_seqs_names = [
                _label(leaf) for node in ortho_seqs for leaf in node]
            # Add this event's pairs once, instead of re-walking every
            # previously stored (already exhausted) product iterator.
            all_ortholgs_pairs.extend(
                itertools.product(source_seqs_names, ortho_seqs_names))

    if want_pairs:
        return (event_lines, all_ortholgs_pairs)
    return event_lines
예제 #10
0
t = PhyloTree("(((Hsa_001,Ptr_001),(Cfa_001,Mms_001)),(Dme_001,Dme_002));")
#                              /-Hsa_001
#                    /--------|
#                   |          \-Ptr_001
#          /--------|
#         |         |          /-Cfa_001
#         |          \--------|
# ---------|                    \-Mms_001
#         |
#         |          /-Dme_001
#          \--------|
#                    \-Dme_002
#
# Prints current leaf names and species codes
# print() with one pre-formatted string gives the same output on Python 2
# and Python 3 (the former print statements are a SyntaxError on Python 3).
print("Deafult mode:")
for n in t.get_leaves():
    print("node: %s Species name: %s" % (n.name, n.species))
# node: Dme_001 Species name: Dme
# node: Dme_002 Species name: Dme
# node: Hsa_001 Species name: Hsa
# node: Ptr_001 Species name: Ptr
# node: Cfa_001 Species name: Cfa
# node: Mms_001 Species name: Mms
#
# We can also use our own leaf name parsing function to obtain species
# names. All we need to do is create a python function that takes
# node's name as argument and return its corresponding species name.
def get_species_name(node_name_string):
    # Species code is the first part of leaf name (separated by an
    #  underscore character)
    spcode = node_name_string.split("_")[0]