예제 #1
0
def phylogenetic_tree(extant_taxa, branch_lengths=(.01, .5)):
    """Build a randomly populated ``PhyloTree`` whose nodes have numeric names.

    Leaf names are offered to ``populate`` as the strings ``"0"`` ..
    ``str(extant_taxa - 1)``.  Every non-leaf node is then renamed, in
    preorder, with a counter that starts at ``2 * extant_taxa - 2`` and
    decreases by one per internal node.

    Args:
        extant_taxa: Size passed to ``populate``; also drives both numbering
            schemes.
        branch_lengths: Forwarded to ``populate`` as ``branch_range``.

    Returns:
        The populated tree, with its own ``dist`` set to 0.
    """
    leaf_labels = list(map(str, range(extant_taxa)))

    random_tree = PhyloTree(format=1)
    random_tree.populate(
        extant_taxa,
        names_library=leaf_labels,
        branch_range=branch_lengths,
        random_branches=True,
    )
    random_tree.dist = 0

    # Lazily pick out the internal nodes in preorder; renaming a node does
    # not affect whether it is a leaf, so labelling while walking is safe.
    top_label = 2 * extant_taxa - 2
    inner_nodes = (
        candidate for candidate in random_tree.traverse('preorder')
        if not candidate.is_leaf()
    )
    for offset, inner in enumerate(inner_nodes):
        inner.name = str(top_label - offset)

    return random_tree
예제 #2
0
def process_tree(treepath):
    """Extract orthology lines between the target taxid and other species.

    The tree at ``treepath`` is loaded, each leaf is annotated with ``taxid``
    and ``good_name``, clades made only of the target species are optionally
    collapsed, the tree is rooted on its midpoint outgroup, and one line is
    produced per species found opposite the target at each "S" event.

    Depends on names defined outside this function: ``PhyloTree``,
    ``get_species``, ``conversion``, ``names``, ``target_taxid`` and
    ``collapse``.

    Args:
        treepath: Location of the tree file; converted with ``str()`` and
            stripped of trailing whitespace before loading.

    Returns:
        A list of strings, each made of tab-joined fields (target sequence
        names, orthology type, species taxid, ``names[taxid]``, ortholog
        names) terminated by a tab and a newline.  ``None`` is returned when
        rooting fails on a tree holding a single leaf.

    Raises:
        ValueError: If the text before the first "." of a leaf name is not an
            integer.
        Exception: Whatever ``set_outgroup`` raised, when the tree holds more
            than one leaf.
    """
    treepath = str(treepath).rstrip()
    t = PhyloTree(treepath, sp_naming_function=get_species)

    # Taxids are stored on nodes as strings, so compare against the string
    # form of the target everywhere.
    target_taxid_str = str(target_taxid)

    for leaf in t:
        # Leaf names are expected to start with "<integer taxid>.".
        leaf.taxid = str(int(leaf.name.split(".", 1)[0]))

        # Best effort: fall back to the raw leaf name when no converted name
        # can be obtained.
        try:
            good_name = "%s" % (conversion[leaf.name][0])
        except Exception:
            good_name = leaf.name

        # Raw string: "\]" and "\[" are not valid escapes in a normal literal.
        leaf.good_name = re.sub(r"[ |\t,:)(;\n\]\[]+", "_", good_name)

    t.dist = 0

    # Content cache taken BEFORE collapsing, so a collapsed node can still
    # list the leaves it used to contain.
    node2content = t.get_cached_content()
    target_species = set([target_taxid])

    def is_sp_specific(_node):
        """Return True when no leaf under ``_node`` is outside the target."""
        _species = set(_leaf.species for _leaf in node2content[_node])
        return not (_species - target_species)

    if collapse == 'yes':
        for n in t.get_leaves(is_leaf_fn=is_sp_specific):
            if n.children:
                for ch in n.get_children():
                    ch.detach()
                # Keep the same string representation used for real leaves;
                # otherwise the equality checks below miss collapsed nodes
                # whenever target_taxid is not already a str.
                n.taxid = target_taxid_str
                n.name = '|'.join(
                    [_lf.name for _lf in node2content[n]])
                n.good_name = "{%s}" % ('|'.join(
                    [_lf.good_name for _lf in node2content[n]]))

    outgroup = t.get_midpoint_outgroup()
    try:
        t.set_outgroup(outgroup)
    except Exception:
        # A single-leaf tree cannot be rooted: nothing to report.
        if len(t) == 1:
            return None
        raise

    # Rooting (and collapsing) changed the topology: rebuild the cache.
    node2content = t.get_cached_content()

    event_lines = []
    for ev in t.get_descendant_evol_events():
        if ev.etype != "S":
            continue

        source_seqs = node2content[ev.node.children[0]]
        ortho_seqs = node2content[ev.node.children[1]]

        # Make sure the side holding the target species is ``source_seqs``;
        # skip events where neither side holds it.
        if target_taxid_str not in set(lf.taxid for lf in source_seqs):
            if target_taxid_str not in set(lf.taxid for lf in ortho_seqs):
                continue
            source_seqs, ortho_seqs = ortho_seqs, source_seqs

        # Target-species sequences on the source side.
        co_orthologs = sorted(
            lf.good_name for lf in source_seqs
            if lf.taxid == target_taxid_str
        )

        # Names on the opposite side, grouped by integer species taxid.
        orthologs = defaultdict(set)
        for lf in ortho_seqs:
            orthologs[int(lf.taxid)].add(lf.good_name)

        _otype = "one-to-" if len(co_orthologs) == 1 else "many-to-"
        seed_field = ','.join(co_orthologs)

        # ``items()`` works on both Python 2 and 3 (``iteritems`` does not).
        for sp, orth in orthologs.items():
            otype = _otype + ("one" if len(orth) == 1 else "many")
            event_lines.append('\t'.join([
                seed_field, otype,
                str(sp), names[sp], ','.join(sorted(orth)), '\n'
            ]))
    return event_lines
def process_tree(treepath):
    """Extract orthology relationships from the tree stored at ``treepath``.

    The tree is loaded and rooted on its midpoint outgroup, leaves are
    annotated with ``taxid`` (and ``good_name`` when ``args.conv_table`` is
    set), clades made only of the target species are optionally collapsed,
    and every "S" event is turned into tab-separated lines and, when
    ``args.pairs_table`` is set, into pairs of names.

    Depends on names defined outside this function: ``PhyloTree``,
    ``get_species``, ``args``, ``ncbi``, ``conversion``, ``target_taxid`` and
    ``collapse``.

    Args:
        treepath: Location of the tree file; converted with ``str()`` and
            stripped of trailing whitespace before loading.

    Returns:
        With ``args.pairs_table`` set: ``(event_lines, pairs)`` where
        ``pairs`` holds every (source name, ortholog name) tuple; ``([], [])``
        if rooting failed on a single-leaf tree.
        Otherwise: ``event_lines`` alone, or ``[]`` if rooting failed.
        Each event line is tab-joined (source names, orthology type, species
        prefix, ortholog names, tree file name, scientific name) and ends
        with a tab and a newline.
    """
    treepath = str(treepath).rstrip()
    t = PhyloTree(treepath, sp_naming_function=get_species)
    treefile = os.path.basename(treepath)
    t.dist = 0

    outgroup = t.get_midpoint_outgroup()
    try:
        t.set_outgroup(outgroup)
        t.standardize()
    except Exception:
        single_leaf = len(t) == 1
        if single_leaf:
            sys.stderr.write(treefile + 'len(t) == 1' + '\n')
        else:
            sys.stderr.write(treefile + 'len(t) != 1' + '\n')

        if not args.pairs_table:
            return []
        if single_leaf:
            return ([], [])
        # Pairs mode keeps going: root on the first leaf name instead.
        t.set_outgroup(t.get_leaf_names()[0])

    use_conversion = args.conv_table

    def shown_name(_leaf):
        """Return the converted name when a conversion table is in use."""
        return _leaf.good_name if use_conversion else _leaf.name

    # Map each species prefix to its scientific name ('' when lookup fails).
    names = {}
    for leaf in t:
        # Parsed outside the ``try`` so the fallback below always refers to
        # this leaf's prefix, never to one left over from another leaf.
        sp = str(leaf.name.split('.')[0])
        leaf.taxid = sp
        try:
            names[sp] = ncbi.get_taxid_translator([sp])[int(sp)]
        except Exception:
            names[sp] = ''

        if use_conversion:
            try:
                good_name = "%s" % (conversion[leaf.name][0])
            except Exception:
                good_name = leaf.name
            leaf.good_name = good_name

    # Content cache taken BEFORE collapsing, so a collapsed node can still
    # list the leaves it used to contain.
    node2content = t.get_cached_content()
    target_species = set([target_taxid])
    # Taxids are stored on nodes as strings; compare in that form.
    target_taxid_str = str(target_taxid)

    def is_sp_specific(_node):
        """Return True when no leaf under ``_node`` is outside the target."""
        _species = set(_leaf.species for _leaf in node2content[_node])
        return not (_species - target_species)

    if collapse == 'yes':
        for n in t.get_leaves(is_leaf_fn=is_sp_specific):
            if n.children:
                for ch in n.get_children():
                    ch.detach()
                # Same string form as real leaves, so the membership test
                # against str(target_taxid) below also sees collapsed nodes.
                n.taxid = target_taxid_str
                n.name = "{%s}" % ('|'.join(
                    [_lf.name for _lf in node2content[n]]))
                if use_conversion:
                    n.good_name = "{%s}" % ('|'.join(
                        [_lf.good_name for _lf in node2content[n]]))

    want_pairs = args.pairs_table
    all_ortholgs_pairs = []
    event_lines = []

    for ev in t.get_descendant_evol_events():
        if ev.etype != "S":
            continue

        # These are nodes: iterating one yields its leaves and ``len`` is
        # its leaf count (ETE behaviour the original code already relied on).
        source_seqs = ev.node.children[0]
        ortho_seqs = ev.node.children[1]

        if target_taxid:
            # Put the side holding the target species in ``source_seqs``;
            # skip events where neither side holds it.
            if target_taxid_str not in set(lf.taxid for lf in source_seqs):
                if target_taxid_str not in set(lf.taxid for lf in ortho_seqs):
                    continue
                source_seqs, ortho_seqs = ortho_seqs, source_seqs

        source_names = [shown_name(lf) for lf in source_seqs]
        co_orthologs = sorted(source_names)

        # Names on the opposite side, grouped by species prefix.
        orthologs = defaultdict(set)
        for lf in ortho_seqs:
            orthologs[str(lf.name.split('.')[0])].add(shown_name(lf))

        _otype = "one-to-" if len(source_seqs) == 1 else "many-to-"
        seed_field = ','.join(co_orthologs)

        for sp, orth in orthologs.items():
            otype = _otype + ("one" if len(orth) == 1 else "many")
            event_lines.append('\t'.join([
                seed_field, otype,
                str(sp), ','.join(sorted(orth)), treefile, names[sp], '\n'
            ]))

        if want_pairs:
            ortho_names = [shown_name(lf) for lf in ortho_seqs]
            # Add this event's pairs directly instead of re-scanning a list
            # of already exhausted product iterators on every event.
            all_ortholgs_pairs.extend(
                itertools.product(source_names, ortho_names))

    if want_pairs:
        return (event_lines, all_ortholgs_pairs)
    return event_lines
예제 #4
0
    # NOTE(review): the enclosing ``def`` is not visible in this excerpt;
    # the statements below are the body of some demo routine.

    # Three-taxon tree, linked to an inline alignment (the text looks like
    # FASTA: ">name" header lines followed by a sequence line).
    tree = PhyloTree('(Orangutan,Human,Chimp);')
    tree.link_to_alignment("""
                           >Chimp
                           HARWLNEKLRCELRTLKKLGLDGYKAVSQYVKGRA
                           >Orangutan
                           DARWINEKLRCVSRTLKKLGLDGYKGVSQYVKGRP
                           >Human
                           DARWHNVKLRCELRTLKKLGLVGFKAVSQFVIRRA
                           """)
    # Nucleotide sequence for each taxon, keyed by the same names used in
    # the tree string above.
    nt_sequences = {"Human"    : "GACGCACGGTGGCACAACGTAAAATTAAGATGTGAATTGAGAACTCTGAAAAAATTGGGACTGGTCGGCTTCAAGGCAGTAAGTCAATTCGTAATACGTCGTGCG",
                    "Chimp"    : "CACGCCCGATGGCTCAACGAAAAGTTAAGATGCGAATTGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGCAGTAAGTCAGTACGTTAAAGGTCGTGCG",
                    "Orangutan": "GATGCACGCTGGATCAACGAAAAGTTAAGATGCGTATCGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGGAGTAAGTCAATACGTTAAAGGTCGTCCG"
                }
    # ``tree & l`` presumably looks up the node named ``l`` (ETE's search
    # operator -- TODO confirm); the sequence is stored on it as
    # ``nt_sequence``.
    for l in nt_sequences:
        (tree & l).nt_sequence = nt_sequences[l]
    # Zero the ``dist`` attribute of the tree object itself.
    tree.dist = 0
    # Titled style using the ``test_layout_evol`` layout, then display.
    ts = TreeStyle()
    ts.title.add_face(TextFace("Example for nucleotides...", fsize=15), column=0)
    ts.layout_fn = test_layout_evol
    tree.show(tree_style=ts)

    # Show very large alignments: same topology, but each taxon gets a
    # sequence of 5000 symbols drawn from the keys of ``_aabgcolors``.
    # NOTE(review): ``xrange`` and indexing into ``.keys()`` (if
    # ``_aabgcolors`` is a dict) only work on Python 2.
    tree = PhyloTree('(Orangutan,Human,Chimp);')
    tree.link_to_alignment(">Human\n"       + ''.join([_aabgcolors.keys()[int(random() * len (_aabgcolors))] for _ in xrange (5000)]) + \
                           "\n>Chimp\n"     + ''.join([_aabgcolors.keys()[int(random() * len (_aabgcolors))] for _ in xrange (5000)]) + \
                           "\n>Orangutan\n" + ''.join([_aabgcolors.keys()[int(random() * len (_aabgcolors))] for _ in xrange (5000)]))
    tree.dist = 0
    # Titled style using the ``test_layout_phylo_aa`` layout; no ``show``
    # call is visible within this excerpt.
    ts = TreeStyle()
    ts.title.add_face(TextFace("better not set interactivity if alg is very large", fsize=15), column=0)
    ts.layout_fn = test_layout_phylo_aa
예제 #5
0
                           >Orangutan
                           DARWINEKLRCVSRTLKKLGLDGYKGVSQYVKGRP
                           >Human
                           DARWHNVKLRCELRTLKKLGLVGFKAVSQFVIRRA
                           """)
    nt_sequences = {
        "Human":
        "GACGCACGGTGGCACAACGTAAAATTAAGATGTGAATTGAGAACTCTGAAAAAATTGGGACTGGTCGGCTTCAAGGCAGTAAGTCAATTCGTAATACGTCGTGCG",
        "Chimp":
        "CACGCCCGATGGCTCAACGAAAAGTTAAGATGCGAATTGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGCAGTAAGTCAGTACGTTAAAGGTCGTGCG",
        "Orangutan":
        "GATGCACGCTGGATCAACGAAAAGTTAAGATGCGTATCGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGGAGTAAGTCAATACGTTAAAGGTCGTCCG"
    }
    for l in nt_sequences:
        (tree & l).nt_sequence = nt_sequences[l]
    tree.dist = 0
    ts = TreeStyle()
    ts.title.add_face(TextFace("Example for nucleotides...", fsize=15),
                      column=0)
    ts.layout_fn = test_layout_evol
    tree.show(tree_style=ts)

    # Show very large algs
    tree = PhyloTree('(Orangutan,Human,Chimp);')
    tree.link_to_alignment(">Human\n"       + ''.join([_aabgcolors.keys()[int(random() * len (_aabgcolors))] for _ in xrange (5000)]) + \
                           "\n>Chimp\n"     + ''.join([_aabgcolors.keys()[int(random() * len (_aabgcolors))] for _ in xrange (5000)]) + \
                           "\n>Orangutan\n" + ''.join([_aabgcolors.keys()[int(random() * len (_aabgcolors))] for _ in xrange (5000)]))
    tree.dist = 0
    ts = TreeStyle()
    ts.title.add_face(TextFace(
        "better not set interactivity if alg is very large", fsize=15),