예제 #1
0
def name_ancestors(timetreefile, to_table=False, ete3_algo=False, uniq=True):
    """Annotate the internal nodes of a time tree with NCBI taxonomy names.

    The tree is loaded from `timetreefile`, each leaf name (underscores
    replaced by spaces) is translated to an NCBI taxid, and leaves whose
    name has no translation are removed from the tree. Ancestors are then
    annotated either with ete3's `annotate_tree` or with `myannotate`, and
    `matchrename_ncbitax` is applied to the result.

    Args:
        timetreefile: Newick source handed to `PhyloTree` (format 1).
        to_table: When true, print one "oldname<TAB>sci_name" line per
            internal node instead of the annotated newick tree.
        ete3_algo: When true, use `ncbi.annotate_tree`; otherwise
            `myannotate`.
        uniq: Forwarded unchanged to `matchrename_ncbitax`.
    """
    logger.info('Loading data')
    # NOTE: the quoted_node_names argument only exists from ete3 v3.1.1 on.
    timetree = PhyloTree(timetreefile, format=1, quoted_node_names=True)
    ncbi = NCBITaxa()

    spaced_names = [name.replace('_', ' ')
                    for name in timetree.get_leaf_names()]
    name2taxid = ncbi.get_name_translator(spaced_names)

    for leaf in timetree.get_leaves():
        try:
            taxids = name2taxid[leaf.name.replace('_', ' ')]
        except KeyError:
            # Unknown species: drop the leaf but keep the tree bifurcating
            # and the remaining branch lengths intact.
            logger.warning('Species %r not found', leaf.name)
            leaf.delete(prevent_nondicotomic=True, preserve_branch_length=True)
            continue
        # Only the first taxid returned for the name is used.
        leaf.add_feature('taxid', taxids[0])

    logger.info('Placing common ancestors')
    if ete3_algo:
        ncbi.annotate_tree(timetree, 'taxid')
    else:
        myannotate(timetree, ncbi)
    matchrename_ncbitax(timetree, uniq)

    if to_table:
        # NOTE(review): `oldname` is not set in this function; presumably
        # matchrename_ncbitax stores it on each node -- confirm.
        for node in timetree.traverse():
            if node.is_leaf():
                continue
            print(node.oldname + '\t' + getattr(node, 'sci_name', ''))
    else:
        print(timetree.write(format=1, format_root_node=True))
예제 #2
0
def main():
    """Count gene copies per species for every gene tree in a file.

    Command line:
        --genetree      File with one gene tree (NHX/newick) per line.
        --speciesorder  Comma-separated species list; it fixes the column
                        order of the output and the priority used to pick
                        the reference species of each tree.

    For each tree the leaves (required to be named ``gene_species``) are
    counted per species. The row's ``gene`` column holds the gene of the
    first leaf belonging to the first species of --speciesorder that occurs
    in the tree. The resulting rows are handed to `printTSV`.

    Raises:
        Exception: if a leaf name does not split into exactly two parts on
            "_", or if none of the requested species occurs in a tree.
    """
    parser = argparse.ArgumentParser(description='Gene Copy Number Finder')
    parser.add_argument('--genetree',
                        required=True,
                        help='GeneTree in nhx format')
    parser.add_argument('--speciesorder',
                        required=True,
                        help='Comma-separated species list')
    args = parser.parse_args()

    species_list = [name.strip() for name in args.speciesorder.split(",")]
    table = []

    with open(args.genetree, "r") as f:
        # The file holds several gene trees, one per line.
        for line in f:
            # A blank line (typically a trailing empty line) is not a tree;
            # skip it instead of letting the newick parser abort the run.
            if not line.strip():
                continue

            # Remove empty NHX features that can be produced by TreeBest but break ete3
            line = line.replace('[&&NHX]', '')

            genetree = PhyloTree(line)
            leaves = genetree.get_leaf_names()

            leaves_parts = [leaf.split("_") for leaf in leaves]
            for leaf, leaf_parts in zip(leaves, leaves_parts):
                if len(leaf_parts) != 2:
                    raise Exception(
                        "Leaf node '%s' is not in gene_species format" % leaf)

            species_counter = collections.Counter(
                parts[1] for parts in leaves_parts)

            # Reference species: the first element of species_list which
            # appears in a leaf node.
            ref_species = next(
                (sp for sp in species_list if sp in species_counter), None)
            if ref_species is None:
                raise Exception(
                    "None of the specified species was found in the GeneTree '%s'"
                    % line)

            # Gene of the (first) leaf node for the reference species. A match
            # always exists because ref_species was found in the counter.
            species_counter['gene'] = next(
                parts[0] for parts in leaves_parts if parts[1] == ref_species)

            table.append(species_counter)

    colList = ["gene"] + species_list
    printTSV(table, colList)
예제 #3
0
def yes_choice(tree_file_name, gene, algae_choice):
    """Interactively split one gene tree into two and update the to-do file.

    The tree is rooted at its midpoint. The user optionally defines an algae
    clade and an outlier group, then selects one monophyletic family.
    `yesMake` is called twice: first with every leaf outside the selected
    family, then with every leaf not in that first list; the algae and
    outlier leaves are added to both lists. In the to-do file named by
    ``sys.argv[2]`` the entry `gene` is replaced by the two values returned
    by `yesMake`.

    Args:
        tree_file_name: Tree file handed to `PhyloTree` and to `yesMake`.
        gene: Name of the gene being split; removed from the to-do file.
        algae_choice: Answer string; a value starting with "y" triggers the
            algae clade selection.

    NOTE(review): `yesMake` and `clade_to_tree` are defined elsewhere; they
    are assumed to return a gene name (str) and a list of leaf names
    respectively -- confirm.
    """
    t = PhyloTree(tree_file_name)
    t.set_outgroup(t.get_midpoint_outgroup())
    gene_names = t.get_leaf_names()

    # Answers are tested with a slice rather than an index so that an empty
    # answer counts as "no" instead of raising IndexError.
    if algae_choice[:1] == "y":
        print("\nFirst, let's define the algae clade.")
        algae_list = clade_to_tree(t)
    else:
        algae_list = []

    outlier_list = []
    outlier_choice = raw_input(
        "\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)"
    )
    if outlier_choice[:1] == "y":
        print(
            "\nLet's define the outlier group. \nFirst you can add any spceices that are in a monophyletic clade"
        )
        outlier_list = clade_to_tree(t)
        other_copies = raw_input(
            "If there are other genes in the outlier group, enter them here, separated by a space, or else enter n."
        )
        if other_copies.strip() != "n":
            # split() without an argument ignores repeated/leading/trailing
            # blanks, so no empty gene name ends up in the list.
            outlier_list = outlier_list + other_copies.split()

    print(
        "\nSelect one monophyletic family to define. \nYou will have a later chance to split this family again if needed."
    )
    group_list = clade_to_tree(t)

    # First list: everything outside the selected family.
    cut_list = [name for name in gene_names if name not in group_list]
    cut_list = cut_list + algae_list + outlier_list
    gene1 = yesMake(cut_list, gene, tree_file_name)

    # Second list: everything that was not part of the first list.
    cut_list1 = [name for name in gene_names if name not in cut_list]
    cut_list1 = cut_list1 + algae_list + outlier_list
    gene2 = yesMake(cut_list1, gene1, tree_file_name)

    # Replace `gene` by the two new entries in the to-do file.
    todo_path = sys.argv[2]
    with open(todo_path, "r") as f:
        todo_list = [line.rstrip() for line in f]
    todo_list = [entry for entry in todo_list if entry != gene]
    todo_list.extend([gene1, gene2])
    with open(todo_path, "w") as todo:
        for entry in todo_list:
            todo.write(entry + "\n")
예제 #4
0
def yes_choice(tree_file_name, gene, algae_choice):
	"""Interactively split one gene tree into two and update the to-do file.

	The tree is rooted at its midpoint. The user optionally defines an algae
	clade and an outlier group, then selects one monophyletic family.
	`yesMake` is called twice: first with every leaf outside the selected
	family, then with every leaf not in that first list; the algae and
	outlier leaves are added to both lists. In the to-do file named by
	``sys.argv[2]`` the entry `gene` is replaced by the two values returned
	by `yesMake`.

	Args:
		tree_file_name: Tree file handed to `PhyloTree` and to `yesMake`.
		gene: Name of the gene being split; removed from the to-do file.
		algae_choice: Answer string; a value starting with "y" triggers the
			algae clade selection.

	NOTE(review): `yesMake` and `clade_to_tree` are defined elsewhere; they
	are assumed to return a gene name (str) and a list of leaf names
	respectively -- confirm.
	"""
	t = PhyloTree(tree_file_name)
	t.set_outgroup(t.get_midpoint_outgroup())
	gene_names = t.get_leaf_names()

	# Answers are tested with a slice rather than an index so that an empty
	# answer counts as "no" instead of raising IndexError.
	if algae_choice[:1] == "y":
		print("\nFirst, let's define the algae clade.")
		algae_list = clade_to_tree(t)
	else:
		algae_list = []

	outlier_list = []
	outlier_choice = raw_input("\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)")
	if outlier_choice[:1] == "y":
		print("\nLet's define the outlier group. \nFirst you can add any spceices that are in a monophyletic clade")
		outlier_list = clade_to_tree(t)
		other_copies = raw_input("If there are other genes in the outlier group, enter them here, separated by a space, or else enter n.")
		if other_copies.strip() != "n":
			# split() without an argument ignores repeated/leading/trailing
			# blanks, so no empty gene name ends up in the list.
			outlier_list = outlier_list + other_copies.split()

	print("\nSelect one monophyletic family to define. \nYou will have a later chance to split this family again if needed.")
	group_list = clade_to_tree(t)

	# First list: everything outside the selected family.
	cut_list = [name for name in gene_names if name not in group_list]
	cut_list = cut_list + algae_list + outlier_list
	gene1 = yesMake(cut_list, gene, tree_file_name)

	# Second list: everything that was not part of the first list.
	cut_list1 = [name for name in gene_names if name not in cut_list]
	cut_list1 = cut_list1 + algae_list + outlier_list
	gene2 = yesMake(cut_list1, gene1, tree_file_name)

	# Replace `gene` by the two new entries in the to-do file.
	todo_path = sys.argv[2]
	with open(todo_path, "r") as f:
		todo_list = [line.rstrip() for line in f]
	todo_list = [entry for entry in todo_list if entry != gene]
	todo_list.extend([gene1, gene2])
	with open(todo_path, "w") as todo:
		for entry in todo_list:
			todo.write(entry + "\n")
예제 #5
0
def main():
    """Classify the pairwise homology relationships within one gene tree.

    Command line options:
        --genetree    File containing a single gene tree (required).
        --out_format  'tabular' (default) or 'csv'.
        --filters     Comma-separated homology types.

    Leaves without an underscore are ignored; the remaining ones are read as
    ``gene_species``. A species is labelled "one" if it occurs once among the
    leaves and "many" otherwise. Every pair of leaves is filed under
    'paralogs' (same species) or '<label1>-to-<label2>'.

    Output:
        tabular: one "gene1<TAB>gene2<TAB>type" line for each pair whose type
            is listed in --filters.
        csv: the comma-joined leaf names, printed only when every homology
            type that has at least one pair is listed in --filters.
    """
    parser = optparse.OptionParser(
        usage="usage: %prog --genetree <genetree-file> --speciestree <speciestree-file> [options]")
    parser.add_option('--genetree', help='GeneTree in nhx format')
    parser.add_option('--out_format',
                      type='string',
                      default='tabular',
                      help='Choose output format')
    parser.add_option('--filters', default='', help='Filter families')

    options, args = parser.parse_args()

    if options.genetree is None:
        parser.error(
            "--genetree option must be specified, GeneTree in nhx format")

    with open(options.genetree, 'r') as handle:
        contents = handle.read()

    # Remove empty NHX features that can be produced by TreeBest but break ete3
    genetree = PhyloTree(contents.replace('[&&NHX]', ''))

    # Genetree nodes are required to be in gene_species format
    leaves_list = [name for name in genetree.get_leaf_names() if '_' in name]

    # "one" on the first sighting of a species, "many" from the second on.
    species_dict = {}
    for species in (name.split("_")[1] for name in leaves_list):
        species_dict[species] = "many" if species in species_dict else "one"

    homologies = {
        'one-to-one': [],
        'one-to-many': [],
        'many-to-one': [],
        'many-to-many': [],
        'paralogs': []
    }

    def leaf_id(leaf):
        # Keep only the part after the first ":" when there is one.
        return leaf.split(":")[1] if ":" in leaf else leaf

    # File every unordered pair of leaves under its homology type.
    for pos, first in enumerate(leaves_list):
        for second in leaves_list[pos + 1:]:
            id1 = leaf_id(first)
            id2 = leaf_id(second)
            species1 = id1.split("_")[1]
            species2 = id2.split("_")[1]
            homology_type = (
                'paralogs' if species1 == species2 else
                species_dict[species1] + "-to-" + species_dict[species2])
            homologies[homology_type].append((id1, id2))

    options.filters = options.filters.split(",")
    selected = options.filters

    if options.out_format == 'tabular':
        for homology_type, homologs_list in homologies.items():
            # Only the homology types named in the filter are reported.
            if homology_type not in selected:
                continue
            for gene1, gene2 in homologs_list:
                print("%s\t%s\t%s" % (gene1, gene2, homology_type))
    elif options.out_format == 'csv':
        # The family is suppressed as soon as one populated homology type
        # is missing from the filter.
        rejected = any(
            homologs_list and homology_type not in selected
            for homology_type, homologs_list in homologies.items())
        if not rejected:
            print(','.join(leaves_list))
def pre_prune(gene):
	"""Interactively split the tree of `gene` into separate gene-copy trees.

	The tree ``<gene>/<gene>.3.fa.tre`` is first copied to
	``<gene>_all100/<gene>_all100.3.fa.tre``. Each tree in the work list is
	then displayed and the user may repeatedly split a clade off it: the
	selected group (plus optional algae and outlier leaves) is written to a
	new ``<gene>_allNNN`` directory (NNN counting up from 101) and the
	current tree is overwritten with the remaining leaves (algae and outlier
	leaves are kept there as well). Trees created this way are appended to
	the work list and are therefore offered for further splitting too.
	Finally every name in the work list is appended, one per line, to the
	file named by ``sys.argv[1]``.

	Args:
		gene: Gene name; used as directory name and file-name prefix.
	"""
	algae_prompt = "\nIs there an algae group that is sister to all shown families? (y/n)"
	outlier_prompt = "\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)"

	def said_yes(answer):
		# Slice instead of index: an empty answer means "no" rather than
		# raising IndexError.
		return answer[:1] == "y"

	def ensure_dir(path):
		# Create the directory without going through a shell, so names with
		# blanks or shell metacharacters are handled literally.
		if not os.path.isdir(path):
			os.mkdir(path)

	full_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
	m = 100
	start_gene = "{}_all{}".format(gene, m)
	ensure_dir(start_gene)
	full_tree.write(format=1, outfile="{}/{}.3.fa.tre".format(start_gene, start_gene))
	m = m + 1

	# Work list; it deliberately grows while it is being iterated.
	l = [start_gene]
	for item in l:
		tree_path = "{}/{}.3.fa.tre".format(item, item)
		full_tree = PhyloTree(tree_path)
		view_rooted_tree(full_tree)
		print("Tree for {}".format(item))
		c = raw_input("Split off a monophyletic gene copy? (y/n)")
		if said_yes(c):
			algae_choice = raw_input(algae_prompt)
			outlier_choice = raw_input(outlier_prompt)
		while said_yes(c):
			if said_yes(algae_choice):
				print("\nFirst, let's define the algae clade.")
				algae_list = clade_to_tree(full_tree)
			else:
				algae_list = []

			outlier_list = []
			if said_yes(outlier_choice):
				print("\nLet's define the outlier group. ")
				out_choice = raw_input("\nIs there a monophyletic clade in the outlier group? (y/n)")
				while said_yes(out_choice):
					outlier_list = outlier_list + clade_to_tree(full_tree)
					out_choice = raw_input("\nIs there another monopyletic clade to add to the outlier group? (y/n)")
				other_choice = raw_input("Are there additional genes in the outlier group? (y/n)")
				while said_yes(other_choice):
					other_copies = raw_input("\nEnter genes to include, separated by a space. Enter only up to ten genes at a time.")
					# The names are taken as typed and are not checked against
					# the tree here. split() without an argument drops the
					# empty names that repeated blanks would produce.
					outlier_list = outlier_list + other_copies.split()
					other_choice = raw_input("Are there more genes to enter? (y/n)")

			b = "{}_all{}".format(gene, m)
			tree1 = PhyloTree(tree_path)
			tree1.set_outgroup(tree1.get_midpoint_outgroup())
			print("\nFor the monophyletic gene copy:")
			group_list = clade_to_tree(tree1)
			group_list = group_list + algae_list + outlier_list
			gene_names = tree1.get_leaf_names()
			if len(group_list) == len(gene_names):
				# Nothing would be left to split off.
				c1 = raw_input("\nList includes all copies on tree.\nMake gene with all copies? (y/n)")
				if c1 == "y":
					c = "n"
				else:
					print("\nGroup crosses root. Unable to make group.\nChoose new group.")
					c = "y"
			else:
				cut_list = [name for name in gene_names if name not in group_list]
				cut_list = cut_list + algae_list + outlier_list
				ensure_dir(b)
				tree2 = PhyloTree(tree_path)
				tree2.set_outgroup(tree2.get_midpoint_outgroup())
				tree2.prune(group_list, preserve_branch_length=True)
				tree2.write(format=1, outfile="{}/{}.3.fa.tre".format(b, b))
				tree1.prune(cut_list, preserve_branch_length=True)
				tree1.write(format=1, outfile=tree_path)
				# Queue the new tree only now that its file exists; queuing it
				# earlier left entries without a tree file (or duplicates) in
				# the work list whenever no split was actually made.
				l.append(b)
				m = m + 1
				print("\nTree now looks like this.")
				view_rooted_tree(tree1)
				c = raw_input("Split off a monophyletic clade? (y/n)")
				if said_yes(c):
					algae_choice = raw_input(algae_prompt)
					outlier_choice = raw_input(outlier_prompt)

	with open(sys.argv[1], "a") as p:
		for name in l:
			p.write(name + "\n")
예제 #7
0
            EventSummary.append("event(%i,duplication)" %(n.S))
    else:
        n.Ev = "S"
    logger.debug("name: %s",n.name)
    logger.debug("S: %s",n.S)
    logger.debug("Ev: %s",n.Ev)
    logger.debug("ND: %s",n.ND)
    i+=1


# Write the collected event descriptions, one per line, to
# "<OutPrefixName>.events.txt".
EventsFile = OutPrefixName + ".events.txt"
with open(EventsFile,"w") as File:
        File.write("\n".join(EventSummary)+"\n")


# Restrict the reconciled tree to the leaves that are present in the gene tree.
recon_tree.prune(genetree.get_leaf_names(),preserve_branch_length=True)

# Renumber the nodes of the pruned tree in postorder (ND = 0, 1, 2, ...) and
# remember the S and Ev attributes of each node under its new number.
i=0
node_2events_and_sp = {}
for n in recon_tree.traverse("postorder"):
    n.ND = i
    node_2events_and_sp[n.ND]={"S": n.S, "Ev":n.Ev}
    i+=1

logger.debug(node_2events_and_sp)


# Number the gene tree nodes in postorder as well and copy S over by number.
# NOTE(review): this presumes both trees yield corresponding nodes in the same
# postorder -- confirm. The rest of this loop lies outside this excerpt.
i=0
for n in genetree.traverse("postorder"):
    n.ND=i
    n.S=node_2events_and_sp[n.ND]["S"]
예제 #8
0
def pre_prune(gene):
    """Interactively split the tree of `gene` into separate gene-copy trees.

    The tree ``<gene>/<gene>.3.fa.tre`` is first copied to
    ``<gene>_all100/<gene>_all100.3.fa.tre``. Each tree in the work list is
    then displayed and the user may repeatedly split a clade off it: the
    selected group (plus optional algae and outlier leaves) is written to a
    new ``<gene>_allNNN`` directory (NNN counting up from 101) and the
    current tree is overwritten with the remaining leaves (algae and outlier
    leaves are kept there as well). Trees created this way are appended to
    the work list and are therefore offered for further splitting too.
    Finally every name in the work list is appended, one per line, to the
    file named by ``sys.argv[1]``.

    Args:
        gene: Gene name; used as directory name and file-name prefix.
    """
    algae_prompt = "\nIs there an algae group that is sister to all shown families? (y/n)"
    outlier_prompt = "\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)"

    def said_yes(answer):
        # Slice instead of index: an empty answer means "no" rather than
        # raising IndexError.
        return answer[:1] == "y"

    def ensure_dir(path):
        # Create the directory without going through a shell, so names with
        # blanks or shell metacharacters are handled literally.
        if not os.path.isdir(path):
            os.mkdir(path)

    full_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
    m = 100
    start_gene = "{}_all{}".format(gene, m)
    ensure_dir(start_gene)
    full_tree.write(format=1,
                    outfile="{}/{}.3.fa.tre".format(start_gene, start_gene))
    m = m + 1

    # Work list; it deliberately grows while it is being iterated.
    l = [start_gene]
    for item in l:
        tree_path = "{}/{}.3.fa.tre".format(item, item)
        full_tree = PhyloTree(tree_path)
        view_rooted_tree(full_tree)
        print("Tree for {}".format(item))
        c = raw_input("Split off a monophyletic gene copy? (y/n)")
        if said_yes(c):
            algae_choice = raw_input(algae_prompt)
            outlier_choice = raw_input(outlier_prompt)
        while said_yes(c):
            if said_yes(algae_choice):
                print("\nFirst, let's define the algae clade.")
                algae_list = clade_to_tree(full_tree)
            else:
                algae_list = []

            outlier_list = []
            if said_yes(outlier_choice):
                print("\nLet's define the outlier group. ")
                out_choice = raw_input(
                    "\nIs there a monophyletic clade in the outlier group? (y/n)"
                )
                while said_yes(out_choice):
                    outlier_list = outlier_list + clade_to_tree(full_tree)
                    out_choice = raw_input(
                        "\nIs there another monopyletic clade to add to the outlier group? (y/n)"
                    )
                other_choice = raw_input(
                    "Are there additional genes in the outlier group? (y/n)")
                while said_yes(other_choice):
                    other_copies = raw_input(
                        "\nEnter genes to include, separated by a space. Enter only up to ten genes at a time."
                    )
                    # The names are taken as typed and are not checked against
                    # the tree here. split() without an argument drops the
                    # empty names that repeated blanks would produce.
                    outlier_list = outlier_list + other_copies.split()
                    other_choice = raw_input(
                        "Are there more genes to enter? (y/n)")

            b = "{}_all{}".format(gene, m)
            tree1 = PhyloTree(tree_path)
            tree1.set_outgroup(tree1.get_midpoint_outgroup())
            print("\nFor the monophyletic gene copy:")
            group_list = clade_to_tree(tree1)
            group_list = group_list + algae_list + outlier_list
            gene_names = tree1.get_leaf_names()
            if len(group_list) == len(gene_names):
                # Nothing would be left to split off.
                c1 = raw_input(
                    "\nList includes all copies on tree.\nMake gene with all copies? (y/n)"
                )
                if c1 == "y":
                    c = "n"
                else:
                    print(
                        "\nGroup crosses root. Unable to make group.\nChoose new group."
                    )
                    c = "y"
            else:
                cut_list = [
                    name for name in gene_names if name not in group_list
                ]
                cut_list = cut_list + algae_list + outlier_list
                ensure_dir(b)
                tree2 = PhyloTree(tree_path)
                tree2.set_outgroup(tree2.get_midpoint_outgroup())
                tree2.prune(group_list, preserve_branch_length=True)
                tree2.write(format=1, outfile="{}/{}.3.fa.tre".format(b, b))
                tree1.prune(cut_list, preserve_branch_length=True)
                tree1.write(format=1, outfile=tree_path)
                # Queue the new tree only now that its file exists; queuing it
                # earlier left entries without a tree file (or duplicates) in
                # the work list whenever no split was actually made.
                l.append(b)
                m = m + 1
                print("\nTree now looks like this.")
                view_rooted_tree(tree1)
                c = raw_input("Split off a monophyletic clade? (y/n)")
                if said_yes(c):
                    algae_choice = raw_input(algae_prompt)
                    outlier_choice = raw_input(outlier_prompt)

    with open(sys.argv[1], "a") as p:
        for name in l:
            p.write(name + "\n")
def process_tree(treepath):
    ''' Extract orthology relationships between the target taxid and the
    other species of one tree, organized by orthology type and species code.

    The tree is rooted at its midpoint and standardized. Each leaf gets a
    `taxid` attribute taken from the part of its name before the first "."
    and, when `args.conv_table` is set, a `good_name` looked up in
    `conversion`. If `collapse == 'yes'`, clades that contain only the target
    species are collapsed into a single leaf. For every speciation event the
    side holding the target taxid provides the co-orthologs; the other side
    is grouped per species.

    Relies on module-level names that are not defined in this function:
    `args`, `ncbi`, `conversion`, `target_taxid`, `collapse`, `get_species`.

    Args:
        treepath: Path of the tree file (converted to str, trailing
            whitespace removed).

    Returns:
        If `args.pairs_table` is set: a tuple ``(event_lines, pairs)`` where
        `pairs` lists every (source name, ortholog name) tuple of the kept
        events. Otherwise only `event_lines`. Each event line is a
        tab-joined record: co-orthologs, orthology type, species code,
        orthologs, tree file name, species name, followed by a newline.
        When the tree cannot be rooted, ``[]`` is returned (or ``([], [])``
        for a single-leaf tree in pairs mode).
    '''
    treepath = str(treepath).rstrip()
    t = PhyloTree(treepath, sp_naming_function=get_species)
    treefile = os.path.basename(treepath)
    t.dist = 0

    outgroup = t.get_midpoint_outgroup()
    try:
        t.set_outgroup(outgroup)
        t.standardize()
    except Exception:
        # Deliberate best-effort: the concrete ete3 error type is not in
        # scope here, but KeyboardInterrupt/SystemExit must not be swallowed.
        single_leaf = len(t) == 1
        if single_leaf:
            sys.stderr.write(treefile + 'len(t) == 1' + '\n')
        else:
            sys.stderr.write(treefile + 'len(t) != 1' + '\n')
        if not args.pairs_table:
            return []
        if single_leaf:
            return ([], [])
        # Pairs mode, several leaves: root on the first leaf and carry on.
        t.set_outgroup(t.get_leaf_names()[0])

    def label(_leaf):
        # Name under which a leaf is reported.
        return _leaf.good_name if args.conv_table else _leaf.name

    # Species code -> scientific name ('' when it cannot be resolved).
    names = {}
    for leaf in t:
        sp = str(leaf.name.split('.')[0])
        leaf.taxid = sp
        if sp not in names:
            # One taxonomy lookup per species code is enough.
            try:
                names[sp] = ncbi.get_taxid_translator([sp])[int(sp)]
            except Exception:
                # Non-numeric or unknown code: keep an empty name.
                names[sp] = ''

        if args.conv_table:
            try:
                good_name = "%s" % (conversion[leaf.name][0])
            except Exception:
                # No usable conversion entry: fall back to the raw name.
                good_name = leaf.name
            leaf.good_name = good_name

    node2content = t.get_cached_content()
    target_species = set([target_taxid])

    def is_sp_specific(_node):
        # True when every leaf below _node belongs to the target species.
        _species = set([_leaf.species for _leaf in node2content[_node]])
        return not (_species - target_species)

    # Collapse clades that contain only the target species into one leaf.
    if collapse == 'yes':
        for n in t.get_leaves(is_leaf_fn=is_sp_specific):
            if n.children:
                for ch in n.get_children():
                    ch.detach()
                n.taxid = target_taxid
                n.name = "{%s}" % ('|'.join(
                    [_lf.name for _lf in node2content[n]]))
                if args.conv_table:
                    n.good_name = "{%s}" % ('|'.join(
                        [_lf.good_name for _lf in node2content[n]]))

    all_ortholgs_pairs = []
    event_lines = []

    for ev in t.get_descendant_evol_events():
        # Only speciation events define orthology.
        if ev.etype != "S":
            continue

        source_seqs = ev.node.children[0]
        ortho_seqs = ev.node.children[1]

        if target_taxid:
            target = str(target_taxid)
            sp_1 = set(leaf.taxid for leaf in source_seqs)
            sp_2 = set(leaf.taxid for leaf in ortho_seqs)
            # Make sure the side that holds the target is the source side;
            # events that do not involve the target are skipped.
            if target not in sp_1:
                if target not in sp_2:
                    continue
                source_seqs, ortho_seqs = ortho_seqs, source_seqs

        source_seqs_names = [label(leaf) for leaf in source_seqs]
        co_orthologs = sorted(source_seqs_names)

        orthologs = defaultdict(set)
        for leaf in ortho_seqs:
            orthologs[str(leaf.name.split('.')[0])].add(label(leaf))

        _otype = "one-to-" if len(source_seqs) == 1 else "many-to-"

        for sp, orth in orthologs.items():
            otype = _otype + ("one" if len(orth) == 1 else "many")
            # .get(): a collapsed leaf was renamed after `names` was filled,
            # so its code may be missing; report an empty name then.
            event_lines.append('\t'.join([
                ','.join(co_orthologs), otype,
                str(sp), ','.join(sorted(orth)), treefile,
                names.get(sp, ''), '\n'
            ]))

        if args.pairs_table:
            # Iterating a node yields its leaves, so one pass per side is
            # enough; the pairs of this event are appended directly.
            ortho_seqs_names = [label(leaf) for leaf in ortho_seqs]
            all_ortholgs_pairs.extend(
                itertools.product(source_seqs_names, ortho_seqs_names))

    if args.pairs_table:
        return (event_lines, all_ortholgs_pairs)
    return event_lines