예제 #1
0
def extract_clades(newick_file, processed_newick_out=None):
    """Pre-process a tree and split it into numbered clades.

    The tree is midpoint-rooted, ladderized and converted to ultrametric
    form, then ``get_pruned_branch`` selects the branches to report.

    Args:
        newick_file: Newick input handed to ``PhyloTree``.
        processed_newick_out: Optional output path; when given, the
            pre-processed tree is written there with ``format=1``.

    Returns:
        A dict mapping "1", "2", ... to the list of ``name`` attributes
        obtained by iterating each selected branch.  Branches are numbered
        from the longest (by ``len``) to the shortest.
    """
    # preprocess tree
    print("Pre-processing tree ({})".format(newick_file))
    tree = PhyloTree(newick_file)
    tree.set_outgroup(tree.get_midpoint_outgroup())
    tree.ladderize()
    tree.convert_to_ultrametric()
    if processed_newick_out is not None:
        tree.write(format=1, outfile=processed_newick_out)

    # calculate clades
    print("Calling clades ({})".format(newick_file))
    len_tree = len(tree)

    # Both predicates keep the (node, tree) signature that
    # get_pruned_branch is called with, even though ``tree`` is unused.
    def condition_discard(node, tree):
        return len(node) < 3

    def condition_ok(node, tree):
        return len(node) < max(10, len_tree / 50)

    branches = get_pruned_branch(tree, tree, condition_discard, condition_ok,
                                 [])
    # Negated length as key: largest branch first, ties keep their order.
    ordered = sorted(branches, key=lambda branch: -len(branch))
    return {
        str(rank): [node.name for node in branch]
        for rank, branch in enumerate(ordered, 1)
    }
예제 #2
0
def yes_choice(tree_file_name, gene, algae_choice):
    """Interactively pick one family on a tree and update the to-do list.

    The tree is midpoint-rooted and the user selects, through
    ``clade_to_tree``, an optional algae clade, an optional outlier group
    and one family.  ``yesMake`` is then called twice: first with the
    leaves outside the chosen family, then with the leaves outside that
    first list; both lists are extended by the algae and outlier lists.
    Finally the file named by ``sys.argv[2]`` is rewritten with ``gene``
    removed and the two names returned by ``yesMake`` appended.

    Args:
        tree_file_name: Tree file handed to ``PhyloTree`` and ``yesMake``.
        gene: Entry removed from the to-do list; also passed to the first
            ``yesMake`` call.
        algae_choice: Answer string; a leading "y" means an algae clade is
            defined first.
    """
    t = PhyloTree(tree_file_name)
    t.set_outgroup(t.get_midpoint_outgroup())
    gene_names = t.get_leaf_names()

    # startswith() is used instead of answer[0] so that an empty answer
    # counts as "no" rather than raising IndexError.
    if algae_choice.startswith("y"):
        print("\nFirst, let's define the algae clade.")
        algae_list = clade_to_tree(t)
    else:
        algae_list = []

    outlier_list = []
    outlier_choice = raw_input(
        "\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)"
    )
    if outlier_choice.startswith("y"):
        print(
            "\nLet's define the outlier group. \nFirst you can add any spceices that are in a monophyletic clade"
        )
        outlier_list = clade_to_tree(t)
        other_copies = raw_input(
            "If there are other genes in the outlier group, enter them here, separated by a space, or else enter n."
        )
        if other_copies.strip() != "n":
            # split() without a separator ignores repeated or trailing
            # blanks, so no empty gene names end up in the list.
            outlier_list = outlier_list + other_copies.split()

    print(
        "\nSelect one monophyletic family to define. \nYou will have a later chance to split this family again if needed."
    )
    group_list = clade_to_tree(t)

    ###tree1
    in_group = set(group_list)
    cut_list = [i for i in gene_names if i not in in_group]
    cut_list = cut_list + algae_list + outlier_list
    gene1 = yesMake(cut_list, gene, tree_file_name)

    ###tree2
    in_first_cut = set(cut_list)
    cut_list1 = [i for i in gene_names if i not in in_first_cut]
    cut_list1 = cut_list1 + algae_list + outlier_list
    gene2 = yesMake(cut_list1, gene1, tree_file_name)

    # Replace ``gene`` in the to-do file by the two newly made entries.
    with open(sys.argv[2], "r") as f:
        todo_list = [line.rstrip() for line in f]
    todo_list = [i for i in todo_list if i != gene]
    todo_list.append(gene1)
    todo_list.append(gene2)
    with open(sys.argv[2], "w") as todo:
        for i in todo_list:
            todo.write(i + "\n")
예제 #3
0
def yes_choice(tree_file_name, gene, algae_choice):
	"""Interactively pick one family on a tree and update the to-do list.

	The tree is midpoint-rooted and the user selects, through
	``clade_to_tree``, an optional algae clade, an optional outlier group
	and one family.  ``yesMake`` is then called twice: first with the
	leaves outside the chosen family, then with the leaves outside that
	first list; both lists are extended by the algae and outlier lists.
	Finally the file named by ``sys.argv[2]`` is rewritten with ``gene``
	removed and the two names returned by ``yesMake`` appended.

	Args:
		tree_file_name: Tree file handed to ``PhyloTree`` and ``yesMake``.
		gene: Entry removed from the to-do list; also passed to the first
			``yesMake`` call.
		algae_choice: Answer string; a leading "y" means an algae clade is
			defined first.
	"""
	t = PhyloTree(tree_file_name)
	t.set_outgroup(t.get_midpoint_outgroup())
	gene_names = t.get_leaf_names()

	# startswith() is used instead of answer[0] so that an empty answer
	# counts as "no" rather than raising IndexError.
	if algae_choice.startswith("y"):
		print("\nFirst, let's define the algae clade.")
		algae_list = clade_to_tree(t)
	else:
		algae_list = []

	outlier_list = []
	outlier_choice = raw_input("\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)")
	if outlier_choice.startswith("y"):
		print("\nLet's define the outlier group. \nFirst you can add any spceices that are in a monophyletic clade")
		outlier_list = clade_to_tree(t)
		other_copies = raw_input("If there are other genes in the outlier group, enter them here, separated by a space, or else enter n.")
		if other_copies.strip() != "n":
			# split() without a separator ignores repeated or trailing
			# blanks, so no empty gene names end up in the list.
			outlier_list = outlier_list + other_copies.split()

	print("\nSelect one monophyletic family to define. \nYou will have a later chance to split this family again if needed.")
	group_list = clade_to_tree(t)

	###tree1
	in_group = set(group_list)
	cut_list = [i for i in gene_names if i not in in_group]
	cut_list = cut_list + algae_list + outlier_list
	gene1 = yesMake(cut_list, gene, tree_file_name)

	###tree2
	in_first_cut = set(cut_list)
	cut_list1 = [i for i in gene_names if i not in in_first_cut]
	cut_list1 = cut_list1 + algae_list + outlier_list
	gene2 = yesMake(cut_list1, gene1, tree_file_name)

	# Replace ``gene`` in the to-do file by the two newly made entries.
	with open(sys.argv[2], "r") as f:
		todo_list = [line.rstrip() for line in f]
	todo_list = [i for i in todo_list if i != gene]
	todo_list.append(gene1)
	todo_list.append(gene2)
	with open(sys.argv[2], "w") as todo:
		for i in todo_list:
			todo.write(i + "\n")
예제 #4
0
    def calculate_nodes(self):
        """Score every internal node of the tree and store the results.

        Loads the tree with its FASTA alignment, midpoint-roots it, builds
        the position matrix when none is set yet, deletes the leaves
        reported by ``fp.retrieve_features`` and then, for each internal
        node in preorder, attaches a score and a haplotype (plus, when
        ``self.compute_logos == "Y"``, a haplotype matrix and logo figure)
        as node features.

        On success the processed tree and the per-node dictionaries (keyed
        by a running internal-node counter) are stored on ``self``.  On any
        error a message is written to stderr and the process exits with
        status 1.
        """
        try:
            tree = PhyloTree(self.tree_in,
                             alignment=self.align_in,
                             alg_format="fasta")
            tree.set_outgroup(tree.get_midpoint_outgroup())

            leaf_deleting_list = set()
            # Identity test: ``== None`` would be evaluated element-wise if
            # the matrix is array-like -- TODO confirm the matrix type.
            if self.position_matrix is None:
                uniprot_hit_hash, leaf_deleting_list = fp.retrieve_features(
                    self.study_features, self.table_info, self.min_eval,
                    self.uniprot_info)
                # If we want to update the features, we have to delete the
                # position matrix (with update method)
                self.position_matrix = fp.get_positions_matrix(
                    uniprot_hit_hash, tree)
            for leaf in tree.iter_leaves():
                if leaf.name in leaf_deleting_list:
                    leaf.delete()

            # node_number counts internal nodes only; ``_nid`` is the
            # preorder index over all nodes, leaves included.
            node_number = 0
            node_scores = {}
            node_haplotypes = {}
            node_haplotype_matrices = {}
            node_haplotype_logos = {}
            for index, node in enumerate(tree.traverse("preorder")):
                node._nid = index
                if node.is_leaf():
                    continue

                node_sequence_matrix = fp.annotated_sequence_extractor(
                    node, self.position_matrix, self.differentiate_gaps)

                node_score = round(
                    fp.calculate_node_score(node_sequence_matrix,
                                            self.calc_alg), 2)
                node.add_feature("node_score", node_score)
                node_scores[node_number] = node_score

                node_haplotype = fp.haplotype_parse(node_sequence_matrix)
                node.add_feature("node_haplotype", node_haplotype)
                node_haplotypes[node_number] = node_haplotype

                if self.compute_logos == "Y":
                    node_haplotype_matrix = fp.haplotype_matrix_calculator(
                        node_sequence_matrix)
                    node.add_feature("node_haplotype_matrix",
                                     node_haplotype_matrix)
                    node_haplotype_matrices[
                        node_number] = node_haplotype_matrix
                    if node_haplotype_matrix is not None:
                        node_haplotype_logo = logomaker.Logo(
                            node_haplotype_matrix,
                            color_scheme="dmslogo_funcgroup",
                            show_spines=False).fig
                    else:
                        node_haplotype_logo = None
                    node.add_feature("node_haplotype_logo",
                                     node_haplotype_logo)
                    node_haplotype_logos[node_number] = node_haplotype_logo

                node_number += 1

            self.processed_tree = tree
            self.node_scores = node_scores
            self.node_haplotypes = node_haplotypes
            self.node_haplotype_matrices = node_haplotype_matrices
            self.node_haplotype_logos = node_haplotype_logos

        except Exception as err:
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate; the cause is reported instead of hidden.
            sys.stderr.write("Error at calculating nodes.\n")
            sys.stderr.write("{}: {}\n".format(type(err).__name__, err))
            sys.exit(1)

        return
예제 #5
0
"""
from ete3 import PhyloTree
sample_list = [
    r"D:\Users\suuser\Desktop\ADRB2_Trimmed.txt",
    r"D:\Users\suuser\Desktop\ACM2_Trimmed.txt",
    r"D:\Users\suuser\Desktop\AA2AR_Trimmed.txt",
    r"D:\Users\suuser\Desktop\OPRM_Trimmed.txt"
]
file_name = ["ADRB2", "ACM2", "AA2AR", "OPRM"]
# enumerate() yields the position directly; list.index() would return the
# first match and so pick the wrong label for a duplicated path.
for idx, sample in enumerate(sample_list):
    file = file_name[idx]
    # Only the first line of the file is parsed as the Newick string; the
    # context manager closes the handle once it has been read.
    with open(sample, "r") as t:
        line = t.readline()
    tree = PhyloTree(line)
    R = tree.get_midpoint_outgroup()
    tree.set_outgroup(R)  #for rooting of the tree
    for node in tree.traverse():
        leaves = node.get_children()
        if len(leaves) < 2 or node.is_leaf():
            continue
        # Swap the children when any leaf under the second child has
        # "9606" as the last "|"-separated field of its string form.
        leaf2 = leaves[1].get_leaves()
        for i in leaf2:
            leaf_node = str(i).split("|")[-1]
            print(leaf_node)
            if leaf_node == "9606":
                node.swap_children()
                break
def pre_prune(gene):
	"""Interactively split the tree of ``gene`` into separate group trees.

	``<gene>/<gene>.3.fa.tre`` is copied into a new ``<gene>_all100``
	directory.  Each queued tree is then shown to the user, who may carve
	off a group (plus optional algae and outlier groups chosen through
	``clade_to_tree``).  Every carved-off group is written to its own
	``<gene>_all<N>`` directory and queued so it can be split again, while
	the remainder overwrites the tree it was taken from.  At the end all
	group names are appended, one per line, to the file named by
	``sys.argv[1]``.

	Args:
		gene: Name used both as the directory and as the file prefix of
			the input tree.
	"""
	tree_path = "{}/{}.3.fa.tre"
	full_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
	m = 100
	start_gene = "{}_all{}".format(gene, str(m))
	# Create the directory from Python rather than through a shell string.
	if not os.path.isdir(start_gene):
		os.makedirs(start_gene)
	full_tree.write(format=1, outfile=tree_path.format(start_gene, start_gene))
	m = m + 1
	# ``l`` doubles as the work queue: groups appended inside the loop are
	# visited later by this same ``for``.
	l = [start_gene]
	for item in l:
		full_tree = PhyloTree(tree_path.format(item, item))
		view_rooted_tree(full_tree)
		print("Tree for {}".format(item))
		c = raw_input("Split off a monophyletic gene copy? (y/n)")
		# startswith() treats an empty answer as "no"; answer[0] would
		# raise IndexError.
		if c.startswith("y"):
			algae_choice = raw_input("\nIs there an algae group that is sister to all shown families? (y/n)")
			outlier_choice = raw_input("\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)")
		while c.startswith("y"):
			if algae_choice.startswith("y"):
				print("\nFirst, let's define the algae clade.")
				algae_list = clade_to_tree(full_tree)
			else:
				algae_list = []
			if outlier_choice.startswith("y"):
				print("\nLet's define the outlier group. ")
				outlier_list = []
				out_choice = raw_input("\nIs there a monophyletic clade in the outlier group? (y/n)")
				while out_choice.startswith("y"):
					outlier_list = outlier_list + clade_to_tree(full_tree)
					out_choice = raw_input("\nIs there another monopyletic clade to add to the outlier group? (y/n)")
				other_choice = raw_input("Are there additional genes in the outlier group? (y/n)")
				while other_choice.startswith("y"):
					other_copies = raw_input("\nEnter genes to include, separated by a space. Enter only up to ten genes at a time.")
					# split() without a separator ignores repeated or
					# trailing blanks, so no empty names are added.
					# NOTE(review): the names are not checked against the
					# tree here; confirm whether that check is wanted.
					outlier_list = outlier_list + other_copies.split()
					other_choice = raw_input("Are there more genes to enter? (y/n)")
			else:
				outlier_list = []
			b = "{}_all{}".format(gene, str(m))
			tree1 = PhyloTree(tree_path.format(item, item))
			tree1.set_outgroup(tree1.get_midpoint_outgroup())
			print("\nFor the monophyletic gene copy:")
			group_list = clade_to_tree(tree1)
			group_list = group_list + algae_list + outlier_list
			gene_names = tree1.get_leaf_names()
			if len(group_list) == len(gene_names):
				c1 = raw_input("\nList includes all copies on tree.\nMake gene with all copies? (y/n)")
				if c1 == "y":
					c = "n"
				else:
					print("\nGroup crosses root. Unable to make group.\nChoose new group.")
					c = "y"
			else:
				in_group = set(group_list)
				cut_list = [i for i in gene_names if i not in in_group]
				cut_list = cut_list + algae_list + outlier_list
				if not os.path.isdir(b):
					os.makedirs(b)
				tree2 = PhyloTree(tree_path.format(item, item))
				tree2.set_outgroup(tree2.get_midpoint_outgroup())
				tree2.prune(group_list, preserve_branch_length=True)
				tree2.write(format=1, outfile=tree_path.format(b, b))
				tree1.prune(cut_list, preserve_branch_length=True)
				tree1.write(format=1, outfile=tree_path.format(item, item))
				# Queue the new group only once its tree exists on disk;
				# queuing it earlier left names without a tree file (or
				# duplicates) in the queue when no split was made.
				l.append(b)
				m = m + 1
				print("\nTree now looks like this.")
				view_rooted_tree(tree1)
				c = raw_input("Split off a monophyletic clade? (y/n)")
				if c.startswith("y"):
					algae_choice = raw_input("\nIs there an algae group that is sister to all shown families? (y/n)")
					outlier_choice = raw_input("\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)")

	with open(sys.argv[1], "a") as p:
		for i in l:
			p.write(i + "\n")
예제 #7
0
# libraries
from ete3 import PhyloTree

# read tree from file
phy = PhyloTree("adar_hol.01.iqt.contree.newick")

# assign species names to tree (text before the first "_" of the leaf name)
phy.set_species_naming_function(lambda node: node.name.split("_")[0])
for n in phy.get_leaves():
    print("node:", n.name, "Species name:", n.species)

# root tree at its midpoint
phy_outgroup = phy.get_midpoint_outgroup()
phy.set_outgroup(phy_outgroup)

# find evolutionary events once; both reports below reuse the same list
evev = phy.get_descendant_evol_events(sos_thr=0.9)

# orthologs of the speciation events
for ev in evev:
    if ev.etype == "S":
        print(ev.orthologs)

# all events
for ev in evev:
    print(ev.etype, ','.join(ev.in_seqs), "<====>", ','.join(ev.out_seqs))

# all events involving either Hsap or Drer
fseqs = lambda slist: [
예제 #8
0
            N = AttrFace("name", fsize=34, fgcolor="black")
            N.margin_left = 20
            N.margin_right = 20
            faces.add_face_to_node(N, node, column=0)


# Command line: tree file, FASTA alignment, prediction table, output image.
t = sys.argv[1]
alg = sys.argv[2]
predic_table = sys.argv[3]
out_img_name = sys.argv[4]

#bilaterian_desc = ncbi.get_descendant_taxa('Bilateria', collapse_subspecies=True)
#print type(bilaterian_desc)

tree_upp = PhyloTree(newick=t, alignment=alg, alg_format="fasta")
R = tree_upp.get_midpoint_outgroup()
tree_upp.set_outgroup(R)

# Map column 0 to column 5 of the tab-separated table, skipping blank lines
# and lines starting with "#".  The context manager closes the file.
predict_name = {}
with open(predic_table) as table:
    for line in table:
        if line.rstrip() and not line.startswith("#"):
            fields = line.split("\t")
            prot_name = fields[0]
            pred_name = fields[5]
            predict_name[prot_name] = pred_name

ts = TreeStyle()
ts.show_leaf_name = False
ts.tree_width = 2000

ts.layout_fn = layout
         protein_annotations[s] = 'b'
     else:
         protein_annotations[s] = 'c'
 # load in the tree
 t = PhyloTree('temp/queryCOG.hmmhits.fasta.aln.trimal.treefile')
 # try to root with largest paralog clade
 try:
     m = 0
     for clade in t.get_monophyletic(values=["c", "b"],
                                     target_attr="annot"):
         if len([l for l in clade.get_leaves()]) > m:
             rooting_clade = clade
             m = len([l for l in clade.get_leaves()])
     t.set_outgroup(clade)
 except:
     midpoint = t.get_midpoint_outgroup()
     if midpoint: t.set_outgroup(midpoint)
 annotate_leaf_proteins(t)
 # find the clade with the mapped sequence
 for l in t.iter_descendants():
     if l.name == mapped_seq:
         clade = l
 n = 0
 while n == 0:
     if clade.up:
         annotations = [
             leaf.annot for sister in clade.get_sisters()
             for leaf in sister.get_leaves()
         ]
         counts = dict(Counter(annotations))
         if 'c' not in counts.keys():
예제 #10
0
def pre_prune(gene):
    """Interactively split the tree of ``gene`` into separate group trees.

    ``<gene>/<gene>.3.fa.tre`` is copied into a new ``<gene>_all100``
    directory.  Each queued tree is then shown to the user, who may carve
    off a group (plus optional algae and outlier groups chosen through
    ``clade_to_tree``).  Every carved-off group is written to its own
    ``<gene>_all<N>`` directory and queued so it can be split again, while
    the remainder overwrites the tree it was taken from.  At the end all
    group names are appended, one per line, to the file named by
    ``sys.argv[1]``.

    Args:
        gene: Name used both as the directory and as the file prefix of
            the input tree.
    """
    tree_path = "{}/{}.3.fa.tre"
    full_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
    m = 100
    start_gene = "{}_all{}".format(gene, str(m))
    # Create the directory from Python rather than through a shell string.
    if not os.path.isdir(start_gene):
        os.makedirs(start_gene)
    full_tree.write(format=1,
                    outfile=tree_path.format(start_gene, start_gene))
    m = m + 1
    # ``l`` doubles as the work queue: groups appended inside the loop are
    # visited later by this same ``for``.
    l = [start_gene]
    for item in l:
        full_tree = PhyloTree(tree_path.format(item, item))
        view_rooted_tree(full_tree)
        print("Tree for {}".format(item))
        c = raw_input("Split off a monophyletic gene copy? (y/n)")
        # startswith() treats an empty answer as "no"; answer[0] would
        # raise IndexError.
        if c.startswith("y"):
            algae_choice = raw_input(
                "\nIs there an algae group that is sister to all shown families? (y/n)"
            )
            outlier_choice = raw_input(
                "\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)"
            )
        while c.startswith("y"):
            if algae_choice.startswith("y"):
                print("\nFirst, let's define the algae clade.")
                algae_list = clade_to_tree(full_tree)
            else:
                algae_list = []
            if outlier_choice.startswith("y"):
                print("\nLet's define the outlier group. ")
                outlier_list = []
                out_choice = raw_input(
                    "\nIs there a monophyletic clade in the outlier group? (y/n)"
                )
                while out_choice.startswith("y"):
                    outlier_list = outlier_list + clade_to_tree(full_tree)
                    out_choice = raw_input(
                        "\nIs there another monopyletic clade to add to the outlier group? (y/n)"
                    )
                other_choice = raw_input(
                    "Are there additional genes in the outlier group? (y/n)")
                while other_choice.startswith("y"):
                    other_copies = raw_input(
                        "\nEnter genes to include, separated by a space. Enter only up to ten genes at a time."
                    )
                    # split() without a separator ignores repeated or
                    # trailing blanks, so no empty names are added.
                    # NOTE(review): the names are not checked against the
                    # tree here; confirm whether that check is wanted.
                    outlier_list = outlier_list + other_copies.split()
                    other_choice = raw_input(
                        "Are there more genes to enter? (y/n)")
            else:
                outlier_list = []
            b = "{}_all{}".format(gene, str(m))
            tree1 = PhyloTree(tree_path.format(item, item))
            tree1.set_outgroup(tree1.get_midpoint_outgroup())
            print("\nFor the monophyletic gene copy:")
            group_list = clade_to_tree(tree1)
            group_list = group_list + algae_list + outlier_list
            gene_names = tree1.get_leaf_names()
            if len(group_list) == len(gene_names):
                c1 = raw_input(
                    "\nList includes all copies on tree.\nMake gene with all copies? (y/n)"
                )
                if c1 == "y":
                    c = "n"
                else:
                    print(
                        "\nGroup crosses root. Unable to make group.\nChoose new group."
                    )
                    c = "y"
            else:
                in_group = set(group_list)
                cut_list = [i for i in gene_names if i not in in_group]
                cut_list = cut_list + algae_list + outlier_list
                if not os.path.isdir(b):
                    os.makedirs(b)
                tree2 = PhyloTree(tree_path.format(item, item))
                tree2.set_outgroup(tree2.get_midpoint_outgroup())
                tree2.prune(group_list, preserve_branch_length=True)
                tree2.write(format=1, outfile=tree_path.format(b, b))
                tree1.prune(cut_list, preserve_branch_length=True)
                tree1.write(format=1, outfile=tree_path.format(item, item))
                # Queue the new group only once its tree exists on disk;
                # queuing it earlier left names without a tree file (or
                # duplicates) in the queue when no split was made.
                l.append(b)
                m = m + 1
                print("\nTree now looks like this.")
                view_rooted_tree(tree1)
                c = raw_input("Split off a monophyletic clade? (y/n)")
                if c.startswith("y"):
                    algae_choice = raw_input(
                        "\nIs there an algae group that is sister to all shown families? (y/n)"
                    )
                    outlier_choice = raw_input(
                        "\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)"
                    )

    with open(sys.argv[1], "a") as p:
        for i in l:
            p.write(i + "\n")
예제 #11
0
def process_tree(treepath):
    """Extract orthology relationships between the target taxid and others.

    The tree at ``treepath`` is loaded, its leaves are annotated with a
    taxid and a sanitized name, target-specific clades are optionally
    collapsed (module-level ``collapse == 'yes'``), the tree is
    midpoint-rooted and every speciation event with the target taxid on
    one side is turned into one output line per species on the other side.

    Args:
        treepath: Path of the tree file; converted with ``str`` and
            stripped of trailing whitespace.

    Returns:
        A list of tab-joined lines (co-orthologs, orthology type, species
        taxid, species name, orthologs, newline), or ``None`` when rooting
        fails on a single-leaf tree.

    Raises:
        Whatever ``set_outgroup`` raised, when rooting fails on a tree with
        more than one leaf.
    """
    treepath = str(treepath).rstrip()
    t = PhyloTree(treepath, sp_naming_function=get_species)

    # Annotate each leaf: the taxid is the text before the first "." of its
    # name, the display name comes from ``conversion`` when available.
    for leaf in t:
        tax = int(leaf.name.split(".", 1)[0])
        leaf.taxid = str(tax)

        try:
            good_name = "%s" % (conversion[leaf.name][0])
        except (KeyError, IndexError, TypeError):
            # No usable conversion entry: fall back to the raw leaf name.
            good_name = leaf.name

        # Raw string so the bracket escapes reach ``re`` untouched.
        leaf.good_name = re.sub(r"[ |\t,:)(;\n\]\[]+", "_", good_name)

    t.dist = 0

    # Collapse clades whose leaves all belong to the target species.
    node2content = t.get_cached_content()
    target_species = set([target_taxid])

    def is_sp_specific(_node):
        _species = set([_leaf.species for _leaf in node2content[_node]])
        return not (_species - target_species)

    if collapse == 'yes':
        for n in t.get_leaves(is_leaf_fn=is_sp_specific):
            if n.children:
                for ch in n.get_children():
                    ch.detach()
                n.taxid = target_taxid
                # node2content was cached before detaching, so it still
                # lists the leaves that were collapsed into this node.
                n.name = "%s" % ('|'.join(
                    [_lf.name for _lf in node2content[n]]))
                n.good_name = "{%s}" % ('|'.join(
                    [_lf.good_name for _lf in node2content[n]]))

    #set outgroup
    outgroup = t.get_midpoint_outgroup()
    try:
        t.set_outgroup(outgroup)
    except Exception:
        # A single-leaf tree cannot be rooted and carries no events.
        if len(t) == 1:
            return
        raise

    node2content = t.get_cached_content()
    target = str(target_taxid)

    event_lines = []
    for ev in t.get_descendant_evol_events():
        if ev.etype != "S":
            continue

        source_seqs = node2content[ev.node.children[0]]
        ortho_seqs = node2content[ev.node.children[1]]

        sp_1 = set(leaf.taxid for leaf in source_seqs)
        sp_2 = set(leaf.taxid for leaf in ortho_seqs)

        # Make ``source_seqs`` the side holding the target taxid; skip the
        # event when neither side has it.
        if target not in sp_1:
            if target not in sp_2:
                continue
            source_seqs, ortho_seqs = ortho_seqs, source_seqs

        # co_orthologs: target-taxid sequences on the source side
        co_orthologs = sorted(leaf.good_name for leaf in source_seqs
                              if leaf.taxid == target)

        # orthologs: sequence names of the other side, grouped by species
        orthologs = defaultdict(set)
        for leaf in ortho_seqs:
            orthologs[int(leaf.taxid)].add(leaf.good_name)

        if len(co_orthologs) == 1:
            _otype = "one-to-"
        else:
            _otype = "many-to-"

        # items() works on both Python 2 and 3 (iteritems is Python 2 only).
        for sp, orth in orthologs.items():
            if len(orth) == 1:
                otype = _otype + "one"
            else:
                otype = _otype + "many"

            event_lines.append('\t'.join([
                ','.join(co_orthologs), otype,
                str(sp), names[sp], ','.join(sorted(orth)), '\n'
            ]))
    return event_lines
예제 #12
0
def process_file(file):
    """Run the similarity search and phylogenetic step for one query file.

    For the query file ``file`` (looked up under ``dir_step1``):
      1. run DIAMOND blastp against every database in ``list_files``;
      2. collect all hits per query protein;
      3. for proteins that ``analyse_species`` reports as present in more
         than one species with at least one duplication, build a trimmed
         alignment; otherwise store the sorted hit names;
      4. pickle the reduced hits and the similarity groups;
      5. run FastTree on all alignments, root each tree at its midpoint
         and pickle the resulting Newick strings;
      6. remove the temporary alignment file and DIAMOND output directory.

    Args:
        file: Name of the query file; the text before its first "." is
            used as the index for every output name.

    Returns:
        ``[index, nb_phylo, nb_NO_phylo, nb_empty_ali, nb_pbm_tree]`` with
        all counters converted to ``str``.

    Raises:
        ValueError: if the module-level ``phylo_method`` is not one of
            'nj', 'me' or 'ml'.
        SystemExit: if more than 100 FastTree output lines are not trees.
    """
    # FastTree reads at most 5000 characters per line; 4988 alignment
    # columns is the limit used here.
    max_cols = 4988

    ## extract index
    index = file.split('.')[0]

    ## create output directory
    index_dir = out_dir / index
    index_dir.mkdir(parents=True, exist_ok=True)

    ## perform local search against each database
    for file_db in list_files:
        search_output = index + '_' + file_db.replace('.fas', '.gz')
        diamond_cmd = (
            path_diamond + ' blastp --quiet --threads 1'
            + ' --db ' + str(db_dir / file_db.replace('.fas', '.db'))
            + ' --max-target-seqs ' + str(max_per_species)
            + ' --query ' + str(Path('dir_step1') / file)
            + ' --compress 1 --more-sensitive -e ' + str(evalue)
            + ' -o ' + str(index_dir / search_output)
            + ' --outfmt 6 qseqid sseqid qstart qend sstart cigar 2>&1')
        subprocess.check_output(diamond_cmd, shell=True)

    ## get all DIAMOND output files
    tmp_l = [x for x in index_dir.glob('*.gz') if x.is_file()]

    ## get all hits in a dict of list
    all_output = collections.defaultdict(list)
    for out_file in tmp_l:
        with gzip.open(out_file, mode="rt") as f:
            for line in csv.reader(f, delimiter='\t'):
                # key: query name, value: remaining columns of each hit
                all_output[line[0]].append(line[1:])

    ## analyse BLAST hits
    nb_phylo = 0
    nb_NO_phylo = 0
    nb_empty_ali = 0
    all_alis = dict()
    no_phylo = dict()

    for prot in all_output:
        ## variable for reduced list of output
        reduced = list()

        ## get all species hits (initialise with query prot)
        ref_species = dict(all_species)
        ref_species[name_2_sp_phylip_seq[prot][0]] += 1
        all_hits = {prot}
        for ll in all_output[prot]:
            target = ll[0]
            target_sp = name_2_sp_phylip_seq[target][0]
            if target not in all_hits:
                ref_species[target_sp] += 1
                all_hits.add(target)
            # reduce output for pickle (convert all element to integers)
            reduced.append(tuple(int(x) for x in ll[:3]))

        ## analyse species content
        nb_present, nb_dupli = analyse_species(ref_species)

        ## case phylogenetic analysis
        if nb_present > 1 and nb_dupli > 0:
            nb_phylo += 1
            min_start = math.inf
            max_end = 0
            query_seq = name_2_sp_phylip_seq[prot][2]
            # from here on all_hits maps each target to its aligned sequence
            all_hits = dict()
            for ll in all_output[prot]:
                target = ll[0]
                # DIAMOND coordinates are shifted by one to index strings
                qu_start = int(ll[1]) - 1
                qu_end = int(ll[2]) - 1
                ta_start = int(ll[3]) - 1
                cigar = ll[4]

                # first HSP of this target: start from an all-gap sequence
                # as long as the query
                if target not in all_hits:
                    all_hits[target] = '-' * len(query_seq)
                # extract HSP and paste it over the query coordinates
                HSP = extract_HSP(name_2_sp_phylip_seq[target][2], ta_start, cigar)
                all_hits[target] = all_hits[target][:qu_start] + HSP + all_hits[target][qu_end + 1:]
                min_start, max_end = process_location(qu_start, qu_end, min_start, max_end)

            # add query to hits if not there (it happens sometimes when many similar sequences from the same species)
            if prot not in all_hits:
                all_hits[prot] = query_seq

            # find good positions
            good_positions = get_positions(prot, all_hits, trim_thres)
            # save alignment
            if len(good_positions) == 0:
                nb_empty_ali += 1
            else:
                # set membership keeps the trimming linear in sequence
                # length; only the first max_cols columns are kept (longer
                # alignments are very rare anyway)
                keep = set(good_positions)
                nb_cols = min(len(good_positions), max_cols)
                new_ali = [str(len(all_hits)) + '\t' + str(nb_cols)]
                for name, seq in all_hits.items():
                    trimed_seq = [ch for n, ch in enumerate(seq) if n in keep][:nb_cols]
                    new_ali.append(name_2_sp_phylip_seq[name][1] + ''.join(trimed_seq))
                all_alis[prot] = '\n'.join(new_ali)

        ## case NO phylogenetic analysis
        else:
            nb_NO_phylo += 1
            # sort the list of names for further processing
            no_phylo[prot] = sorted(all_hits)

        ## save reduced output
        all_output[prot] = tuple(reduced)

    ## convert DIAMOND output (keys to integers) and save it to file
    all_output = {int(x): t for x, t in all_output.items()}
    output_file = index + '_output.pic'
    utils.save_pickle(out_dir / 'dict_output' / output_file, all_output)

    ## save similarity_ortho groups to file
    blast_ortho_file = index + '_similarity_ortho.pic'
    utils.save_pickle(out_dir / 'dict_similarity_ortho' / blast_ortho_file, no_phylo)

    ## save all alignments to file
    name_ali_file = 'alis_' + index + '.phy'
    all_ref_prot = list()
    # all_ref_prot records the write order so that the n-th FastTree output
    # line can be matched to its reference protein below.
    with open(out_dir / name_ali_file, 'w+') as write_ali:
        for ref_prot, ali in all_alis.items():
            write_ali.write(ali + '\n')
            all_ref_prot.append(ref_prot)

    # free memory
    nb_alis = len(all_alis)
    all_alis = None
    all_output = None

    ## deal with method
    method_flags = {'nj': '-noml -nome', 'me': '-noml', 'ml': ''}
    if phylo_method not in method_flags:
        # previously this surfaced later as an unbound-variable error
        raise ValueError("unknown phylo_method: {!r}".format(phylo_method))
    insert = method_flags[phylo_method]

    ## perform phylogenetic analyses and root trees
    all_trees = dict()
    nb_pbm_tree = 0
    a = subprocess.check_output(path_fasttree + ' -quiet -nosupport -fastest -bionj -pseudo ' + insert + ' -n ' + str(nb_alis) + ' ' + str(Path(out_dir / name_ali_file)) + ' 2>&1', shell=True)
    a3 = a.strip().decode("utf-8").split('\n')
    c = -1
    for line in a3:
        # case the line is in the form 'Ignored unknown character ...'
        if line.startswith('Ign'):
            continue
        c += 1
        if not line.startswith('('):
            nb_pbm_tree += 1
            # security
            if nb_pbm_tree > 100:
                sys.exit("\n            ERROR STEP 2: too many errors in phylogenetic analyses -> stopped\n\n")
            continue
        # import tree in ete3 and root it
        ete_tree = PhyloTree(line)
        mid = ete_tree.get_midpoint_outgroup()
        try:
            ete_tree.set_outgroup(mid)
        except Exception:
            # best effort: a tree that cannot be rooted is saved as it is
            pass
        # save rooted tree under its reference protein name
        all_trees[all_ref_prot[c]] = ete_tree.write()

    ## save trees to file
    tree_file = index + '_trees.pic'
    utils.save_pickle(out_dir / 'dict_trees' / tree_file, all_trees)

    ## clean directory
    # delete ali file
    (out_dir / name_ali_file).unlink()
    # delete Diamond outputs
    shutil.rmtree(index_dir)

    return [index, str(nb_phylo), str(nb_NO_phylo), str(nb_empty_ali), str(nb_pbm_tree)]
예제 #13
0

# Load every labelled tree matched by the glob pattern below and root it at
# its midpoint.
#treefile = '/home/cactuskid/Dropbox/IIB/mergeLineages/yuyo/restricted/hapAndeff/strcutres_and_tcoffeeset_aln_struct.phy_phyml_tree.txtlabels.txt'

folder = '/home/cactuskid/Dropbox/IIB/mergeLineages/yuyo/*/*labels.txt'
#folder = '/home/cactuskid/Dropbox/IIB/mergeLineages/yuyo/*/*/*labels.txt'


def save_obj(obj, name):
    """Pickle *obj* into the file ``name + '.pkl'`` with the highest protocol."""
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    """Return the object unpickled from the file ``name + '.pkl'``."""
    # Pickle streams are binary: the file must be opened in 'rb' mode to
    # match the 'wb' used by save_obj (text mode breaks on Python 3).
    # NOTE: pickle.load executes arbitrary code; only load trusted files.
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)


treefiles = glob.glob(folder)
for treefile in treefiles:
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print(treefile)
    colorSepcies = False
    #alg = '/home/cactuskid/Dropbox/IIB/mergeLineages/phylogeny/hybrid/merged_curate_aln.fasta'
    t = PhyloTree(treefile, sp_naming_function=None)  #, alignment=alg, alg_format="fasta")
    # Calculate the midpoint node
    R = t.get_midpoint_outgroup()
    # and set it as tree outgroup
    t.set_outgroup(R)

    # Reloaded on every iteration, exactly as before, so each tree starts
    # from the on-disk state of both pickles.
    genedict = load_obj('genedict')
    speciescolors = load_obj('colors')
    red = Color('red')
    blue = Color('blue')
def process_tree(treepath):
    """Extract orthology relationships from a single gene tree.

    The tree at *treepath* is loaded, rooted at its midpoint, and every
    speciation ("S") event reported by ``get_descendant_evol_events`` is
    turned into one output line per species found on the orthologous side
    of the split.

    Relies on names defined elsewhere in the module: ``args`` (reads
    ``pairs_table`` and ``conv_table``), ``get_species``, ``ncbi``,
    ``conversion``, ``target_taxid`` and ``collapse``.

    Args:
        treepath: Path of the tree to process; converted with ``str`` and
            stripped of trailing whitespace before use.

    Returns:
        If ``args.pairs_table`` is falsy: a list of event lines. Each line is
        the tab-joined fields (co-orthologs, orthology type, species code,
        orthologs, tree file name, scientific name) followed by a tab and a
        newline.
        If ``args.pairs_table`` is truthy: a tuple
        ``(event_lines, all_ortholgs_pairs)`` where the second item lists
        every (source name, ortholog name) pair of every speciation event.
        When rooting fails, ``[]`` (or ``([], [])`` for a one-leaf tree in
        pairs mode) is returned early.
    """
    treepath = str(treepath).rstrip()
    t = PhyloTree(treepath, sp_naming_function=get_species)
    treefile = os.path.basename(treepath)
    t.dist = 0

    outgroup = t.get_midpoint_outgroup()
    try:
        t.set_outgroup(outgroup)
        t.standardize()
    except Exception:
        # Rooting/standardising failed: report which case we hit on stderr.
        single_leaf = len(t) == 1
        sys.stderr.write(
            treefile + ('len(t) == 1' if single_leaf else 'len(t) != 1') + '\n')
        if not args.pairs_table:
            return []
        if single_leaf:
            return ([], [])
        # Pairs mode with several leaves: fall back to rooting on the first
        # leaf and carry on with the analysis.
        t.set_outgroup(t.get_leaf_names()[0])

    use_conv = args.conv_table
    # Leaf attribute used whenever a sequence label is written out.
    label_attr = 'good_name' if use_conv else 'name'

    names = {}
    for leaf in t:
        # The species code is whatever precedes the first '.' in the name.
        # Computed outside the try so the handler never sees an unbound or
        # stale ``sp``.
        sp = str(leaf.name.split('.')[0])
        leaf.taxid = sp
        try:
            names[sp] = ncbi.get_taxid_translator([sp])[int(sp)]
        except Exception:
            # Best effort: no scientific name could be resolved for this code.
            names[sp] = ''

        if use_conv:
            try:
                good_name = "%s" % (conversion[leaf.name][0])
            except Exception:
                # No usable entry in the conversion table: keep the raw name.
                good_name = leaf.name
            leaf.good_name = good_name

    node2content = t.get_cached_content()
    target_species = set([target_taxid])

    def is_sp_specific(_node):
        # True when every leaf cached under _node belongs to the target.
        _species = set(_leaf.species for _leaf in node2content[_node])
        return _species.issubset(target_species)

    # Collapse each target-only internal node into one pseudo-leaf whose name
    # lists the leaves it used to contain.
    if collapse == 'yes':
        for n in t.get_leaves(is_leaf_fn=is_sp_specific):
            if not n.children:
                continue
            for ch in n.get_children():
                ch.detach()
            n.taxid = target_taxid
            n.name = "{%s}" % ('|'.join(
                [_lf.name for _lf in node2content[n]]))
            if use_conv:
                n.good_name = "{%s}" % ('|'.join(
                    [_lf.good_name for _lf in node2content[n]]))

    all_ortholgs_pairs = []
    event_lines = []

    for ev in t.get_descendant_evol_events():
        if ev.etype != "S":
            continue

        source_seqs = ev.node.children[0]
        ortho_seqs = ev.node.children[1]

        if target_taxid:
            # Orient the split so the side holding the target is the source;
            # skip events where neither side contains it.
            target = str(target_taxid)
            if target not in set(leaf.taxid for leaf in source_seqs):
                if target not in set(leaf.taxid for leaf in ortho_seqs):
                    continue
                source_seqs, ortho_seqs = ortho_seqs, source_seqs

        co_orthologs = sorted(
            getattr(leaf, label_attr) for leaf in source_seqs)

        # Group the orthologous labels by species code.
        orthologs = defaultdict(set)
        for leaf in ortho_seqs:
            sp = str(leaf.name.split('.')[0])
            orthologs[sp].add(getattr(leaf, label_attr))

        _otype = "one-to-" if len(source_seqs) == 1 else "many-to-"

        for sp, orth in orthologs.items():
            otype = _otype + ("one" if len(orth) == 1 else "many")
            event_lines.append('\t'.join([
                ','.join(co_orthologs), otype,
                str(sp), ','.join(sorted(orth)), treefile, names[sp], '\n'
            ]))

        if args.pairs_table:
            # Iterating a node yields its leaves, in tree order (unsorted).
            source_seqs_names = [
                getattr(leaf, label_attr) for leaf in source_seqs]
            ortho_seqs_names = [
                getattr(leaf, label_attr) for leaf in ortho_seqs]
            # Add this event's pairs directly; no need to revisit the
            # products of earlier events.
            all_ortholgs_pairs.extend(
                itertools.product(source_seqs_names, ortho_seqs_names))

    if args.pairs_table:
        return (event_lines, all_ortholgs_pairs)
    return event_lines