def extract_clades(newick_file, processed_newick_out=None):
    """Split a tree into clades and return the node names of each clade.

    The tree is loaded as a ``PhyloTree``, rooted on its midpoint outgroup,
    ladderized and converted to ultrametric form.  It is then handed to
    ``get_pruned_branch`` together with two size predicates.

    Args:
        newick_file: Tree source passed straight to ``PhyloTree``.
        processed_newick_out: Optional output path.  When it is not ``None``
            the pre-processed tree is written there with ``format=1``.

    Returns:
        dict mapping ``"1"``, ``"2"``, ... to the list of node names of one
        branch returned by ``get_pruned_branch``; branches are numbered from
        the longest (by ``len``) to the shortest.
    """
    # preprocess tree
    print("Pre-processing tree ({})".format(newick_file))
    tree = PhyloTree(newick_file)
    tree.set_outgroup(tree.get_midpoint_outgroup())
    tree.ladderize()
    tree.convert_to_ultrametric()
    if processed_newick_out is not None:
        tree.write(format=1, outfile=processed_newick_out)

    # calculate clades
    print("Calling clades ({})".format(newick_file))

    # Size limit for an acceptable clade: 10 or 1/50 of the whole tree,
    # whichever is larger.  Computed once, before any pruning happens.
    max_clade_size = max(10, len(tree) / 50)

    # Both predicates keep the (node, tree) signature they are declared with
    # in the original code, even though ``tree`` is not consulted.
    def condition_discard(node, tree):
        return len(node) < 3

    def condition_ok(node, tree):
        return len(node) < max_clade_size

    branches = get_pruned_branch(tree, tree, condition_discard, condition_ok, [])

    # ``reverse=True`` keeps the sort stable, so equally sized branches stay
    # in the order get_pruned_branch produced them.
    ranked = sorted(branches, key=len, reverse=True)
    return {
        str(rank): [node.name for node in branch]
        for rank, branch in enumerate(ranked, 1)
    }
def yes_choice(tree_file_name, gene, algae_choice):
    """Interactively split one gene tree in two and update the to-do file.

    The tree is loaded and rooted on its midpoint outgroup.  The user is then
    asked (through ``clade_to_tree`` and ``raw_input``) for an optional algae
    clade, an optional outlier group and one monophyletic family.  Two leaf
    lists are derived from those answers and each is handed to ``yesMake``;
    the two names it returns replace ``gene`` in the file named by
    ``sys.argv[2]``.

    Args:
        tree_file_name: Tree file given to ``PhyloTree`` and to ``yesMake``.
        gene: Entry removed from the to-do file; also passed to the first
            ``yesMake`` call.
        algae_choice: Earlier y/n answer.  A value starting with "y" makes the
            function ask for the algae clade.
    """
    t = PhyloTree(tree_file_name)
    t.set_outgroup(t.get_midpoint_outgroup())
    gene_names = t.get_leaf_names()

    # ``startswith`` rather than ``[0]``: an empty answer (plain Enter) is
    # treated as "no" instead of raising IndexError.
    if algae_choice.startswith("y"):
        print("\nFirst, let's define the algae clade.")
        algae_list = clade_to_tree(t)
    else:
        algae_list = []

    outlier_choice = raw_input(
        "\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)"
    )
    if outlier_choice.startswith("y"):
        print(
            "\nLet's define the outlier group. \nFirst you can add any spceices that are in a monophyletic clade"
        )
        outlier_list = clade_to_tree(t)
        other_copies = raw_input(
            "If there are other genes in the outlier group, enter them here, separated by a space, or else enter n."
        )
        if other_copies != "n":
            # split() without an argument ignores repeated/trailing blanks,
            # so no empty gene name ends up in the list.
            outlier_list = outlier_list + other_copies.split()
    else:
        outlier_list = []

    print(
        "\nSelect one monophyletic family to define. \nYou will have a later chance to split this family again if needed."
    )
    group_list = clade_to_tree(t)
    extras = algae_list + outlier_list

    ###tree1
    # every leaf outside the chosen family, plus algae and outliers
    group_set = set(group_list)
    cut_list = [name for name in gene_names if name not in group_set] + extras
    gene1 = yesMake(cut_list, gene, tree_file_name)

    ###tree2
    # every leaf not in the first list, plus algae and outliers
    cut_set = set(cut_list)
    cut_list1 = [name for name in gene_names if name not in cut_set] + extras
    gene2 = yesMake(cut_list1, gene1, tree_file_name)

    # Replace ``gene`` by the two new names in the to-do file.
    todo_path = sys.argv[2]
    with open(todo_path, "r") as f:
        todo_list = [line.rstrip() for line in f]
    todo_list = [entry for entry in todo_list if entry != gene]
    todo_list.extend([gene1, gene2])
    with open(todo_path, "w") as todo:
        for entry in todo_list:
            todo.write(entry + "\n")
def yes_choice(tree_file_name, gene, algae_choice):
    """Interactively split one gene tree in two and update the to-do file.

    The tree is loaded and rooted on its midpoint outgroup.  The user is then
    asked (through ``clade_to_tree`` and ``raw_input``) for an optional algae
    clade, an optional outlier group and one monophyletic family.  Two leaf
    lists are derived from those answers and each is handed to ``yesMake``;
    the two names it returns replace ``gene`` in the file named by
    ``sys.argv[2]``.

    Args:
        tree_file_name: Tree file given to ``PhyloTree`` and to ``yesMake``.
        gene: Entry removed from the to-do file; also passed to the first
            ``yesMake`` call.
        algae_choice: Earlier y/n answer.  A value starting with "y" makes the
            function ask for the algae clade.
    """
    t = PhyloTree(tree_file_name)
    t.set_outgroup(t.get_midpoint_outgroup())
    gene_names = t.get_leaf_names()

    # ``startswith`` rather than ``[0]``: an empty answer (plain Enter) is
    # treated as "no" instead of raising IndexError.
    if algae_choice.startswith("y"):
        print("\nFirst, let's define the algae clade.")
        algae_list = clade_to_tree(t)
    else:
        algae_list = []

    outlier_choice = raw_input("\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)")
    if outlier_choice.startswith("y"):
        print("\nLet's define the outlier group. \nFirst you can add any spceices that are in a monophyletic clade")
        outlier_list = clade_to_tree(t)
        other_copies = raw_input("If there are other genes in the outlier group, enter them here, separated by a space, or else enter n.")
        if other_copies != "n":
            # split() without an argument ignores repeated/trailing blanks,
            # so no empty gene name ends up in the list.
            outlier_list = outlier_list + other_copies.split()
    else:
        outlier_list = []

    print("\nSelect one monophyletic family to define. \nYou will have a later chance to split this family again if needed.")
    group_list = clade_to_tree(t)
    extras = algae_list + outlier_list

    ###tree1
    # every leaf outside the chosen family, plus algae and outliers
    group_set = set(group_list)
    cut_list = [name for name in gene_names if name not in group_set] + extras
    gene1 = yesMake(cut_list, gene, tree_file_name)

    ###tree2
    # every leaf not in the first list, plus algae and outliers
    cut_set = set(cut_list)
    cut_list1 = [name for name in gene_names if name not in cut_set] + extras
    gene2 = yesMake(cut_list1, gene1, tree_file_name)

    # Replace ``gene`` by the two new names in the to-do file.
    todo_path = sys.argv[2]
    with open(todo_path, "r") as f:
        todo_list = [line.rstrip() for line in f]
    todo_list = [entry for entry in todo_list if entry != gene]
    todo_list.extend([gene1, gene2])
    with open(todo_path, "w") as todo:
        for entry in todo_list:
            todo.write(entry + "\n")
def calculate_nodes(self):
    """Score every internal node of the tree and store the results.

    The tree in ``self.tree_in`` (with the FASTA alignment ``self.align_in``)
    is rooted on its midpoint outgroup.  When ``self.position_matrix`` is
    ``None`` it is computed from the feature tables first.  Each internal
    node then receives a ``node_score`` and a ``node_haplotype`` feature and,
    when ``self.compute_logos == "Y"``, a ``node_haplotype_matrix`` and a
    ``node_haplotype_logo`` feature.  The same values are collected in
    dictionaries.

    Side effects:
        Sets ``self.processed_tree``, ``self.node_scores``,
        ``self.node_haplotypes``, ``self.node_haplotype_matrices`` and
        ``self.node_haplotype_logos`` (and ``self.position_matrix`` when it
        was ``None``).  On any error a message is written to stderr and the
        process exits with status 1.
    """
    try:
        tree = PhyloTree(self.tree_in,
                         alignment=self.align_in,
                         alg_format="fasta")
        md = tree.get_midpoint_outgroup()
        tree.set_outgroup(md)

        leaf_deleting_list = set()
        # ``is None`` instead of ``== None``: an array-like matrix would
        # turn ``==`` into an element-wise comparison.
        if self.position_matrix is None:
            uniprot_hit_hash, leaf_deleting_list = fp.retrieve_features(
                self.study_features, self.table_info, self.min_eval,
                self.uniprot_info)
            # If we want to update the features, we have to delete the
            # position matrix (with update method)
            self.position_matrix = fp.get_positions_matrix(
                uniprot_hit_hash, tree)

        # Take a snapshot of the leaves before deleting: removing nodes while
        # iter_leaves() is still walking the tree could skip leaves.
        for leaf in list(tree.iter_leaves()):
            if leaf.name in leaf_deleting_list:
                leaf.delete()

        node_number = 0
        node_scores = {}
        node_haplotypes = {}
        node_haplotype_matrices = {}
        node_haplotype_logos = {}

        for index, node in enumerate(tree.traverse("preorder")):
            node._nid = index
            if node.is_leaf():
                continue

            node_sequence_matrix = fp.annotated_sequence_extractor(
                node, self.position_matrix, self.differentiate_gaps)

            node_score = round(
                fp.calculate_node_score(node_sequence_matrix, self.calc_alg),
                2)
            node.add_feature("node_score", node_score)
            node_scores[node_number] = node_score

            node_haplotype = fp.haplotype_parse(node_sequence_matrix)
            node.add_feature("node_haplotype", node_haplotype)
            node_haplotypes[node_number] = node_haplotype

            if self.compute_logos == "Y":
                node_haplotype_matrix = fp.haplotype_matrix_calculator(
                    node_sequence_matrix)
                node.add_feature("node_haplotype_matrix",
                                 node_haplotype_matrix)
                node_haplotype_matrices[node_number] = node_haplotype_matrix

                if node_haplotype_matrix is not None:
                    # Only the figure of the logo object is kept.
                    node_haplotype_logo = logomaker.Logo(
                        node_haplotype_matrix,
                        color_scheme="dmslogo_funcgroup",
                        show_spines=False).fig
                else:
                    node_haplotype_logo = None
                node.add_feature("node_haplotype_logo", node_haplotype_logo)
                node_haplotype_logos[node_number] = node_haplotype_logo

            # NOTE(review): the counter is advanced once per internal node
            # here; confirm that this matches the intended key numbering.
            node_number += 1

        self.processed_tree = tree
        self.node_scores = node_scores
        self.node_haplotypes = node_haplotypes
        self.node_haplotype_matrices = node_haplotype_matrices
        self.node_haplotype_logos = node_haplotype_logos
    except Exception as exc:
        # ``Exception`` (not a bare except) so Ctrl-C and sys.exit() raised
        # further down are not swallowed; the cause is reported as well.
        sys.stderr.write("Error at calculating nodes.\n")
        sys.stderr.write("{}: {}\n".format(type(exc).__name__, exc))
        sys.exit(1)
    return
""" from ete3 import PhyloTree sample_list = [ r"D:\Users\suuser\Desktop\ADRB2_Trimmed.txt", r"D:\Users\suuser\Desktop\ACM2_Trimmed.txt", r"D:\Users\suuser\Desktop\AA2AR_Trimmed.txt", r"D:\Users\suuser\Desktop\OPRM_Trimmed.txt" ] file_name = ["ADRB2", "ACM2", "AA2AR", "OPRM"] for sample in sample_list: idx = sample_list.index(sample) file = file_name[idx] t = open(sample, "r") line = t.readline() tree = PhyloTree(line) R = tree.get_midpoint_outgroup() tree.set_outgroup(R) #for rooting of the tree for node in tree.traverse(): leaves = node.get_children() #print (leaves) if len(leaves) < 2 or node.is_leaf(): continue #leaf1=leaves[0].get_leaves() #print (leaf1) leaf2 = leaves[1].get_leaves() for i in leaf2: leaf_node = str(i).split("|")[-1] print(leaf_node) if leaf_node == "9606": node.swap_children() break
def pre_prune(gene):
    """Interactively split a gene tree into monophyletic gene copies.

    The tree ``<gene>/<gene>.3.fa.tre`` is copied into a working directory
    ``<gene>_all100``.  Every working tree is shown to the user, who may split
    off one monophyletic clade at a time.  Each clade that is split off is
    written to its own ``<gene>_all<N>`` directory (N = 101, 102, ...) and is
    itself offered for further splitting later on.  The names of all working
    directories are appended, one per line, to the file named by
    ``sys.argv[1]``.

    Args:
        gene: Gene name, used as directory name and as file-name stem.
    """

    def make_dir(path):
        # Create the directory without going through a shell, so unusual
        # characters in the name are never interpreted as shell syntax.
        if not os.path.isdir(path):
            os.makedirs(path)

    def tree_file(name):
        return "{}/{}.3.fa.tre".format(name, name)

    def ask_sister_groups():
        algae = raw_input("\nIs there an algae group that is sister to all shown families? (y/n)")
        outlier = raw_input("\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)")
        return algae, outlier

    full_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
    m = 100
    start_gene = "{}_all{}".format(gene, str(m))
    make_dir(start_gene)
    full_tree.write(format=1, outfile=tree_file(start_gene))
    m = m + 1

    # ``l`` grows while it is being iterated: every clade that is split off
    # is appended and therefore visited by a later pass of this loop.
    l = [start_gene]
    for item in l:
        full_tree = PhyloTree(tree_file(item))
        view_rooted_tree(full_tree)
        print("Tree for {}".format(item))
        # ``startswith`` rather than ``[0]`` throughout: an empty answer
        # (plain Enter) counts as "no" instead of raising IndexError.
        c = raw_input("Split off a monophyletic gene copy? (y/n)")
        if c.startswith("y"):
            algae_choice, outlier_choice = ask_sister_groups()

        while c.startswith("y"):
            if algae_choice.startswith("y"):
                print("\nFirst, let's define the algae clade.")
                algae_list = clade_to_tree(full_tree)
            else:
                algae_list = []

            if outlier_choice.startswith("y"):
                print("\nLet's define the outlier group. ")
                outlier_list = []
                out_choice = raw_input("\nIs there a monophyletic clade in the outlier group? (y/n)")
                while out_choice.startswith("y"):
                    outlier_list = outlier_list + clade_to_tree(full_tree)
                    out_choice = raw_input("\nIs there another monopyletic clade to add to the outlier group? (y/n)")
                other_choice = raw_input("Are there additional genes in the outlier group? (y/n)")
                while other_choice.startswith("y"):
                    other_copies = raw_input("\nEnter genes to include, separated by a space. Enter only up to ten genes at a time.")
                    # str.split never raises ValueError, so the former
                    # except-branch was unreachable and has been dropped.
                    # split() without an argument also ignores extra blanks.
                    outlier_list = outlier_list + other_copies.split()
                    other_choice = raw_input("Are there more genes to enter? (y/n)")
            else:
                outlier_list = []

            tree1 = PhyloTree(tree_file(item))
            tree1.set_outgroup(tree1.get_midpoint_outgroup())
            print("\nFor the monophyletic gene copy:")
            group_list = clade_to_tree(tree1)
            group_list = group_list + algae_list + outlier_list
            gene_names = tree1.get_leaf_names()

            if len(group_list) == len(gene_names):
                # The selection covers the whole tree: nothing can be split.
                c1 = raw_input("\nList includes all copies on tree.\nMake gene with all copies? (y/n)")
                if c1 == "y":
                    c = "n"
                else:
                    print("\nGroup crosses root. Unable to make group.\nChoose new group.")
                    c = "y"
            else:
                b = "{}_all{}".format(gene, str(m))
                group_set = set(group_list)
                cut_list = [name for name in gene_names if name not in group_set]
                cut_list = cut_list + algae_list + outlier_list
                make_dir(b)

                # the clade that was split off goes to its own directory
                tree2 = PhyloTree(tree_file(item))
                tree2.set_outgroup(tree2.get_midpoint_outgroup())
                tree2.prune(group_list, preserve_branch_length=True)
                tree2.write(format=1, outfile=tree_file(b))

                # what remains replaces the current working tree
                tree1.prune(cut_list, preserve_branch_length=True)
                tree1.write(format=1, outfile=tree_file(item))

                # Register the new directory only now that its tree exists;
                # registering it earlier queued directories without a tree
                # file (and queued the same name twice after a retry).
                l.append(b)
                m = m + 1

                print("\nTree now looks like this.")
                view_rooted_tree(tree1)
                c = raw_input("Split off a monophyletic clade? (y/n)")
                if c.startswith("y"):
                    algae_choice, outlier_choice = ask_sister_groups()

    with open(sys.argv[1], "a") as p:
        for i in l:
            p.write(i + "\n")
# libraries from ete3 import PhyloTree # read tree from file phy = PhyloTree("adar_hol.01.iqt.contree.newick") # assign species names to tree phy.set_species_naming_function(lambda node: node.name.split("_")[0]) for n in phy.get_leaves(): print("node:", n.name, "Species name:", n.species) # root tree phy_outgroup = phy.get_midpoint_outgroup() phy.set_outgroup(phy_outgroup) # find evolutionary events evev = phy.get_descendant_evol_events(sos_thr=0.9) for ev in evev: if ev.etype == "S": print(ev.orthologs) # find evolutionary events evev = phy.get_descendant_evol_events(sos_thr=0.9) # all events for ev in evev: print(ev.etype, ','.join(ev.in_seqs), "<====>", ','.join(ev.out_seqs)) # all events involving either Hsap or Drer fseqs = lambda slist: [
N = AttrFace("name", fsize=34, fgcolor="black") N.margin_left = 20 N.margin_right = 20 faces.add_face_to_node(N, node, column=0) t = sys.argv[1] alg = sys.argv[2] predic_table = sys.argv[3] out_img_name = sys.argv[4] #bilaterian_desc = ncbi.get_descendant_taxa('Bilateria', collapse_subspecies=True) #print type(bilaterian_desc) tree_upp = PhyloTree(newick=t, alignment=alg, alg_format="fasta") R = tree_upp.get_midpoint_outgroup() tree_upp.set_outgroup(R) predict_name = {} for line in open(predic_table): if line.rstrip() and not line.startswith("#"): prot_name = line.split("\t")[0] pred_name = line.split("\t")[5] predict_name[prot_name] = pred_name ts = TreeStyle() ts.show_leaf_name = False ts.tree_width = 2000 ts.layout_fn = layout
protein_annotations[s] = 'b' else: protein_annotations[s] = 'c' # load in the tree t = PhyloTree('temp/queryCOG.hmmhits.fasta.aln.trimal.treefile') # try to root with largest paralog clade try: m = 0 for clade in t.get_monophyletic(values=["c", "b"], target_attr="annot"): if len([l for l in clade.get_leaves()]) > m: rooting_clade = clade m = len([l for l in clade.get_leaves()]) t.set_outgroup(clade) except: midpoint = t.get_midpoint_outgroup() if midpoint: t.set_outgroup(midpoint) annotate_leaf_proteins(t) # find the clade with the mapped sequence for l in t.iter_descendants(): if l.name == mapped_seq: clade = l n = 0 while n == 0: if clade.up: annotations = [ leaf.annot for sister in clade.get_sisters() for leaf in sister.get_leaves() ] counts = dict(Counter(annotations)) if 'c' not in counts.keys():
def pre_prune(gene):
    """Interactively split a gene tree into monophyletic gene copies.

    The tree ``<gene>/<gene>.3.fa.tre`` is copied into a working directory
    ``<gene>_all100``.  Every working tree is shown to the user, who may split
    off one monophyletic clade at a time.  Each clade that is split off is
    written to its own ``<gene>_all<N>`` directory (N = 101, 102, ...) and is
    itself offered for further splitting later on.  The names of all working
    directories are appended, one per line, to the file named by
    ``sys.argv[1]``.

    Args:
        gene: Gene name, used as directory name and as file-name stem.
    """

    def make_dir(path):
        # Create the directory without going through a shell, so unusual
        # characters in the name are never interpreted as shell syntax.
        if not os.path.isdir(path):
            os.makedirs(path)

    def tree_file(name):
        return "{}/{}.3.fa.tre".format(name, name)

    def ask_sister_groups():
        algae = raw_input(
            "\nIs there an algae group that is sister to all shown families? (y/n)"
        )
        outlier = raw_input(
            "\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)"
        )
        return algae, outlier

    full_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
    m = 100
    start_gene = "{}_all{}".format(gene, str(m))
    make_dir(start_gene)
    full_tree.write(format=1, outfile=tree_file(start_gene))
    m = m + 1

    # ``l`` grows while it is being iterated: every clade that is split off
    # is appended and therefore visited by a later pass of this loop.
    l = [start_gene]
    for item in l:
        full_tree = PhyloTree(tree_file(item))
        view_rooted_tree(full_tree)
        print("Tree for {}".format(item))
        # ``startswith`` rather than ``[0]`` throughout: an empty answer
        # (plain Enter) counts as "no" instead of raising IndexError.
        c = raw_input("Split off a monophyletic gene copy? (y/n)")
        if c.startswith("y"):
            algae_choice, outlier_choice = ask_sister_groups()

        while c.startswith("y"):
            if algae_choice.startswith("y"):
                print("\nFirst, let's define the algae clade.")
                algae_list = clade_to_tree(full_tree)
            else:
                algae_list = []

            if outlier_choice.startswith("y"):
                print("\nLet's define the outlier group. ")
                outlier_list = []
                out_choice = raw_input(
                    "\nIs there a monophyletic clade in the outlier group? (y/n)"
                )
                while out_choice.startswith("y"):
                    outlier_list = outlier_list + clade_to_tree(full_tree)
                    out_choice = raw_input(
                        "\nIs there another monopyletic clade to add to the outlier group? (y/n)"
                    )
                other_choice = raw_input(
                    "Are there additional genes in the outlier group? (y/n)")
                while other_choice.startswith("y"):
                    other_copies = raw_input(
                        "\nEnter genes to include, separated by a space. Enter only up to ten genes at a time."
                    )
                    # str.split never raises ValueError, so the former
                    # except-branch was unreachable and has been dropped.
                    # split() without an argument also ignores extra blanks.
                    outlier_list = outlier_list + other_copies.split()
                    other_choice = raw_input(
                        "Are there more genes to enter? (y/n)")
            else:
                outlier_list = []

            tree1 = PhyloTree(tree_file(item))
            tree1.set_outgroup(tree1.get_midpoint_outgroup())
            print("\nFor the monophyletic gene copy:")
            group_list = clade_to_tree(tree1)
            group_list = group_list + algae_list + outlier_list
            gene_names = tree1.get_leaf_names()

            if len(group_list) == len(gene_names):
                # The selection covers the whole tree: nothing can be split.
                c1 = raw_input(
                    "\nList includes all copies on tree.\nMake gene with all copies? (y/n)"
                )
                if c1 == "y":
                    c = "n"
                else:
                    print(
                        "\nGroup crosses root. Unable to make group.\nChoose new group."
                    )
                    c = "y"
            else:
                b = "{}_all{}".format(gene, str(m))
                group_set = set(group_list)
                cut_list = [
                    name for name in gene_names if name not in group_set
                ]
                cut_list = cut_list + algae_list + outlier_list
                make_dir(b)

                # the clade that was split off goes to its own directory
                tree2 = PhyloTree(tree_file(item))
                tree2.set_outgroup(tree2.get_midpoint_outgroup())
                tree2.prune(group_list, preserve_branch_length=True)
                tree2.write(format=1, outfile=tree_file(b))

                # what remains replaces the current working tree
                tree1.prune(cut_list, preserve_branch_length=True)
                tree1.write(format=1, outfile=tree_file(item))

                # Register the new directory only now that its tree exists;
                # registering it earlier queued directories without a tree
                # file (and queued the same name twice after a retry).
                l.append(b)
                m = m + 1

                print("\nTree now looks like this.")
                view_rooted_tree(tree1)
                c = raw_input("Split off a monophyletic clade? (y/n)")
                if c.startswith("y"):
                    algae_choice, outlier_choice = ask_sister_groups()

    with open(sys.argv[1], "a") as p:
        for i in l:
            p.write(i + "\n")
def process_tree(treepath):
    ''' processes a tree to extract orthology relationships between
    target taxid and the rest of species, organized by orthology type
    and species code

    The tree is read from ``treepath``; every leaf gets a ``taxid`` (the part
    of its name before the first ".") and a ``good_name`` (looked up in
    ``conversion`` when possible, then sanitised).  When ``collapse`` is
    'yes', subtrees holding only the target species are collapsed into a
    single node.  The tree is rooted on its midpoint outgroup and every
    speciation ("S") event that has the target taxid on one side is reported.

    Returns a list of tab-joined lines (co-orthologs, orthology type, species
    taxid, species name, orthologs, newline), or None when rooting fails on a
    single-leaf tree.
    '''
    treepath = str(treepath).rstrip()
    t = PhyloTree(treepath, sp_naming_function=get_species)

    # traverse all leaves in tree file and get taxid
    for leaf in t:
        tax = int(leaf.name.split(".", 1)[0])
        leaf.taxid = str(tax)
        #rename leaves names
        try:
            good_name = "%s" % (conversion[leaf.name][0])
        except Exception:
            # best effort: keep the original name when no conversion exists
            good_name = leaf.name
        # Raw string: "\]" and "\[" are not valid escapes in a normal string
        # literal.  The compiled pattern is the same.
        leaf.good_name = re.sub(r"[ |\t,:)(;\n\]\[]+", "_", good_name)

    t.dist = 0

    #colapses plat specific
    node2content = t.get_cached_content()
    target_species = set([target_taxid])

    def is_sp_specific(_node):
        # True when every leaf below _node belongs to the target species.
        _species = set([_leaf.species for _leaf in node2content[_node]])
        return not (_species - target_species)

    #traverse only lamprey leaves
    if collapse == 'yes':
        for n in t.get_leaves(is_leaf_fn=is_sp_specific):
            if n.children:
                for ch in n.get_children():
                    ch.detach()
                # the collapsed node carries the joined names of its leaves
                n.taxid = target_taxid
                n.name = "%s" % ('|'.join(
                    [_lf.name for _lf in node2content[n]]))
                n.good_name = "{%s}" % ('|'.join(
                    [_lf.good_name for _lf in node2content[n]]))

    #set outgroup
    outgroup = t.get_midpoint_outgroup()
    try:
        t.set_outgroup(outgroup)
    except Exception:
        # a single-leaf tree cannot be rooted and yields no result
        if len(t) == 1:
            return
        raise

    # the collapse above changed the tree, so the cache is rebuilt
    node2content = t.get_cached_content()
    target = str(target_taxid)
    event_lines = []
    for ev in t.get_descendant_evol_events():
        if ev.etype != "S":
            continue
        source_seqs = node2content[ev.node.children[0]]
        ortho_seqs = node2content[ev.node.children[1]]

        # make ``source_seqs`` the side that holds the target species
        if target in set(leaf.taxid for leaf in source_seqs):
            pass
        elif target in set(leaf.taxid for leaf in ortho_seqs):
            source_seqs, ortho_seqs = ortho_seqs, source_seqs
        else:
            continue

        #co_orthologs is a list with lamprey seed in source_seqs
        co_orthologs = sorted(
            leaf.good_name for leaf in source_seqs if leaf.taxid == target)

        #orthologs is a list of all ortho_seqs names
        orthologs = defaultdict(set)
        for leaf in ortho_seqs:
            orthologs[int(leaf.taxid)].add(leaf.good_name)

        if len(co_orthologs) == 1:
            _otype = "one-to-"
        else:
            _otype = "many-to-"

        # items() works on both Python 2 and Python 3
        for sp, orth in orthologs.items():
            if len(orth) == 1:
                otype = _otype + "one"
            else:
                otype = _otype + "many"
            event_lines.append('\t'.join([
                ','.join(co_orthologs), otype,
                str(sp), names[sp], ','.join(sorted(orth)), '\n'
            ]))
    return event_lines
def process_file(file):
    """Run the similarity search and tree building for one query file.

    Steps, in order:
      1. DIAMOND blastp of ``dir_step1/<file>`` against every database in
         ``list_files``; results go to a per-index directory.
      2. All hits are loaded and, per query protein, the species content is
         analysed with ``analyse_species``.
      3. Proteins with more than one species present and at least one
         duplication get a trimmed alignment; all others are recorded as
         similarity-only groups.
      4. Reduced hits, similarity groups and rooted trees (built with
         FastTree and rooted with ete3) are pickled under ``out_dir``.
      5. The alignment file and the DIAMOND outputs are deleted.

    Args:
        file: Name of the query file inside ``dir_step1``; the part before
            the first "." is used as index for all output names.

    Returns:
        ``[index, nb_phylo, nb_NO_phylo, nb_empty_ali, nb_pbm_tree]`` with the
        counters converted to strings.

    Raises:
        ValueError: if ``phylo_method`` is not 'nj', 'me' or 'ml'.
        subprocess.CalledProcessError: if DIAMOND or FastTree fails.
    """
    # FastTree limitation (5000 characters per line)
    max_ali_length = 4988

    ## extract index
    index = file.split('.')[0]

    ## create output directory
    index_dir = out_dir / index
    index_dir.mkdir(parents=True, exist_ok=True)

    ## perform local search against each database
    # Arguments are passed as a list (no shell), so paths containing blanks
    # or shell metacharacters reach DIAMOND unchanged.  stderr is merged into
    # stdout, as the former "2>&1" did.
    query_path = str(Path('dir_step1') / file)
    for file_db in list_files:
        search_output = index + '_' + file_db.replace('.fas', '.gz')
        subprocess.check_output(
            [path_diamond, 'blastp', '--quiet', '--threads', '1',
             '--db', str(db_dir / file_db.replace('.fas', '.db')),
             '--max-target-seqs', str(max_per_species),
             '--query', query_path,
             '--compress', '1', '--more-sensitive',
             '-e', str(evalue),
             '-o', str(index_dir / search_output),
             '--outfmt', '6', 'qseqid', 'sseqid', 'qstart', 'qend',
             'sstart', 'cigar'],
            stderr=subprocess.STDOUT)

    ## get all hits in a dict of list
    all_output = collections.defaultdict(list)
    for out_file in index_dir.glob('*.gz'):
        if not out_file.is_file():
            continue
        with gzip.open(out_file, mode="rt") as f:
            # DIAMOND tabular output (--outfmt 6) is tab-separated
            for line in csv.reader(f, delimiter='\t'):
                # save output
                all_output[line[0]].append(line[1:])

    ## analyse BLAST hits
    nb_phylo = 0
    nb_NO_phylo = 0
    nb_empty_ali = 0
    all_alis = dict()
    no_phylo = dict()
    for prot in all_output:
        ## variable for reduced list of output
        reduced = list()

        ## get all species hits (initialise with query prot)
        ref_species = dict(all_species)
        ref_species[name_2_sp_phylip_seq[prot][0]] += 1
        all_hits = {prot}
        for ll in all_output[prot]:
            target = ll[0]
            if target not in all_hits:
                ref_species[name_2_sp_phylip_seq[target][0]] += 1
                all_hits.add(target)
            # reduce output for pickle (convert all element to integers)
            reduced.append(tuple(int(x) for x in ll[:3]))

        ## analyse species content
        nb_present, nb_dupli = analyse_species(ref_species)

        ## case phylogenetic analysis
        if nb_present > 1 and nb_dupli > 0:
            nb_phylo += 1
            min_start = math.inf
            max_end = 0
            query_length = len(name_2_sp_phylip_seq[prot][2])
            all_hits = dict()

            # get all hits for this prot
            for ll in all_output[prot]:
                target = ll[0]
                # coordinates are converted to 0-based positions
                qu_start = int(ll[1]) - 1
                qu_end = int(ll[2]) - 1
                ta_start = int(ll[3]) - 1
                cigar = ll[4]
                if target not in all_hits:
                    # create target seq (all gaps, same length as the query)
                    all_hits[target] = '-' * query_length
                # extract HSP and add to target seq
                HSP = extract_HSP(name_2_sp_phylip_seq[target][2], ta_start, cigar)
                previous = all_hits[target]
                all_hits[target] = previous[:qu_start] + HSP + previous[qu_end + 1:]
                min_start, max_end = process_location(qu_start, qu_end, min_start, max_end)

            # add query to hits if not there (it happens sometimes when many
            # similar sequences from the same species)
            if prot not in all_hits:
                all_hits[prot] = name_2_sp_phylip_seq[prot][2]

            # find good positions
            good_positions = get_positions(prot, all_hits, trim_thres)

            # save alignment
            if len(good_positions) == 0:
                nb_empty_ali += 1
            else:
                # Longer alignments are cut to their first positions (very
                # rare anyway).  Set membership keeps the trimming linear in
                # the sequence length.
                wanted = set(good_positions)
                nb_columns = min(len(good_positions), max_ali_length)
                new_ali = [str(len(all_hits)) + ' ' + str(nb_columns)]
                for name, seq in all_hits.items():
                    trimed_seq = [seq[n] for n in range(len(seq)) if n in wanted]
                    new_ali.append(name_2_sp_phylip_seq[name][1] +
                                   ''.join(trimed_seq[:max_ali_length]))
                all_alis[prot] = '\n'.join(new_ali)

        ## case NO phylogenetic analysis
        else:
            nb_NO_phylo += 1
            # sort the list of names for further processing
            no_phylo[prot] = sorted(all_hits)

        ## save reduced output
        all_output[prot] = tuple(reduced)

    ## convert DIAMOND output (keys to integers) and save it to file
    all_output = {int(x): t for x, t in all_output.items()}
    output_file = index + '_output.pic'
    utils.save_pickle(out_dir / 'dict_output' / output_file, all_output)

    ## save similarity_ortho groups to file
    blast_ortho_file = index + '_similarity_ortho.pic'
    utils.save_pickle(out_dir / 'dict_similarity_ortho' / blast_ortho_file, no_phylo)

    ## save all alignments to file
    name_ali_file = 'alis_' + index + '.phy'
    ali_path = out_dir / name_ali_file
    # order of the alignments in the file == order of the trees FastTree emits
    all_ref_prot = list(all_alis)
    with open(ali_path, 'w') as write_ali:
        for ref_prot in all_ref_prot:
            write_ali.write(all_alis[ref_prot] + '\n')

    # free memory
    nb_alis = len(all_alis)
    all_alis = None
    all_output = None

    ## deal with method
    method_flags = {'nj': ['-noml', '-nome'], 'me': ['-noml'], 'ml': []}
    if phylo_method not in method_flags:
        raise ValueError("unknown phylo_method: {!r}".format(phylo_method))

    ## perform phylogenetic analyses and root trees
    all_trees = dict()
    nb_pbm_tree = 0
    # stderr is merged into stdout (formerly "2>&1"): warnings therefore show
    # up between the trees and are filtered out below.
    a = subprocess.check_output(
        [path_fasttree, '-quiet', '-nosupport', '-fastest', '-bionj', '-pseudo']
        + method_flags[phylo_method]
        + ['-n', str(nb_alis), str(ali_path)],
        stderr=subprocess.STDOUT)

    c = -1
    for line in a.strip().decode("utf-8").split('\n'):
        # case the line is in the form 'Ignored unknown character ...'
        if line.startswith('Ign'):
            continue
        # every other line corresponds to one alignment, in file order
        c += 1
        if not line.startswith('('):
            nb_pbm_tree += 1
            # security
            if nb_pbm_tree > 100:
                sys.exit("\n ERROR STEP 2: too many errors in phylogenetic analyses -> stopped\n\n")
            continue
        # import tree in ete3 and root it
        ete_tree = PhyloTree(line)
        mid = ete_tree.get_midpoint_outgroup()
        try:
            ete_tree.set_outgroup(mid)
        except Exception:
            # best effort: a tree that cannot be rooted is saved as it is
            pass
        # get reference protein name and save rooted tree
        all_trees[all_ref_prot[c]] = ete_tree.write()

    ## save trees to file
    tree_file = index + '_trees.pic'
    utils.save_pickle(out_dir / 'dict_trees' / tree_file, all_trees)

    ## clean directory
    # delete ali file
    ali_path.unlink()
    # delete Diamond outputs
    shutil.rmtree(index_dir)

    return [index, str(nb_phylo), str(nb_NO_phylo), str(nb_empty_ali), str(nb_pbm_tree)]
#load a tree and associated alignment
#treefile = '/home/cactuskid/Dropbox/IIB/mergeLineages/yuyo/restricted/hapAndeff/strcutres_and_tcoffeeset_aln_struct.phy_phyml_tree.txtlabels.txt'
folder = '/home/cactuskid/Dropbox/IIB/mergeLineages/yuyo/*/*labels.txt'
#folder = '/home/cactuskid/Dropbox/IIB/mergeLineages/yuyo/*/*/*labels.txt'
treefiles = glob.glob(folder)


def save_obj(obj, name):
    """Pickle ``obj`` into ``<name>.pkl`` with the highest protocol."""
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    """Return the object pickled in ``<name>.pkl``."""
    # Binary mode, matching save_obj: the highest pickle protocol is a
    # binary format and cannot be read reliably through a text-mode handle.
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)


# Helpers and lookup tables do not depend on the tree file, so they are
# defined and loaded once, before the loop.
genedict = load_obj('genedict')
speciescolors = load_obj('colors')
red = Color('red')
blue = Color('blue')

for treefile in treefiles:
    print(treefile)
    colorSepcies = False
    #alg = '/home/cactuskid/Dropbox/IIB/mergeLineages/phylogeny/hybrid/merged_curate_aln.fasta'
    t = PhyloTree(
        treefile,
        sp_naming_function=None)  #, alignment=alg, alg_format="fasta")
    # Calculate the midpoint node
    R = t.get_midpoint_outgroup()
    # and set it as tree outgroup
    t.set_outgroup(R)
def process_tree(treepath):
    ''' processes a tree to extract orthology relationships between
    target taxid and the rest of species, organized by orthology type
    and species code

    The tree is read from ``treepath`` and rooted on its midpoint outgroup.
    Every leaf gets a ``taxid`` (the part of its name before the first ".")
    and, with ``args.conv_table``, a ``good_name``.  When ``collapse`` is
    'yes', subtrees holding only the target species are collapsed into one
    node.  Every speciation ("S") event is then reported; when
    ``target_taxid`` is set, only events with that taxid on one side.

    Returns ``(event_lines, ortholog_pairs)`` when ``args.pairs_table`` is
    set, otherwise ``event_lines`` alone.  If rooting fails, the result is
    empty, except that with ``args.pairs_table`` a tree with several leaves
    is re-rooted on its first leaf and processed anyway.
    '''
    treepath = str(treepath).rstrip()
    t = PhyloTree(treepath, sp_naming_function=get_species)
    treefile = os.path.basename(treepath)
    t.dist = 0

    outgroup = t.get_midpoint_outgroup()
    try:
        t.set_outgroup(outgroup)
        t.standardize()
    except Exception:
        single_leaf = len(t) == 1
        sys.stderr.write(
            treefile + ('len(t) == 1' if single_leaf else 'len(t) != 1') + '\n')
        if not args.pairs_table:
            return []
        if single_leaf:
            return ([], [])
        # fall back to rooting on the first leaf and carry on
        t.set_outgroup(t.get_leaf_names()[0])

    def label(leaf):
        # name used in the output: converted name when a table was given
        return leaf.good_name if args.conv_table else leaf.name

    names = {}
    for leaf in t:
        sp = str(leaf.name.split('.')[0])
        leaf.taxid = sp
        try:
            names[sp] = ncbi.get_taxid_translator([sp])[int(sp)]
        except Exception:
            # unknown or non-numeric taxid: no scientific name
            names[sp] = ''
        if args.conv_table:
            try:
                good_name = "%s" % (conversion[leaf.name][0])
            except Exception:
                good_name = leaf.name
            leaf.good_name = good_name

    node2content = t.get_cached_content()
    target_species = set([target_taxid])

    def is_sp_specific(_node):
        # True when every leaf below _node belongs to the target species.
        _species = set([_leaf.species for _leaf in node2content[_node]])
        return not (_species - target_species)

    #traverse only target taxid leaves
    if collapse == 'yes':
        for n in t.get_leaves(is_leaf_fn=is_sp_specific):
            if n.children:
                for ch in n.get_children():
                    ch.detach()
                # the collapsed node carries the joined names of its leaves
                n.taxid = target_taxid
                n.name = "{%s}" % ('|'.join(
                    [_lf.name for _lf in node2content[n]]))
                if args.conv_table:
                    n.good_name = "{%s}" % ('|'.join(
                        [_lf.good_name for _lf in node2content[n]]))

    all_ortholgs_pairs = []
    event_lines = []
    for ev in t.get_descendant_evol_events():
        if ev.etype != "S":
            continue
        source_seqs = ev.node.children[0]
        ortho_seqs = ev.node.children[1]

        if target_taxid:
            # make ``source_seqs`` the side that holds the target species
            target = str(target_taxid)
            if target in set(leaf.taxid for leaf in source_seqs):
                pass
            elif target in set(leaf.taxid for leaf in ortho_seqs):
                source_seqs, ortho_seqs = ortho_seqs, source_seqs
            else:
                continue

        co_orthologs = sorted(label(leaf) for leaf in source_seqs)

        orthologs = defaultdict(set)
        for leaf in ortho_seqs:
            orthologs[str(leaf.name.split('.')[0])].add(label(leaf))

        if len(source_seqs) == 1:
            _otype = "one-to-"
        else:
            _otype = "many-to-"

        for sp, orth in orthologs.items():
            if len(orth) == 1:
                otype = _otype + "one"
            else:
                otype = _otype + "many"
            # names.get: the prefix of a collapsed node's name is not a
            # taxid seen in the leaf loop, so it has no entry in ``names``
            event_lines.append('\t'.join([
                ','.join(co_orthologs), otype,
                str(sp), ','.join(sorted(orth)), treefile,
                names.get(sp, ''), '\n'
            ]))

        if args.pairs_table:
            # every (source, ortholog) name combination of this event
            all_ortholgs_pairs.extend(
                itertools.product([label(leaf) for leaf in source_seqs],
                                  [label(leaf) for leaf in ortho_seqs]))

    if args.pairs_table:
        return (event_lines, all_ortholgs_pairs)
    return event_lines