def main(inDIR, file_ending, branch_len_cutoff, min_taxa, outDIR):
    """Cut long internal branches and write the surviving subtrees.

    Every file in inDIR whose name ends with file_ending is read as a
    newick file holding a single tree on its first line.  The tree is
    split with cut_long_internal_branches() and each resulting subtree
    with at least min_taxa taxa is written to
    outDIR/<name before first dot>_<n>.subtree.

    Trees that come back uncut are NOT copied to outDIR; only subtrees
    are written.

    Parameters:
        inDIR: directory holding the input trees (must be non-empty str).
        file_ending: suffix selecting which files in inDIR are processed.
        branch_len_cutoff: branch length cutoff, converted with float().
        min_taxa: minimum taxa per tree/subtree, converted with int().
        outDIR: directory receiving the .subtree files.

    Raises:
        AssertionError: if no file in inDIR ends with file_ending.
    """
    if inDIR[-1] != "/":
        inDIR += "/"
    if outDIR[-1] != "/":
        outDIR += "/"
    min_taxa = int(min_taxa)
    cutoff = float(branch_len_cutoff)
    filecount = 0
    print("cutting branches longer than %s" % (cutoff,))
    for i in os.listdir(inDIR):
        if not i.endswith(file_ending):
            continue
        print(i)
        filecount += 1
        with open(inDIR + i, "r") as infile:  # only 1 tree in each file
            intree = newick3.parse(infile.readline())

        # The tip count of the original .tre is only used for the log line
        # below, so any failure to read it falls back to the input tree.
        # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
        try:
            with open(inDIR + i[:i.find(".tre")] + ".tre", "r") as infile:
                raw_tree_size = len(
                    get_front_labels(newick3.parse(infile.readline())))
        except Exception:
            # did not refine this round. Use the tree that was just read
            raw_tree_size = len(get_front_labels(intree))

        num_taxa = count_taxa(intree)
        if num_taxa < min_taxa:
            print("Tree has %s less than %s taxa" % (num_taxa, min_taxa))
            continue

        print(".tre: %s tips; %s: %s tips"
              % (raw_tree_size, file_ending, len(get_front_labels(intree))))
        subtrees = cut_long_internal_branches(intree, cutoff)
        if len(subtrees) == 0:
            print("No tree with at least %s taxa" % (min_taxa,))
            continue

        count = 0
        sizes = []  # one "<n_tips>, " entry per written subtree
        for subtree in subtrees:
            if count_taxa(subtree) < min_taxa:
                continue
            if subtree.nchildren == 2:
                # fix bifurcating roots from cutting
                temp, subtree = remove_kink(subtree, subtree)
            count += 1
            outname = outDIR + i.split(".")[0] + "_" + str(count) + ".subtree"
            with open(outname, "w") as outfile:
                outfile.write(newick3.tostring(subtree) + ";\n")
            sizes.append(str(len(subtree.leaves())) + ", ")
        print("%s tree(s) wirtten. Sizes: %s" % (count, "".join(sizes)))
    assert filecount > 0, "No file end with " + file_ending + " in " + inDIR
def taxon_name_subst(original, table=sys.path[0]+"/reverse_taxon_table"):
    """Write a copy of a fasta or tree file with long taxon names.

    The substitution table is a tab-separated file: column one is the
    sequence acronym (any "|" is replaced by "_" before use as a key),
    column two is the full taxon name.  Lines with fewer than two columns
    are skipped.

    The input is treated as fasta when its first line starts with ">",
    otherwise as a one-tree newick file.  Output goes to original+".name".
    For tree files each old/new label pair is also printed.

    Parameters:
        original: path of the fasta or tree file to rename.
        table: path of the substitution table.
    """
    DICT = {}  # key is seq acronym, value is full taxon name
    with open(table, "r") as infile:
        for line in infile:
            spls = line.strip().split("\t")
            if len(spls) > 1:
                DICT[spls[0].replace("|", "_")] = spls[1]

    # startswith() instead of line[0] so an empty file does not raise
    # IndexError here; it is then handled by the tree branch below.
    with open(original, "r") as infile:
        is_fasta = infile.readline().startswith(">")

    if is_fasta:  # for fasta files
        with open(original, "r") as infile:
            with open(original + ".name", "w") as outfile:
                for line in infile:
                    if line.startswith(">"):
                        outfile.write(
                            ">" + get_long_id(line.strip()[1:], DICT) + "\n")
                    else:
                        outfile.write(line)
    else:  # tree file
        with open(original, "r") as infile:
            intree = newick3.parse(infile.readline())
        for leaf in intree.leaves():
            old_label = leaf.label
            leaf.label = get_long_id(old_label, DICT)
            print("%s %s" % (old_label, leaf.label))
        with open(original + ".name", "w") as outfile:
            outfile.write(newick3.tostring(intree) + ";\n")
def main(fasta, treDIR, tree_file_ending, outDIR):
    """Write one fasta file per tree, holding the sequences of its tips.

    All sequences of `fasta` are loaded into memory keyed by sequence name.
    For every file in treDIR ending with tree_file_ending, the first line
    is parsed as a tree and the sequences of its front labels are written
    to outDIR/<clusterID>rr.fa (or <clusterID>_rr.fa when the cluster ID
    itself already ends with "rr").

    Raises:
        AssertionError: if no file in treDIR ends with tree_file_ending.
        KeyError: if a tree label has no sequence in `fasta`.
    """
    treDIR = treDIR if treDIR[-1] == "/" else treDIR + "/"
    outDIR = outDIR if outDIR[-1] == "/" else outDIR + "/"

    print("Reading fasta file %s" % (fasta,))
    # key is seqID, value is seq
    seqDICT = dict((rec.name, rec.seq) for rec in read_fasta_file(fasta))

    print("Writing fasta files")
    tree_files = [name for name in os.listdir(treDIR)
                  if name.endswith(tree_file_ending)]
    for tree_name in tree_files:
        print(tree_name)
        with open(treDIR + tree_name, "r") as infile:
            intree = newick3.parse(infile.readline())

        clusterID = tree_utils.get_clusterID(tree_name)
        suffix = "_rr.fa" if clusterID.endswith("rr") else "rr.fa"
        with open(outDIR + clusterID + suffix, "w") as outfile:
            for label in tree_utils.get_front_labels(intree):
                outfile.write(">" + label + "\n" + seqDICT[label] + "\n")

    assert len(tree_files) > 0,\
        "No file ends with "+tree_file_ending+" found in "+treDIR
def main(treDIR, clnDIR, para, intree_file_ending=INTREE_FILE_ENDING):
    """Mask monophyletic (and optionally paraphyletic) tips of each tree.

    Each .aln-cln file in clnDIR is indexed by its cluster ID.  For every
    tree file in treDIR ending with intree_file_ending, the matching
    alignment is read, the number of characters left in each sequence
    after removing '-', 'X', 'x', '?' and '*' is recorded, and that table
    is passed to the masking functions.  The masked tree is written next
    to the input as <treefile>.mm.

    Parameters:
        para: "y" to also mask paraphyletic tips, "n" otherwise.

    Raises:
        AssertionError: if para is not "y"/"n", if a cluster ID occurs
            twice in clnDIR, or if no tree file matched.
    """
    treDIR = treDIR if treDIR[-1] == "/" else treDIR + "/"
    clnDIR = clnDIR if clnDIR[-1] == "/" else clnDIR + "/"
    assert para in ("y", "n"), "mask paraphyletic tips? (y/n)"
    mask_para = (para == "y")

    filematch = {}  # key is clusterID, value is the .aln-cln file
    for cln_name in os.listdir(clnDIR):
        if not cln_name.endswith(".aln-cln"):
            continue
        clusterID = get_clusterID(cln_name)
        assert clusterID not in filematch, \
            "The clusterID "+clusterID+" repeats in "+clnDIR
        filematch[clusterID] = cln_name

    filecount = 0
    for tree_name in os.listdir(treDIR):
        if not tree_name.endswith(intree_file_ending):
            continue
        with open(treDIR + tree_name, "r") as infile:
            intree = newick3.parse(infile.readline())
        print(tree_name)
        clusterID = get_clusterID(tree_name)
        filecount += 1

        chrDICT = {}  # key is seqid, value is number of unambiguous chrs
        for s in read_fasta_file(clnDIR + filematch[clusterID]):
            for ch in "-Xx?*":  # ignore gaps, xs and Xs
                s.seq = s.seq.replace(ch, "")
            chrDICT[s.name] = len(s.seq)

        curroot = mask_monophyletic_tips(intree, chrDICT)
        if mask_para:
            curroot = mask_paraphyletic_tips(curroot, chrDICT)
        with open(treDIR + tree_name + ".mm", "w") as outfile:
            outfile.write(newick3.tostring(curroot) + ";\n")

    assert filecount > 0, \
        "No file ends with "+intree_file_ending+" found in "+treDIR
def main(treDIR, clnDIR, para, intree_file_ending=INTREE_FILE_ENDING, ignore=GENOMES):
    """Mask tips of every tree in treDIR using its cleaned alignment.

    Each .aln-cln file in clnDIR is indexed by its cluster ID.  For every
    tree file in treDIR ending with intree_file_ending, the first line is
    parsed as a tree and handed to mask() together with the path of the
    matching alignment.  The result is written as <treefile>.mm.

    Parameters:
        treDIR: directory with the input trees; output is written here too.
        clnDIR: directory with the .aln-cln alignments.
        para: "y" to mask paraphyletic tips as well, "n" otherwise.
        intree_file_ending: suffix selecting the tree files.
        ignore: forwarded to mask() as its `ignore` argument.

    Raises:
        AssertionError: if para is not "y"/"n", if a cluster ID occurs
            twice in clnDIR, or if no tree file matched.
        KeyError: if a tree has no matching .aln-cln file.
    """
    if treDIR[-1] != "/":
        treDIR += "/"
    if clnDIR[-1] != "/":
        clnDIR += "/"
    assert para == "y" or para == "n", "mask paraphyletic tips? (y/n)"
    mask_para = (para == "y")

    filematch = {}  # key is clusterID, value is the .aln-cln file
    for i in os.listdir(clnDIR):
        if i.endswith(".aln-cln"):
            clusterID = get_clusterID(i)
            assert clusterID not in filematch, \
                "The clusterID "+clusterID+" repeats in "+clnDIR
            filematch[clusterID] = i

    filecount = 0
    for i in os.listdir(treDIR):
        if not i.endswith(intree_file_ending):
            continue
        with open(treDIR + i, "r") as infile:
            intree = newick3.parse(infile.readline())
        print(i)
        clusterID = get_clusterID(i)
        filecount += 1
        # Forward the caller's `ignore`; passing the GENOMES constant here
        # would silently discard whatever the caller supplied.
        curroot = mask(intree,
                       clnDIR + filematch[clusterID],
                       para=mask_para,
                       ignore=ignore)
        with open(treDIR + i + ".mm", "w") as outfile:
            outfile.write(newick3.tostring(curroot) + ";\n")
    assert filecount > 0, \
        "No file ends with "+intree_file_ending+" found in "+treDIR
def ortho_to_aln(alndir, tredir, outdir, ortho_tree_file_ending=".tre"):
    """Write one alignment file per ortholog tree.

    For each file in tredir ending with ortho_tree_file_ending, the
    alignment alndir/<name before first dot>.fa.mafft.aln is loaded, the
    tree's front labels are collected, and the aligned sequence of every
    label is written to outdir/<tree name with the ending replaced by
    ".aln">.  Sequence IDs are shortened with tree_utils.get_name().

    Raises:
        AssertionError: if no file in tredir has the requested ending.
        KeyError: if a tree label is missing from the alignment.
    """
    alndir = alndir if alndir[-1] == "/" else alndir + "/"
    tredir = tredir if tredir[-1] == "/" else tredir + "/"
    outdir = outdir if outdir[-1] == "/" else outdir + "/"

    ortho_files = [name for name in os.listdir(tredir)
                   if name.endswith(ortho_tree_file_ending)]
    for tree_name in ortho_files:
        print(tree_name)

        # read the alignment into a dictionary: seqID -> seq
        aln_path = alndir + tree_name.split(".")[0] + ".fa.mafft.aln"
        seqDICT = dict((rec.name, rec.seq) for rec in read_fasta_file(aln_path))

        # read in tree tips and write output alignment
        with open(tredir + tree_name, "r") as infile:
            intree = newick3.parse(infile.readline())
        labels = tree_utils.get_front_labels(intree)

        out_path = outdir + tree_name.replace(ortho_tree_file_ending, ".aln")
        with open(out_path, "w") as outfile:
            for lab in labels:
                short_name = tree_utils.get_name(lab)
                sequence = seqDICT[lab]
                outfile.write(">" + short_name + "\n" + sequence + "\n")

    assert len(ortho_files) > 0,\
        "No file ends with "+ortho_tree_file_ending+" was found in "+tredir
def get_121(indir, tree_file_ending, min_taxa, outdir, min_bootstrap=0.0):
    """Copy one-to-one ortholog trees from indir to outdir.

    A tree qualifies when no taxon name repeats among its tips, it has at
    least min_taxa taxa, and (only when min_bootstrap > 0) it passes
    pass_boot_filter().  Qualifying files are copied to
    outdir/<second dot-separated field of the file name>.1to1ortho.tre.

    Parameters:
        indir: directory with the input trees.
        tree_file_ending: suffix selecting the tree files.
        min_taxa: minimum number of taxa, converted with int().
        outdir: destination directory.
        min_bootstrap: minimum average bootstrap, converted with float().

    Raises:
        AssertionError: if no file in indir ends with tree_file_ending.
        IOError/OSError: if a qualifying tree cannot be copied.
    """
    # Local import: copy files directly instead of building a shell
    # command, so paths with spaces or shell metacharacters work and a
    # failed copy is reported instead of being counted as written.
    import shutil

    if indir[-1] != "/":
        indir += "/"
    if outdir[-1] != "/":
        outdir += "/"
    min_taxa = int(min_taxa)
    min_bootstrap = float(min_bootstrap)
    infile_count, outfile_count = 0, 0
    print("Filter one-to-one homologs with average bootstrap of at least %s"
          % (min_bootstrap,))
    for i in os.listdir(indir):
        if not i.endswith(tree_file_ending):
            continue
        infile_count += 1
        with open(indir + i, "r") as infile:  # only 1 tree in each file
            intree = newick3.parse(infile.readline())
        names = get_front_names(intree)
        num_tips, num_taxa = len(names), len(set(names))
        print("number of tips: %s number of taxa: %s" % (num_tips, num_taxa))

        # one-to-one means every tip belongs to a distinct taxon
        if num_tips != num_taxa or num_taxa < min_taxa:
            continue
        if min_bootstrap > 0.0 and not pass_boot_filter(intree, min_bootstrap):
            continue

        print("%s written to out dir" % (i,))
        # NOTE(review): field [1] (not [0]) of the dot-split name is used
        # for the output name; confirm this matches the input naming scheme.
        outname = i.split(".")[1] + ".1to1ortho.tre"
        shutil.copy(indir + i, outdir + outname)
        outfile_count += 1
    assert infile_count > 0,\
        "No file ends with "+tree_file_ending+" was found in "+indir
    print("%s files read, %s written to %s"
          % (infile_count, outfile_count, outdir))
def refine(query_fasta, start_fasta, deep_paralog_cutoff, num_cores):
    """Build a tree from start_fasta, clean it and split it into subtrees.

    Steps, in order:
      1. build a tree from start_fasta with fasta_to_tree.fasta_to_tree();
      2. trim tips (relative cutoff = deep_paralog_cutoff, absolute
         cutoff = twice that);
      3. mask tips using the .pasta.aln-cln alignment if it exists, else
         the .mafft.aln-cln one;
      4. write the cleaned tree to <tree>.tt.mm;
      5. cut long internal branches and, for every subtree with more than
         4 distinct labels that contains at least one query sequence,
         write <base>_<n>.subtree and <base>_<n>.fa next to start_fasta.

    Parameters:
        query_fasta: fasta file whose sequence names identify the queries.
        start_fasta: fasta file the tree is built from.
        deep_paralog_cutoff: branch length cutoff, converted with float().
        num_cores: forwarded to fasta_to_tree().

    Returns:
        List of paths of the .fa files written; empty when no tree could
        be built or nothing was left after masking.
    """
    # NOTE(review): gene_name is not used below; the call is kept because
    # get_filename_from_path is defined outside this block.
    gene_name = get_filename_from_path(query_fasta)[1].split(".")[0]
    outdir, fasta = get_filename_from_path(start_fasta)
    deep_paralog_cutoff = float(deep_paralog_cutoff)
    # built once as a set; it is intersected with every subtree's labels
    query_ids = set(s.name for s in seq.read_fasta_file(query_fasta))
    new_fasta = []  # list of output refined fasta files
    print("%s %s" % (outdir, fasta))

    # make a tree from the start_fasta
    tree = fasta_to_tree.fasta_to_tree(outdir, fasta, num_cores, "aa")
    if tree is None:
        return []
    with open(tree, "r") as infile:
        intree = newick3.parse(infile.readline())
    root = trim_tips.trim(intree,
                          relative_cutoff=deep_paralog_cutoff,
                          absolute_cutoff=deep_paralog_cutoff * 2)

    if os.path.exists(outdir + fasta + ".pasta.aln-cln"):
        clnfile = outdir + fasta + ".pasta.aln-cln"
    else:
        clnfile = outdir + fasta + ".mafft.aln-cln"
    root = mask_tips_by_taxonID_transcripts.mask(root,
                                                 clnfile=clnfile,
                                                 para="y",
                                                 ignore=GENOMES)
    if root is None:
        return new_fasta

    # NOTE(review): unlike the .subtree files below, this tree is written
    # without a trailing ";" -- confirm downstream readers accept that.
    with open(tree + ".tt.mm", "w") as outfile:
        outfile.write(newick3.tostring(root) + "\n")

    subtrees = cut_long_internal_branches.cut_long_internal_branches(
        root, cutoff=deep_paralog_cutoff)
    count = 0
    base_name = fasta.split(".")[0]
    seqDICT = {}  # key is seqid, value is seq
    for s in seq.read_fasta_file(start_fasta):
        seqDICT[s.name] = s.seq

    for subtree in subtrees:
        if subtree is None:
            continue
        labels = tree_utils.get_front_labels(subtree)
        label_set = set(labels)
        if len(label_set) <= 4 or not (label_set & query_ids):
            continue
        count += 1
        out_base = outdir + base_name + "_" + str(count)
        with open(out_base + ".subtree", "w") as outfile:
            outfile.write(newick3.tostring(subtree) + ";\n")
        with open(out_base + ".fa", "w") as outfile:
            for seqid in tree_utils.get_front_labels(subtree):
                # Only a missing sequence is tolerated; I/O errors must
                # not be reported as "not found".
                try:
                    sequence = seqDICT[seqid]
                except KeyError:
                    print("%s not found in fasta file" % (seqid,))
                    continue
                outfile.write(">" + seqid + "\n" + sequence + "\n")
        new_fasta.append(out_base + ".fa")
    return new_fasta
def main(DIR, tree_file_ending, relative_cut, absolute_cut):
    """Trim tips of every matching tree in DIR.

    Each file in DIR ending with tree_file_ending is read as a one-tree
    newick file and passed to trim() with the two cutoffs converted to
    float.  When trim() returns a tree, it is written to <treefile>.tt
    in the same directory.

    Raises:
        AssertionError: if no file in DIR ends with tree_file_ending.
    """
    DIR = DIR if DIR[-1] == "/" else DIR + "/"
    tree_files = [name for name in os.listdir(DIR)
                  if name.endswith(tree_file_ending)]
    for tree_name in tree_files:
        print(tree_name)
        with open(DIR + tree_name, "r") as infile:
            intree = newick3.parse(infile.readline())
        outtree = trim(intree, float(relative_cut), float(absolute_cut))
        if outtree != None:
            with open(DIR + tree_name + ".tt", "w") as outfile:
                outfile.write(newick3.tostring(outtree) + ";\n")
    assert len(tree_files) > 0, \
        "No file end with "+tree_file_ending+" found in "+DIR
def main(DIR,tree_file_ending,relative_cut,absolute_cut1,absolute_cut2):
    """Trim tips of every matching tree in DIR using two absolute cutoffs.

    Each file in DIR ending with tree_file_ending is read as a one-tree
    newick file and passed to trim() with the relative cutoff and both
    absolute cutoffs converted to float.  When trim() returns a tree, it
    is written to <treefile>.tt in the same directory.

    Raises:
        AssertionError: if no file in DIR ends with tree_file_ending.
    """
    DIR = DIR if DIR[-1] == "/" else DIR + "/"
    tree_files = [name for name in os.listdir(DIR)
                  if name.endswith(tree_file_ending)]
    for tree_name in tree_files:
        print(tree_name)
        with open(DIR + tree_name, "r") as infile:
            intree = newick3.parse(infile.readline())
        outtree = trim(intree,
                       float(relative_cut),
                       float(absolute_cut1),
                       float(absolute_cut2))
        if outtree != None:
            with open(DIR + tree_name + ".tt", "w") as outfile:
                outfile.write(newick3.tostring(outtree) + ";\n")
    assert len(tree_files) > 0, \
        "No file end with "+tree_file_ending+" found in "+DIR
if first_line: first_line = False column_labels = parts continue if len(parts) > 1: # print parts[1:] data[parts[0]] = dict(zip(column_labels[1:],[float(p) for p in parts[1:]])) args.node_values[0].close() # load the tree tree = None while tree == None and line != "": line = args.tree[0].readline() try: tree = newick3.parse(StringIO(line)) except AttributeError: continue if tree == None: sys.exit("Could not find a tree in: " + args.tree[0].name) args.tree[0].close() # now we will paint the branches if args.node_values != None and args.label != None: # use values from the node-values file for column_label in args.label: for node in tree.iternodes(): if node.label in data: this_bin = get_bin(data[node.label][column_label], color_bins) if this_bin != None:
help='the tree.') parser.add_argument('-q', '--partitions', type=open, required=True, \ help='the location of the raxml partitions file corresponding to the alignment to be subsampled.') parser.add_argument('-x', '--random-seed', type=int, required=False, \ help='an integer seed for the random number generator function') parser.add_argument('-f', '--reduction-factor', type=float, required=False, \ help='a decimal value specifying how sparse to make the subsampling. the number of taxa that will be subsampled will be reduced proportionally to this value, thus a value of f 0.5 leads to a reduction by (approximately) half') parser.add_argument('-n', '--output-label', required=False, default='', \ help='a label to be attached to output files') args = parser.parse_args() a = Alignment(args.alignment, args.partitions) t = newick3.parse(args.tree) f = args.reduction_factor if 'reduction_factor' in args else 1 s = PhylogeneticSubsampler(alignment=a, tree=t, rates=args.rates, reduction_factor=f) s.subsample() s.write_subsampled_output(args.output_label) s.report_sampled_partitions() print('files have been written to:\n' + s.output_label + '.sampling_matrix.txt\n' + s.output_label + '.phy\n' + s.output_label + '.partitions.txt\n' \ 'sampling proportion is ' + str(s.get_sampling_proportion()))
first_line = False column_labels = parts continue if len(parts) > 1: # print parts[1:] data[parts[0]] = dict( zip(column_labels[1:], [float(p) for p in parts[1:]])) args.node_values[0].close() # load the tree tree = None while tree == None and line != "": line = args.tree[0].readline() try: tree = newick3.parse(StringIO(line)) except AttributeError: continue if tree == None: sys.exit("Could not find a tree in: " + args.tree[0].name) args.tree[0].close() # now we will paint the branches if args.node_values != None and args.label != None: # use values from the node-values file for column_label in args.label: for node in tree.iternodes(): if node.label in data: this_bin = get_bin(data[node.label][column_label], color_bins)
#!/usr/bin/env python
# Script entry point: read a file with one newick tree per line and print
# each tree back with newick3.to_string(..., use_node_labels=False).
if __name__ == '__main__':
    import newick3
    import phylo3
    import sys

    if len(sys.argv) < 2:
        print("usage: print_tip_names <treefile>")
        sys.exit()

    treefname = sys.argv[1]
    # the with-block closes the tree file even if parsing a line fails
    with open(treefname, "r") as treefile:
        for line in treefile:
            tree = newick3.parse(line)
            print(newick3.to_string(tree, use_node_labels=False))
def mask_paraphyletic_tips(curroot, ignore=[]):
    """Prune sister tips that carry the same name as a neighbouring tip.

    The tree is scanned repeatedly.  For every tip that is not the root
    and whose parent is not the root, each sister of the parent is
    checked: if the sister is a tip and the second "_"-separated field of
    get_name(label) matches for both, the sister is pruned.  After a
    prune, a resulting kink is removed with remove_kink() as long as the
    tree still has at least 4 leaves.  Scanning restarts while the last
    pass pruned something and at least 4 leaves remain.

    Parameters:
        curroot: root node of the tree, or None.
        ignore: accepted for interface compatibility; not read here.

    Returns:
        The (possibly replaced) root node; None if curroot was None.
    """
    going = True
    while going and curroot is not None and len(curroot.leaves()) >= 4:
        going = False
        for node in curroot.iternodes():  # walk through nodes
            if not node.istip:
                continue  # only look at tips
            name = get_name(node.label).split("_")[1]
            parent = node.parent
            if node == curroot or parent == curroot or parent is None:
                continue  # no paraphyletic tips for the root
            for para in parent.get_sisters():
                if para.istip and name == get_name(para.label).split("_")[1]:
                    # mask: drop the sister tip; prune() hands back the
                    # node that may now need its kink removed
                    node = para.prune()
                    if len(curroot.leaves()) >= 4:
                        if (node == curroot and node.nchildren == 2) or \
                                (node != curroot and node.nchildren == 1):
                            node, curroot = remove_kink(node, curroot)
                    going = True
                    break
    return curroot


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("usage: python "+sys.argv[0]+" treefile para(y/n)")
        sys.exit()
    # the with-block closes the tree file once its first line is read
    with open(sys.argv[1], "r") as infile:
        intree = newick3.parse(infile.readline())
    masked = mask_monophyletic_tips(intree, ignore=[])
    if sys.argv[2] == "y":
        masked = mask_paraphyletic_tips(masked, ignore=[])
    print(newick3.tostring(masked)+";\n")
db_cursor.execute("SELECT name FROM taxonomy WHERE left_value > ? AND " \ "right_value < ? AND name_class == 'scientific name' AND node_rank LIKE ?", (left_value, right_value, rank.strip())) return [row[0] for row in db_cursor.fetchall()] # for row in db_cursor.fetchall(): # print row # return [] if len(sys.argv) < 4: print("usage: fill_tips_from_taxonomy.py <treefile> <taxonomydb> <outfile>") sys.exit(0) tree = None with open(sys.argv[1],"r") as intree_file: tree = newick3.parse(intree_file.readline()) conn = sqlite3.connect(sys.argv[2]) c = conn.cursor() for tip in tree.leaves(): for child_name in get_child_taxa(tip.label, "genus", c): child = phylo3.Node() child.label = child_name child.istip = True # print child.label tip.add_child(child) # print([t.label for t in tip.children]) with open(sys.argv[3],"w") as outfile: # print(newick3.to_string(tree)+";")
def _indelible_control_text(tree, models, statefreqs, tree_lengths,
                            part_length, tree_label):
    """Return INDELible control-file text with one partition per model."""
    n_parts = len(models)
    lines = [
        "[TYPE] NUCLEOTIDE 2 // nucleotide simulation using algorithm "
        "from method 2.",
        "",
    ]
    for k in range(n_parts):
        lines.append("[MODEL] gtr%d" % (k + 1))
        lines.append("  [submodel] GTR %s" % (models[k],))
        lines.append("  [statefreq] %s" % (statefreqs[k],))
    lines.append("")
    for k in range(n_parts):
        lines.append("[TREE] tree%d %s" % (k + 1, tree))
        lines.append("  [treelength] %s" % (tree_lengths[k],))
    lines.append("")
    for k in range(n_parts):
        lines.append("[PARTITIONS] part%d [tree%d gtr%d %s]"
                     % (k + 1, k + 1, k + 1, part_length))
    lines.append("")
    lines.append("[EVOLVE]")
    for k in range(n_parts):
        lines.append(
            "  part%d 1 %s_part_%d // 1 replicate generated from partition "
            "'part%d' in file '%s_part_%d.fas'"
            % (k + 1, tree_label, k + 1, k + 1, tree_label, k + 1))
    return "\n".join(lines) + "\n"


def simulate_random_rates(tree_label, tree_function, branch_lengths_function):
    """Simulate a partitioned alignment on a random tree with INDELible.

    A tree is generated with tree_function(branch_lengths_function) and
    written to <tree_label>.tre.  A control file "control.txt" is written
    in the current directory with n_parts GTR partitions of part_length
    sites each; partition j+1 uses tree length
    t.subtree_length / (j+1) and shuffled fixed rate/frequency values.
    The "indelible" executable is then run; generation repeats until its
    output contains no "ERROR in [TREE] block".  Finally the per-partition
    *_TRUE.phy alignments are concatenated.

    Reads the module-level name n_tips_per_tree for the alignment header.

    Returns:
        (tree file name, combined alignment file name, partitions file
        name, dict mapping 'p<j>' to sum(rates) * scaled tree length).
    """
    # simulation parameters
    part_length = 500
    n_parts = 10
    aln_length = part_length * n_parts

    # repeat until we get an acceptable tree
    while True:
        # randomly generate a tree
        tree_string = tree_function(branch_lengths_function)
        t = newick3.parse(StringIO(tree_string))

        models = []
        tree_lengths = []
        model_rates = {}
        statefreqs = []
        for j in range(n_parts):
            slowdown_scalar = 1
            scaled_length = t.subtree_length / ((j+1) * slowdown_scalar)
            tree_lengths.append(scaled_length)

            # the shuffle order (rates, then frequencies) is kept so a
            # seeded random generator yields the same sequence of draws
            x = [0.3,0.4,0.5,0.7,0.9,1]
            random.shuffle(x)
            models.append(" ".join([str(v) for v in x]))
            model_rates['p'+str(j)] = sum(x) * scaled_length

            y = [0.15,0.2,0.3,0.35]
            random.shuffle(y)
            statefreqs.append(" ".join([str(v) for v in y]))

        # save the tree topology to a file
        with open(tree_label + ".tre", "w") as tree_file:
            tree_file.write(tree_string)

        # write a control file for indelible
        with open("control.txt","w") as control_file:
            control_file.write(_indelible_control_text(
                tree='('+tree_string.strip(';')+');',
                models=models,
                statefreqs=statefreqs,
                tree_lengths=tree_lengths,
                part_length=part_length,
                tree_label=tree_label))

        # simulate data on the tree
        p = subprocess.Popen("indelible", stdout=subprocess.PIPE)
        r = p.communicate()
        if "ERROR in [TREE] block" not in str(r[0]):
            # there was no error so move on
            break

    # combine the alignments and produce a partitions file
    aln = {}
    for j in range(n_parts):
        with open(tree_label+"_part_"+str(j+1)+"_TRUE.phy","r") as part_file:
            data = read_phylip(part_file)
        for name, seq in data.items():
            if name not in aln:
                aln[name] = ""
            aln[name] += seq

    alignment_file_name = "%s_combined_aln.phy" % tree_label
    with open(alignment_file_name,"w") as alignment_file:
        alignment_file.write(str(n_tips_per_tree) + " " + str(aln_length) + "\n")
        for name, seq in aln.items():
            alignment_file.write(name + " " + seq + "\n")

    partitions_file_name = "%s_combined_part.txt" % tree_label
    with open(partitions_file_name,"w") as partitions_file:
        for j in range(n_parts):
            partitions_file.write("DNA, p{j} = {begin}-{end}\n".format(
                j=j, begin=j*part_length+1, end=j*part_length+part_length))

    # return the tree file, alignment file, and partitions file names
    return (tree_label + ".tre", alignment_file_name, partitions_file_name, model_rates)
if __name__ == "__main__": if len(sys.argv) != 3: print "python cut_long_branches_iter.py inDIR outDIR >log" sys.exit(0) DIR = sys.argv[1] + "/" for i in os.listdir(DIR): #go through fasta files in the input directory if i[-9:] != ".fasttree": continue fasta_file = DIR + i.replace(".aln-cln.fasttree", "") tree_file = DIR + i with open(tree_file, "r") as infile: first_line = infile.readline() #there's only 1 tree in each file if first_line.strip() == "": continue #empty file after trimming intree = newick3.parse(first_line.replace("-", "_")) if count_ingroups(intree) < MIN_INGROUP_TAXA: continue #skip trees with few ingroups ccID = i.split(".")[0] #looks like cc9 trees = [intree] #if intree has no long branches at all jus use the original fasta and alignment longest_branch_length = 0.0 for node in intree.iternodes(): if node != intree: longest_branch_length = max(longest_branch_length, node.length) if longest_branch_length < cutoff2: os.system("cp " + fasta_file + " " + fasta_file.replace(".fa", ".to_prank.fa")) os.system("cp " + fasta_file + ".aln-cln " + fasta_file.replace(".fa", ".to_prank.fa.aln-cln"))
if (len(bipart0[0].intersection(bipart1[0])) > 0 and len(bipart1[1].intersection(bipart0[0])) == 0 and \ len(bipart0[1].intersection(bipart1[1])) > 0 and len(bipart1[0].intersection(bipart0[1])) == 0) or \ (len(bipart0[0].intersection(bipart1[1])) > 0 and len(bipart1[0].intersection(bipart0[0])) == 0 and \ len(bipart0[1].intersection(bipart1[0])) > 0 and len(bipart1[1].intersection(bipart0[1])) == 0): return True else: return False if __name__ == "__main__": if len(sys.argv) < 3: sys.exit( "usage: root_tree_against_master.py <treetoroot> <mastertree>") with open(sys.argv[1], "r") as target_file: target = newick3.parse(target_file) if len(target.leaves()) < 3: sys.exit( "error: cannot perform rooting on a tree with fewer than three tips" ) with open(sys.argv[2], "r") as master_file: master = newick3.parse(master_file) if len(master.leaves()) < 3: sys.exit( "error: cannot perform rooting against a master tree with fewer than three tips" ) labels_missing_from_master = get_labels(target.leaves()).difference( get_labels(master.leaves())) if len(labels_missing_from_master) > 0:
#!/usr/bin/env python if __name__ == '__main__': import newick3, phylo3, numpy, sys if len(sys.argv) < 3: print "usage: remove_badtips.py <treefile> <maxdevfactor> [keep=\"<tax1, tax2, ...>\"]" sys.exit() treefname = sys.argv[1] treefile = open(treefname, "r") tree = newick3.parse(treefile.readline()) maxdevfactor = int(sys.argv[2]) if len(sys.argv) > 3: keepnames_str = sys.argv[3].split("keep=",1)[1] keepnames = [n.strip() for n in keepnames_str.split(",")] else: keepnames = [] lengths = [t.length for t in tree.leaves()] avg = numpy.mean(lengths) for tip in tree.leaves(): if tip.parent == tree: continue if tip.length > avg * maxdevfactor:
return cleaned_alignment+".tre" if __name__ == "__main__": if len(sys.argv) != 3: print "python cut_long_branches_iter.py inDIR outDIR >log" sys.exit(0) DIR = sys.argv[1]+"/" for i in os.listdir(DIR): #go through fasta files in the input directory if i[-9:] != ".fasttree": continue fasta_file = DIR+i.replace(".aln-cln.fasttree","") tree_file = DIR+i with open(tree_file,"r") as infile: first_line = infile.readline() #there's only 1 tree in each file if first_line.strip() == "": continue #empty file after trimming intree = newick3.parse(first_line.replace ("-","_")) if count_ingroups(intree) < MIN_INGROUP_TAXA: continue #skip trees with few ingroups ccID = i.split(".")[0] #looks like cc9 trees = [intree] #if intree has no long branches at all jus use the original fasta and alignment longest_branch_length = 0.0 for node in intree.iternodes(): if node != intree: longest_branch_length = max(longest_branch_length,node.length) if longest_branch_length < cutoff2: os.system("cp "+fasta_file+" "+fasta_file.replace(".fa",".to_prank.fa")) os.system("cp "+fasta_file+".aln-cln "+fasta_file.replace(".fa",".to_prank.fa.aln-cln")) continue
required=True, help='The list of names to be added to the tree.') # allow a min branch length to be specified. it must be parseable as a float parser.add_argument('-b', '--min-branch-length', type=float, nargs=1, \ required=False, help='The minimum branch length to be used.') # record a boolean value of True if this argument is set parser.add_argument('-s', '--include-stem', action='store_true', \ required=False, help='Pass this argument to allow newly added species to be ' 'attached to the root of the tree.') args = parser.parse_args() # attempt to parse the input tree try: tree = newick3.parse(args.input_tree[0]) except Exception as e: print("There was a problem parsing the input tree: " + e.message) exit(1) # extract the names from the names file names = [n.strip() for n in args.names[0]] # use the user-specified min branch length if specified min_branch_length = args.min_branch_length[0] if args.min_branch_length is not None \ else MIN_BRANCH_LENGTH # assign the function that will be used to gather the nodes--the iternodes function # will include the root, but the descendants function will not get_nodes = phylo3.Node.iternodes if args.include_stem else phylo3.Node.descendants
if __name__ == '__main__': import newick3, phylo3, sys if len(sys.argv) < 2: print "usage: excise_knuckles.py <treefile>" sys.exit(0) treefname = sys.argv[1] treefile = open(treefname, "r") logfile = open("excise_knuckles.log", "w") for line in treefile: tree = newick3.parse(line) while len(tree.children) < 2: # prune knuckles at the root of the tree if necessary only_child = tree.children[0] only_child.parent = None only_child.isroot = True tree = only_child # cannot edit tree while traversing, so just record knuckles as we go knuckles = [] # first find the knuckles for parent in tree.iternodes(phylo3.PREORDER): if parent.istip:
return [get_name(i) for i in labels] if __name__ == "__main__": if len(sys.argv) != 2: print "usage: python ortholog_occupancy_stats.py ortho_treDIR" sys.exit(0) DIR = sys.argv[1]+"/" outfile = open("ortho_stats","w") DICT = {} #key is taxon name, value is how many orthologs it is in total_ortho = 0 for i in os.listdir(DIR): if i[-len(file_ending):] == file_ending and "ortho" in i: print i total_ortho += 1 with open(DIR+i,"r") as infile: intree = newick3.parse(infile.readline()) names = get_front_names(intree) for taxon in names: if taxon not in DICT: DICT[taxon] = 0 DICT[taxon] += 1 outfile.write(str(len(names))+"\n") outfile.close() print "number of taxa in each ortholog written to ortho_stats" with open("taxon_stats","w") as outfile: outfile.write("taxonID\tnum_ortho\t%ortho_out_of_total_"+str(total_ortho)+"\n") for taxon in DICT: outfile.write(taxon+"\t"+str(DICT[taxon])+"\t"+str(float(DICT[taxon])/total_ortho)+"\n") print "number of ortholog for each taxon written to taxon_stats"
elif argname == "include": cladenames = [n.strip() for n in argval.split(",")] elif argname == "includefile": includenamesfile = open(argval, "r") cladenames = [n.strip() for n in includenamesfile.readlines()] includenamesfile.close() # elif argname == "exclude": # exclude_names = [n.strip() for n in argval.split(",")] assert (len(cladenames) > 0) assert (target_rank != "") test_tree = newick3.parse(open(treefname, "r")) print "will assess monophyly for taxa of rank '" + target_rank + "'" con = sqlite3.connect(dbname) cur = con.cursor() included_taxa = {} for name in cladenames: cur.execute( "SELECT name, left_value, right_value FROM taxonomy WHERE name_class == 'scientific name' AND name LIKE ?", (name, )) for curname, leftval, rightval in cur.fetchall(): print "including " + name included_taxa[name] = (leftval, rightval)
def RT(homoDIR, tree_file_eneding, outDIR, min_ingroup_taxa, taxon_code_file_file):
    """Extract ortholog trees from every homolog tree found in ``homoDIR``.

    Parameters:
        homoDIR: directory holding the homolog tree files.
        tree_file_eneding: only files whose name ends with this are processed.
        outDIR: directory the output tree files are written to.
        min_ingroup_taxa: minimum number of distinct ingroup taxa (converted
            with ``int``) a tree / ortholog needs in order to be written.
        taxon_code_file_file: tab-separated file, one taxon per line, first
            column ``IN`` or ``OUT``, second column the taxon ID. Lines
            shorter than three characters are ignored.

    For each tree: if at least one outgroup tip is present, the rooted ingroup
    clades are written as ``<cluster>.inclade<N>`` and the orthologs found in
    each of them as ``<cluster>.inclade<N>.ortho<M>.tre``. If there is no
    outgroup and no taxon is repeated, the tree itself is written as
    ``<cluster>.unrooted-ortho.tre``. Otherwise the tree is skipped.

    The process exits (via ``sys.exit``) when the taxon code file is
    malformed, when a taxon is listed as both ingroup and outgroup, or when a
    tree contains a taxon that is listed as neither.
    """
    if homoDIR[-1] != "/":
        homoDIR += "/"
    if outDIR[-1] != "/":
        outDIR += "/"
    min_ingroup_taxa = int(min_ingroup_taxa)

    INGROUPS = []
    OUTGROUPS = []
    with open(taxon_code_file_file, "r") as infile:
        for line in infile:
            if len(line) < 3:
                continue
            spls = line.strip().split("\t")
            # a line without a second column is a format error, not a crash
            if len(spls) > 1 and spls[0] == "IN":
                INGROUPS.append(spls[1])
            elif len(spls) > 1 and spls[0] == "OUT":
                OUTGROUPS.append(spls[1])
            else:
                print("Check taxon_code_file file format")
                sys.exit()

    # sets give O(1) membership tests in the per-tip loop below; the lists are
    # still what gets handed to tree_utils, exactly as before
    ingroup_set = set(INGROUPS)
    outgroup_set = set(OUTGROUPS)
    shared = ingroup_set & outgroup_set
    if len(shared) > 0:
        print("Taxon ID %s in both ingroups and outgroups" % (shared,))
        sys.exit(0)
    print("%d ingroup taxa and %d outgroup taxa read"
          % (len(INGROUPS), len(OUTGROUPS)))
    print("Ingroups: %s" % (INGROUPS,))
    print("Outgroups: %s" % (OUTGROUPS,))

    for treefile in os.listdir(homoDIR):
        if not treefile.endswith(tree_file_eneding):
            continue
        with open(homoDIR + treefile, "r") as infile:
            # only the first line of the file is parsed
            intree = newick3.parse(infile.readline())
        curroot = intree
        all_names = tree_utils.get_front_names(curroot)
        num_taxa = len(set(all_names))
        print(treefile)

        # check taxonIDs: every tip must be a known ingroup or outgroup
        ingroups_seen = set()
        has_outgroup = False
        for name in all_names:
            if name in ingroup_set:
                ingroups_seen.add(name)
            elif name in outgroup_set:
                has_outgroup = True
            else:
                print("%s not in ingroups or outgroups" % (name,))
                sys.exit()
        if len(ingroups_seen) < min_ingroup_taxa:
            print("not enough ingroup taxa in tree")
            continue

        outID = outDIR + tree_utils.get_clusterID(treefile)
        if has_outgroup:
            # at least one outgroup present, root and cut inclades
            inclades = tree_utils.extract_rooted_ingroup_clades(
                curroot, INGROUPS, OUTGROUPS, min_ingroup_taxa)
            for inclade_count, inclade in enumerate(inclades, 1):
                inclade_name = outID + ".inclade" + str(inclade_count)
                with open(inclade_name, "w") as outfile:
                    outfile.write(newick3.tostring(inclade) + ";\n")
                orthologs = tree_utils.get_ortho_from_rooted_inclade(inclade)
                ortho_count = 0
                for ortho in orthologs:
                    if len(tree_utils.get_front_labels(ortho)) < min_ingroup_taxa:
                        continue
                    ortho_count += 1
                    ortho_name = inclade_name + ".ortho" + str(ortho_count) + ".tre"
                    with open(ortho_name, "w") as outfile:
                        outfile.write(newick3.tostring(ortho) + ";\n")
        elif len(all_names) == num_taxa:
            # only output ortho tree when there is no taxon repeats
            with open(outID + ".unrooted-ortho.tre", "w") as outfile:
                outfile.write(newick3.tostring(curroot) + ";\n")
        else:
            # do not attempt to infer direction of gene duplication without
            # outgroup info
            print("duplicated taxa in unrooted tree")
parser.add_argument('-q', '--partitions', type=open, required=True,
    help='the location of the raxml partitions file corresponding to the alignment to be subsampled.')

parser.add_argument('-x', '--random-seed', type=int, required=False,
    help='an integer seed for the random number generator function')

parser.add_argument('-f', '--reduction-factor', type=float, required=False,
    help='a decimal value specifying how sparse to make the subsampling. the number of taxa that will be subsampled will be reduced proportionally to this value, thus a value of f 0.5 leads to a reduction by (approximately) half')

parser.add_argument('-n', '--output-label', required=False, default='',
    help='a label to be attached to output files')

args = parser.parse_args()

# build the inputs for the subsampler from the parsed command line
a = Alignment(args.alignment, args.partitions)
t = newick3.parse(args.tree)

# argparse always creates the attribute (set to None when -f is omitted), so
# testing `'reduction_factor' in args` can never select the default; compare
# against None instead so that an omitted -f really means a factor of 1.
f = args.reduction_factor if args.reduction_factor is not None else 1

# NOTE(review): args.random_seed is parsed above but not applied anywhere in
# the statements visible here -- confirm it is consumed elsewhere.
s = PhylogeneticSubsampler(alignment=a, tree=t, rates=args.rates, reduction_factor=f)
s.subsample()

s.write_subsampled_output(args.output_label)
s.report_sampled_partitions()

print('files have been written to:\n' +
      s.output_label + '.sampling_matrix.txt\n' +
      s.output_label + '.phy\n' +
      s.output_label + '.partitions.txt\n'
      'sampling proportion is ' + str(s.get_sampling_proportion()))
elif argname == "include": cladenames = [n.strip() for n in argval.split(",")] elif argname == "includefile": includenamesfile = open(argval,"r") cladenames = [n.strip() for n in includenamesfile.readlines()] includenamesfile.close() # elif argname == "exclude": # exclude_names = [n.strip() for n in argval.split(",")] assert(len(cladenames) > 0) assert(target_rank != "") test_tree = newick3.parse(open(treefname,"r")) print "will assess monophyly for taxa of rank '" + target_rank + "'" con = sqlite3.connect(dbname) cur = con.cursor() included_taxa = {} for name in cladenames: cur.execute("SELECT name, left_value, right_value FROM taxonomy WHERE name_class == 'scientific name' AND name LIKE ?",(name,)) for curname, leftval, rightval in cur.fetchall(): print "including " + name included_taxa[name] = (leftval, rightval) out_prefix = "_".join(included_taxa) if len(out_prefix) > 40:
#!/usr/bin/env python

if __name__ == '__main__':
    # Print the first whitespace-separated field of every multi-field line of
    # <datafile> that is not the label of a tip in the tree read from the
    # first line of <treefile>.
    import newick3
    import os
    import sys

    if len(sys.argv) < 3:
        sys.exit("usage: find_names_missing_from_tree.py <datafile> <treefile>")

    data_file_name, tree_file_name = sys.argv[1:3]

    with open(tree_file_name, "r") as treefile:
        tree = newick3.parse(treefile.readline())

    # labels of all tips present in the tree
    names = set(leaf.label for leaf in tree.leaves())

    with open(data_file_name, "r") as datafile:
        for line in datafile:
            fields = line.split()
            # lines with fewer than two fields are ignored
            if len(fields) <= 1 or fields[0] in names:
                continue
            print(fields[0])
if len(parts) > 1: aln[parts[0]] = parts[1] args.alignment[0].close() # get the tree to subsample tree = None treefile = args.tree[0] print("reading tree from " + treefile.name) line = None while line != "": line = treefile.readline() if len(line.strip()) < 1: continue # print(line) # try: tree = newick3.parse(StringIO(line)) # print('found tree') # print(tree) break # except AttributeError: # pass if tree == None: sys.exit("Could not find a tree in the treefile: " + treefile.name) args.tree[0].close() leaves = tree.leaves() calc_stop_k = args.stop_node_number[ 0] if args.stop_node_number != None else len(tree.leaves()) + 100 if calc_stop_k < calc_start_k: sys.exit( "The start node number is higher than the stop node number, designating no nodes for processing."
if len(parts) > 1: aln[parts[0]] = parts[1] args.alignment[0].close() # get the tree to subsample tree = None treefile = args.tree[0] print("reading tree from " + treefile.name) line = None while line != "": line = treefile.readline() if len(line.strip()) < 1: continue # print(line) # try: tree = newick3.parse(StringIO(line)) # print('found tree') # print(tree) break # except AttributeError: # pass if tree == None: sys.exit("Could not find a tree in the treefile: " + treefile.name) args.tree[0].close() leaves = tree.leaves() calc_stop_k = args.stop_node_number[0] if args.stop_node_number != None else len(tree.leaves())+100 if calc_stop_k < calc_start_k: sys.exit("The start node number is higher than the stop node number, designating no nodes for processing.") if args.verbose:
#!/usr/bin/env python
import newick3, os, phylo3, png, sys

size_scalar = 4
breakup = 8

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("usage: make_matrix_fig.py infile.csv infile.tre outfile.png")
        sys.exit(0)

    # the csv handle stays open; it is not consumed by the statements here
    infile = open(sys.argv[1], "r")

    # read the tree from the first line of the tree file
    infiletree = open(sys.argv[2], "r")
    tree = newick3.parse(infiletree.readline())

    # map every tip label to its position in the tree's leaf order
    tip_nodes = list(tree.leaves())
    order_map = dict((tip.label, position) for position, tip in enumerate(tip_nodes))
    count = len(tip_nodes)
    print(str(len(order_map)) + " tips in tree")
    infiletree.close()

    first = True
    # one (initially empty) string per tip
    sampling_array = [''] * len(order_map)

    # set colors
    palette = [(0xff, 0xff, 0xff), (0x00, 0x00, 0x00)]
required=True, help='The list of names to be added to the tree.') # allow a min branch length to be specified. it must be parseable as a float parser.add_argument('-b', '--min-branch-length', type=float, nargs=1, \ required=False, help='The minimum branch length to be used.') # record a boolean value of True if this argument is set parser.add_argument('-s', '--include-stem', action='store_true', \ required=False, help='Pass this argument to allow newly added species to be ' 'attached to the root of the tree.') args = parser.parse_args() # attempt to parse the input tree try: tree = newick3.parse(args.input_tree[0]) except Exception as e: print("There was a problem parsing the input tree: " + e.message) exit(1) # extract the names from the names file names = [n.strip() for n in args.names[0]] # use the user-specified min branch length if specified min_branch_length = args.min_branch_length[0] if args.min_branch_length is not None \ else MIN_BRANCH_LENGTH # assign the function that will be used to gather the nodes--the iternodes function # will include the root, but the descendants function will not get_nodes = phylo3.Node.iternodes if args.include_stem else phylo3.Node.descendants
def run_single_tree(base_dir, simulation_function, tree_function, tree_number, branch_lengths_function=None, subsampling_function=None):
    """Simulate one tree/alignment and score its branches by jackknife and bootstrap.

    Parameters:
        base_dir: prefix of the model directory; the module-level
            ``n_random_trees`` / ``n_tips_per_tree`` (and, when subsampling,
            the subsampling function's ``__doc__``) are appended to form the
            real directory name, which must already exist.
        simulation_function: called as
            ``simulation_function(tree_label, tree_function, branch_lengths_function)``;
            must return a sequence whose items are, in order, the tree file
            name, the alignment file name and, optionally, the partitions
            file name and the model rates.
        tree_function, branch_lengths_function: passed through to
            ``simulation_function`` untouched.
        tree_number: identifier of this replicate; names the working
            directory (``tree<N>``) and prefixes each written score row.
        subsampling_function: optional; called as
            ``subsampling_function(alignment, partitions, model_rates, tree)``
            and must return the name of the alignment file to use from then on.

    Side effects: changes the current working directory, recreates
    ``temp_dir``, runs external programs via ``subprocess`` and appends to
    ``ALL.scores.csv`` and ``ALL.times.csv`` under ``<base_dir>/all_scores``.
    Exits via ``sys.exit`` if the model directory does not exist.

    Relies on module-level settings: ``temp_dir``, ``n_reps_taxon_jackknife``,
    ``n_threads``, ``raxml_executable``, ``raxml_pthreads_executable``,
    ``n_reps_bootstrap``, ``n_threads_bootstrap``,
    ``max_bs_branches_to_process`` and ``score_file_column_labels``.
    """
    base_dir += "_" + str(n_random_trees) + "_trees_of_" + str(n_tips_per_tree) + "_tips"
    if subsampling_function is not None:
        base_dir += "_SUBSAMPLED_" + subsampling_function.__doc__

    if not os.path.exists(base_dir):
        sys.exit("need to initialize the model dir first")

    # the output paths below are used after several os.chdir() calls, so they
    # must not stay relative to the directory we were started in
    base_dir = os.path.abspath(base_dir)

    final_scores_dir = base_dir + "/all_scores"
    all_scores_file_name = final_scores_dir + "/ALL.scores.csv"
    all_times_file_name = final_scores_dir + "/ALL.times.csv"

    # start from a fresh working directory for this tree
    os.chdir(base_dir)
    tree_label = "tree" + str(tree_number)
    if os.path.exists(tree_label):
        shutil.rmtree(tree_label)
    working_tree_dir = os.getcwd() + "/" + tree_label
    os.mkdir(working_tree_dir)
    os.chdir(working_tree_dir)

    # run the tree and alignment simulation, get the results
    simulation_results = simulation_function(tree_label, tree_function, branch_lengths_function)
    tree_file_name = working_tree_dir + "/" + simulation_results[0]
    alignment_file_name = working_tree_dir + "/" + simulation_results[1]
    partitions_file_name = working_tree_dir + "/" + simulation_results[2] if len(simulation_results) > 2 else None
    model_rates = simulation_results[3] if len(simulation_results) > 3 else None

    # subsample the alignment if necessary
    if subsampling_function is not None:
        alignment_file_name = subsampling_function(alignment_file_name, partitions_file_name, model_rates, tree_file_name)

    # calculate support for the original tree based on the data, support values will be stored in node_scores.csv
    subsample_args = ["python3", os.path.expanduser("~/scripts/subsample_edge_quartets.py"),
        "-t", tree_file_name,
        "-n", alignment_file_name,
        "-#", n_reps_taxon_jackknife,
        "-T", n_threads,
        "-e", temp_dir,
        "-o", working_tree_dir,
        "-X", raxml_executable]
    if partitions_file_name is not None:
        subsample_args += ["-q", partitions_file_name]

    start = time.time()
    subprocess.call(subsample_args)
    jackknife_time = time.time() - start

    # empty the temp directory before it is reused further down
    print("using: " + temp_dir + " for temp files")
    shutil.rmtree(temp_dir)
    os.mkdir(temp_dir)

    print("\ntime for taxon jackknife: %.2f seconds" % jackknife_time)
    with open(all_times_file_name, "a") as timefile:
        timefile.write("%s,%.2f," % (tree_number, jackknife_time))

    # get the node scores from the taxon jackknife
    node_scores = {}
    print(os.path.abspath("."))
    with open("node_scores.csv", "r") as node_scores_file:
        next(node_scores_file, None)  # skip the header line
        for line in node_scores_file:
            parts = [field.strip() for field in line.split(",")]
            if len(parts) < 3:
                continue
            node_scores[parts[0]] = {"j_freq": parts[1], "j_ica": parts[2], "in_true_tree": "TRUE"}

    # make a painted tree for reviewing the taxon jackknife results
    paint_args = ["paint_branches.py",
        "-t", "RESULT.labeled.tre",
        "-c", os.path.expanduser("~/scripts/figtree_color_palettes/blue_gray_red_1_to_neg_1.csv"),
        "-l", "ica",
        "-d", "node_scores.csv",
        "-n", tree_label]
    subprocess.call(paint_args)

    # calculate bootstrap values for the original tree based on the data
    raxml_bootstrap_args = [raxml_pthreads_executable,
        "-n", tree_label,
        "-s", alignment_file_name,
        "-#", n_reps_bootstrap,
        "-x", "123",
        "-p", "123",
        "-m", "GTRCAT",
        "-T", n_threads_bootstrap]
    if partitions_file_name is not None:
        # this used to be appended to subsample_args, which has already been
        # run, so the bootstrap never received the partitions file
        raxml_bootstrap_args += ["-q", partitions_file_name]

    start = time.time()
    subprocess.call(raxml_bootstrap_args)
    bootstrap_time = time.time() - start

    print("\ntime for bootstrap: %.2f seconds" % bootstrap_time)
    with open(all_times_file_name, "a") as timefile:
        timefile.write("%.2f\n" % bootstrap_time)

    # call pxbp to generate bipart frequencies from bootstrap replicates
    pxbp_args = ["pxbp", "-t", "RAxML_bootstrap." + tree_label]
    pxbp_process = subprocess.Popen(pxbp_args, stdout=subprocess.PIPE)
    results = pxbp_process.communicate()[0]

    # parse pxbp results: tab-separated lines keyed by the sorted tip labels
    # found in the first column
    bootstrap_biparts = {}
    for line in results.decode().split('\n'):
        parts = [field.strip() for field in line.split("\t")]
        if len(parts) < 2:
            continue
        ingroup_labels = tuple(sorted(parts[0].split()))
        if parts[1].strip() == "1":
            freq = "1"
            ica = "1"
        else:
            freq = parts[2]
            ica = parts[4]
        bootstrap_biparts[ingroup_labels] = (freq, ica)

    # TODO: call mrbayes to generate a bayesian posterior distribution

    # load RESULT.labeled.tre
    true_tree_biparts = set()
    with open("RESULT.labeled.tre", "r") as labeled_tree_file:
        labeled_tree = newick3.parse(labeled_tree_file.readline())

    # for each node in the labeled tree
    leaves = labeled_tree.leaves()
    for n in labeled_tree.iternodes():

        # skip the root and the tip nodes
        if n.parent == None or n.istip:
            continue

        # special case: if root position is adjacent to a tip
        if n.label not in node_scores and n.parent.parent == None:
            # in this case, the bipart is always defined--if you remove the root then the
            # bipart is the tip vs. the rest of the tree, so the taxon jackknife won't score
            # it (and its label won't show up in the node scores).
            continue

        bipart = tuple(sorted([t.label for t in n.leaves()]))
        true_tree_biparts.add(bipart)

        # record the branch length
        node_scores[n.label]["length"] = n.length

        # record the depth: summed branch lengths following the first child
        # at every step until a tip is reached
        c = n
        d = 0
        while True:
            c = c.children[0]
            d += c.length
            if c.istip:
                break
        node_scores[n.label]["depth"] = d

        # see if this node's bipart is represented in the bootstraps, if so get the frequency and the ica score
        if bipart in bootstrap_biparts:
            node_scores[n.label]["b_freq"] = bootstrap_biparts[bipart][0]
            node_scores[n.label]["b_ica"] = bootstrap_biparts[bipart][1]
        else:
            node_scores[n.label]["b_freq"] = "0"
            node_scores[n.label]["b_ica"] = "NA"

    # process all bootstrap trees, recording info for all branches not in the true tree, so we can calc ica for them
    bootstrap_trees = []
    with open("RAxML_bootstrap." + tree_label, "r") as bootstrap_file:
        for line in bootstrap_file:
            bootstrap_trees.append(newick3.parse(line))

    # pull out all the taxon quartets defined by branches in all bootstrap trees
    bootstrap_quartets = set()
    leaves = labeled_tree.leaves()
    all_taxon_labels = set([l.label for l in leaves])
    for bootstrap_tree in bootstrap_trees:

        # Whether the branch spanning the root of THIS bootstrap tree has been
        # handled yet. It was previously tracked in a variable that was read
        # before ever being assigned; a boolean is used because it does not
        # depend on the node carrying a label.
        root_branch_done = False

        for node in bootstrap_tree.iternodes():

            # skip tips and the root node itself
            if node.istip or node.parent == None:
                continue

            # determine if this branch is represented in the true tree (either
            # side of the bipartition may be the one that was recorded)
            ingroup_labels = tuple(sorted([t.label for t in node.leaves()]))
            if ingroup_labels in true_tree_biparts:
                continue
            remaining_labels = tuple(sorted(all_taxon_labels - set(ingroup_labels)))
            if remaining_labels in true_tree_biparts:
                continue

            # this branch isn't in the true tree: build the quartet we can use
            # to evaluate ica for it as well

            ########### modified from subsample_edge_quartets

            # get leaf sets for the four connected subtrees

            # two daughter subtrees (node is never a tip here, see guard above)
            r1 = set([l.label for l in node.children[0].leaves()])
            r2 = set([l.label for l in node.children[1].leaves()])

            # sibling/parent subtrees; left as None when this branch has to be
            # skipped, so that values from an earlier node are never reused
            l1 = None
            l2 = None
            skip_tip_child_of_root = False  # used when one of the children of the root node is a tip
            tip_child_label = None
            for sib in node.parent.children:
                if sib != node:

                    # if one of the subtrees is the root, skip over it
                    if len(sib.leaves()) + len(node.leaves()) == len(leaves):

                        # if we already processed this bipart (on other side of the root), don't do it again
                        if root_branch_done:
                            l1 = l2 = None
                            break

                        # get the subtrees opposite the root
                        if len(sib.children) == 2:
                            l1 = set([sib.children[0].label, ] if sib.children[0].istip else [l.label for l in sib.children[0].leaves()])
                            l2 = set([sib.children[1].label, ] if sib.children[1].istip else [l.label for l in sib.children[1].leaves()])
                        elif len(sib.children) == 0:
                            skip_tip_child_of_root = True
                            tip_child_label = sib.label
                        else:
                            # the message used to format an unassigned name
                            print("Node %s does not have exactly 2 children. It will be skipped." % node.label)
                            l1 = l2 = None
                            break

                        # remember that we've already done the root, so we can skip it when we hit the other side
                        root_branch_done = True

                    # otherwise not at root, all connected subtrees have children
                    else:

                        # sibling subtree
                        l1 = set([l.label for l in sib.leaves()])

                        # the rest of the tree
                        l2 = all_taxon_labels - r1 - r2 - l1

            if skip_tip_child_of_root:
                print("not evaluating tip child '" + tip_child_label + "' of the root (ica is 1.0, as for all tips).")
                continue

            # other side of an already processed root branch, a sibling
            # without exactly two children, or no sibling at all
            if l1 is None or l2 is None:
                continue

            ######## end modified from subsample_edge_quartets.py

            q = (frozenset((tuple(sorted(l1)), tuple(sorted(l2)))), frozenset((tuple(sorted(r1)), tuple(sorted(r2)))))

            # make sure this quartet isn't already in the set in reverse before adding it
            if (q[1], q[0]) not in bootstrap_quartets:
                bootstrap_quartets.add(q)

    print("Found %s bootstrap quartets not in the original tree" % len(bootstrap_quartets))

    # set the sampling interval so we only process max_bs_branches_to_process, evenly distributed across the input set
    sample_freq = len(bootstrap_quartets) / float(max_bs_branches_to_process)
    max_br = min(len(bootstrap_quartets), max_bs_branches_to_process)
    cur_br = 1
    os.chdir(temp_dir)
    for quartet_index, quartet in enumerate(bootstrap_quartets):

        # skip branches that don't fall within our sample based on max_bs_branches_to_process
        if not quartet_index % sample_freq < 1:
            continue

        print("on bootstrap quartet %s out of %s" % (cur_br, max_br))
        bad_branch_label = str(quartet_index) + "b"
        os.mkdir(bad_branch_label)
        cur_br += 1

        # flatten the two pairs of label sets into four clades
        tip_label_sets = []
        for pair in quartet:
            for label_set in pair:
                tip_label_sets.append(label_set)

        # write a four-clade topology whose root branch is the one under test
        clades = []
        for ls in tip_label_sets:
            text = ",".join(ls)
            if len(ls) > 1:
                text = "(" + text + ")"
            clades.append(text)
        newick = "((" + clades[0] + "," + clades[1] + "),(" + clades[2] + "," + clades[3] + "));"
        with open(bad_branch_label + ".tre", "w") as topo_file:
            topo_file.write(newick)

        # calculate support for the bootstrap-inferred FALSE branch, support values will be stored in node_scores.csv
        subsample_args = ["python3", os.path.expanduser("~/scripts/subsample_edge_quartets.py"),
            "-t", bad_branch_label + ".tre",
            "-n", alignment_file_name,
            "-#", n_reps_taxon_jackknife,
            "-T", n_threads,
            "-e", temp_dir + "/" + bad_branch_label,
            "-p", "1",  # specify a stop node number that we only want to process the first (i.e. the root) node
            "-X", raxml_executable]
        subprocess.call(subsample_args)

        # extract the ica score for this node. we only care about the first bipart, so we read the second line in the file
        with open("node_scores.csv", "r") as node_scores_file:
            parts = [t.strip() for t in node_scores_file.readlines()[1].split(",")]
        node_scores[bad_branch_label] = {"j_freq": parts[1], "j_ica": parts[2]}

        # the bootstrap table may hold either side of the bipartition
        try:
            ingroup_labels = tuple(sorted(tip_label_sets[0] + tip_label_sets[1]))
            node_scores[bad_branch_label]["b_freq"] = bootstrap_biparts[ingroup_labels][0]
        except KeyError:
            ingroup_labels = tuple(sorted(tip_label_sets[2] + tip_label_sets[3]))
            node_scores[bad_branch_label]["b_freq"] = bootstrap_biparts[ingroup_labels][0]

        node_scores[bad_branch_label]["b_ica"] = bootstrap_biparts[ingroup_labels][1]
        node_scores[bad_branch_label]["in_true_tree"] = "FALSE"
        node_scores[bad_branch_label]["length"] = "NA"
        node_scores[bad_branch_label]["depth"] = "NA"

        shutil.rmtree(temp_dir + "/" + bad_branch_label)

    os.chdir(working_tree_dir)

    # write scores to file and prepare for next iteration
    with open(all_scores_file_name, "a") as scores_file:
        for node_label, values in node_scores.items():
            scores_file.write(str(tree_number) + "," + node_label)
            for c in score_file_column_labels:
                scores_file.write("," + str(values[c]))
            scores_file.write("\n")