Пример #1
0
def refine(query_fasta, start_fasta, deep_paralog_cutoff, num_cores):
    """Infer a tree from ``start_fasta``, clean it and split it into subtrees.

    The tree is built with ``fasta_to_tree.fasta_to_tree``, trimmed with
    ``trim_tips.trim`` (relative cutoff ``deep_paralog_cutoff``, absolute
    cutoff twice that), masked with ``mask_tips_by_taxonID_transcripts.mask``
    and cut with ``cut_long_internal_branches`` using the same cutoff.
    Every subtree with more than four distinct tip labels that shares at
    least one label with the sequence names in ``query_fasta`` is written as
    ``<outdir><base>_<n>.subtree`` together with ``<outdir><base>_<n>.fa``,
    where ``outdir`` and the file name come from
    ``get_filename_from_path(start_fasta)``.

    Args:
        query_fasta: fasta file whose sequence names select subtrees to keep.
        start_fasta: fasta file the starting tree is built from.
        deep_paralog_cutoff: branch length cutoff; converted with ``float``.
        num_cores: passed through to ``fasta_to_tree.fasta_to_tree``.

    Returns:
        List with the paths of the ``.fa`` files written. Empty when no tree
        was produced, masking returned ``None`` or no subtree qualified.
    """
    outdir, fasta = get_filename_from_path(start_fasta)
    deep_paralog_cutoff = float(deep_paralog_cutoff)
    # A set is built once so the per-subtree intersection below is cheap.
    query_ids = set(s.name for s in seq.read_fasta_file(query_fasta))
    new_fasta = []  # list of output refined fasta files
    print("%s %s" % (outdir, fasta))

    # make a tree from the start_fasta
    tree_path = fasta_to_tree.fasta_to_tree(outdir, fasta, num_cores, "aa")
    if tree_path is None:
        return new_fasta
    with open(tree_path, "r") as infile:
        intree = newick3.parse(infile.readline())
    root = trim_tips.trim(intree,
                          relative_cutoff=deep_paralog_cutoff,
                          absolute_cutoff=deep_paralog_cutoff * 2)

    # Prefer the pasta alignment when it exists, otherwise use the mafft one.
    clnfile = outdir + fasta + ".pasta.aln-cln"
    if not os.path.exists(clnfile):
        clnfile = outdir + fasta + ".mafft.aln-cln"
    root = mask_tips_by_taxonID_transcripts.mask(root,
                                                 clnfile=clnfile,
                                                 para="y",
                                                 ignore=GENOMES)
    if root is None:
        return new_fasta

    with open(tree_path + ".tt.mm", "w") as outfile:
        # Terminate the Newick string with ";" like the .subtree files below.
        outfile.write(newick3.tostring(root) + ";\n")
    subtrees = cut_long_internal_branches.cut_long_internal_branches(
        root, cutoff=deep_paralog_cutoff)

    base_name = fasta.split(".")[0]
    # key is seqid, value is seq
    seq_by_id = {s.name: s.seq for s in seq.read_fasta_file(start_fasta)}
    count = 0
    for subtree in subtrees:
        if subtree is None:
            continue
        labels = tree_utils.get_front_labels(subtree)
        label_set = set(labels)
        if len(label_set) <= 4 or not (label_set & query_ids):
            continue
        count += 1
        prefix = outdir + base_name + "_" + str(count)
        with open(prefix + ".subtree", "w") as outfile:
            outfile.write(newick3.tostring(subtree) + ";\n")
        with open(prefix + ".fa", "w") as outfile:
            for seqid in labels:
                try:
                    sequence = seq_by_id[seqid]
                except KeyError:
                    # Best effort: report the missing id and keep going.
                    print("%s not found in fasta file" % (seqid,))
                    continue
                outfile.write(">" + seqid + "\n" + sequence + "\n")
        new_fasta.append(prefix + ".fa")

    return new_fasta
Пример #2
0
def taxon_name_subst(original,table=sys.path[0]+"/reverse_taxon_table"):
    """Write ``original + ".name"`` with sequence ids replaced by long ids.

    ``table`` is read as tab-separated lines; the first field (with ``|``
    replaced by ``_``) maps to the second field. Lines with fewer than two
    fields are skipped. The first line of ``original`` decides the format:
    if it starts with ``>`` the file is handled as fasta and every header is
    rewritten through ``get_long_id``; otherwise the first line is parsed as
    a Newick tree and every leaf label is rewritten.

    Args:
        original: path of the fasta or tree file to rename.
        table: path of the tab-separated id table.

    Raises:
        ValueError: if ``original`` is empty, so no format can be detected.
    """
    id_to_taxon = {}  # key is seq acronym, value is full taxon name
    with open(table, "r") as infile:
        for line in infile:
            spls = line.strip().split("\t")
            if len(spls) > 1:
                id_to_taxon[spls[0].replace("|", "_")] = spls[1]

    with open(original, "r") as infile:
        first_line = infile.readline()
    if not first_line:
        # Indexing the first character of an empty file used to raise an
        # opaque IndexError; fail with an explicit message instead.
        raise ValueError(original + " is empty")
    is_fasta = first_line.startswith(">")

    if is_fasta:  # for fasta files
        # Both handles are closed even if get_long_id raises part-way.
        with open(original, "r") as infile, \
                open(original + ".name", "w") as outfile:
            for line in infile:
                if line.startswith(">"):
                    long_id = get_long_id(line.strip()[1:], id_to_taxon)
                    outfile.write(">" + long_id + "\n")
                else:
                    outfile.write(line)
    else:  # tree file
        with open(original, "r") as infile:
            intree = newick3.parse(infile.readline())
        for leaf in intree.leaves():
            old_label = leaf.label
            leaf.label = get_long_id(old_label, id_to_taxon)
            # Show the old and the new label side by side.
            print("%s %s" % (old_label, leaf.label))
        with open(original + ".name", "w") as outfile:
            outfile.write(newick3.tostring(intree) + ";\n")
Пример #3
0
def main(treDIR,
         clnDIR,
         para,
         intree_file_ending=INTREE_FILE_ENDING,
         ignore=GENOMES):
    """Mask tips of every matching tree in ``treDIR`` and write ``<tree>.mm``.

    Each ``.aln-cln`` file in ``clnDIR`` is indexed by ``get_clusterID``.
    Every file in ``treDIR`` ending with ``intree_file_ending`` is parsed
    (first line only), passed to ``mask`` together with the alignment of the
    same cluster id, and the result is written next to the input tree.

    Args:
        treDIR: directory with the input trees.
        clnDIR: directory with the ``.aln-cln`` alignments.
        para: ``"y"`` or ``"n"``; forwarded to ``mask`` as a boolean.
        intree_file_ending: suffix selecting the trees to process.
        ignore: forwarded to ``mask`` as its ``ignore`` argument.

    Raises:
        AssertionError: if ``para`` is not ``"y"``/``"n"``, a cluster id
            occurs twice in ``clnDIR``, or no tree file matched.
        KeyError: if a tree has no alignment with the same cluster id.
    """
    if treDIR[-1] != "/": treDIR += "/"
    if clnDIR[-1] != "/": clnDIR += "/"
    assert para == "y" or para == "n", "mask paraphyletic tips? (y/n)"
    mask_para = para == "y"
    filecount = 0

    filematch = {}  # key is clusterID, value is the .aln-cln file
    for fname in os.listdir(clnDIR):
        if not fname.endswith(".aln-cln"):
            continue
        clusterID = get_clusterID(fname)
        assert clusterID not in filematch, \
         "The clusterID "+clusterID+" repeats in "+clnDIR
        filematch[clusterID] = fname

    for fname in os.listdir(treDIR):
        if not fname.endswith(intree_file_ending):
            continue
        with open(treDIR + fname, "r") as infile:
            intree = newick3.parse(infile.readline())
        print(fname)
        clusterID = get_clusterID(fname)
        filecount += 1
        # Forward the caller's ``ignore`` value; the module default GENOMES
        # is only used when the caller did not supply one.
        curroot = mask(intree,
                       clnDIR + filematch[clusterID],
                       para=mask_para,
                       ignore=ignore)
        with open(treDIR + fname + ".mm", "w") as outfile:
            outfile.write(newick3.tostring(curroot) + ";\n")
    assert filecount > 0, \
     "No file ends with "+intree_file_ending+" found in "+treDIR
def main(treDIR, clnDIR, para, intree_file_ending=INTREE_FILE_ENDING):
    """Mask monophyletic (and optionally paraphyletic) tips of each tree.

    For every file in ``treDIR`` ending with ``intree_file_ending`` the
    alignment with the same cluster id is read from ``clnDIR``, the number
    of characters left after removing ``-``, ``X``, ``x``, ``?`` and ``*``
    is recorded per sequence, and the masked tree is written to
    ``<tree>.mm`` in ``treDIR``. ``para`` must be ``"y"`` or ``"n"``;
    paraphyletic masking runs only for ``"y"``.
    """
    if treDIR[-1] != "/":
        treDIR = treDIR + "/"
    if clnDIR[-1] != "/":
        clnDIR = clnDIR + "/"
    assert para in ("y", "n"), "mask paraphyletic tips? (y/n)"
    mask_para = (para == "y")

    # Map each cluster id to its .aln-cln file name.
    filematch = {}
    for aln_name in os.listdir(clnDIR):
        if not aln_name.endswith(".aln-cln"):
            continue
        clusterID = get_clusterID(aln_name)
        assert clusterID not in filematch, \
         "The clusterID "+clusterID+" repeats in "+clnDIR
        filematch[clusterID] = aln_name

    filecount = 0
    ignored_chars = ['-', 'X', "x", "?", "*"]
    for tree_name in os.listdir(treDIR):
        if not tree_name.endswith(intree_file_ending):
            continue
        with open(treDIR + tree_name, "r") as infile:
            intree = newick3.parse(infile.readline())
        print(tree_name)
        clusterID = get_clusterID(tree_name)
        filecount += 1

        # seqid -> number of characters that are not gaps or ambiguous
        chrDICT = {}
        for record in read_fasta_file(clnDIR + filematch[clusterID]):
            for ch in ignored_chars:
                record.seq = record.seq.replace(ch, "")
            chrDICT[record.name] = len(record.seq)

        curroot = mask_monophyletic_tips(intree, chrDICT)
        if mask_para:
            curroot = mask_paraphyletic_tips(curroot, chrDICT)
        with open(treDIR + tree_name + ".mm", "w") as outfile:
            outfile.write(newick3.tostring(curroot) + ";\n")

    assert filecount > 0, \
     "No file ends with "+intree_file_ending+" found in "+treDIR
def main(inDIR, file_ending, branch_len_cutoff, min_taxa, outDIR):
    """Cut long internal branches and write the subtrees as .subtree files.

    Every file in ``inDIR`` ending with ``file_ending`` is parsed (first line
    only). Trees with fewer than ``min_taxa`` taxa are reported and skipped.
    The others are cut with ``cut_long_internal_branches``; each subtree
    with at least ``min_taxa`` taxa is written to
    ``<outDIR><name up to first dot>_<n>.subtree``.

    Args:
        inDIR: directory with the input trees.
        file_ending: suffix selecting the trees to process.
        branch_len_cutoff: branch length cutoff; converted with ``float``.
        min_taxa: minimum number of taxa; converted with ``int``.
        outDIR: directory the subtrees are written to.

    Raises:
        AssertionError: if no file in ``inDIR`` ends with ``file_ending``.
    """
    if inDIR[-1] != "/": inDIR += "/"
    if outDIR[-1] != "/": outDIR += "/"
    min_taxa = int(min_taxa)
    filecount = 0
    cutoff = float(branch_len_cutoff)
    print("cutting branches longer than %s" % (cutoff,))
    for fname in os.listdir(inDIR):
        if not fname.endswith(file_ending):
            continue
        print(fname)
        filecount += 1
        with open(inDIR + fname, "r") as infile:  # only 1 tree in each file
            intree = newick3.parse(infile.readline())

        # Tip count of the original .tre, used only in the progress message.
        # If the name has no ".tre" or that file cannot be read (no refine
        # this round), fall back to the tree that was just parsed.
        raw_tree_size = None
        tre_pos = fname.find(".tre")
        if tre_pos != -1:
            try:
                with open(inDIR + fname[:tre_pos] + ".tre", "r") as infile:
                    raw_tree_size = len(
                        get_front_labels(newick3.parse(infile.readline())))
            except Exception:
                # Best effort: this value is informational only.
                raw_tree_size = None
        if raw_tree_size is None:
            raw_tree_size = len(get_front_labels(intree))

        num_taxa = count_taxa(intree)
        if num_taxa < min_taxa:
            print("Tree has %s less than %s taxa" % (num_taxa, min_taxa))
            continue

        print(".tre: %s tips; %s: %s tips" %
              (raw_tree_size, file_ending, len(get_front_labels(intree))))
        subtrees = cut_long_internal_branches(intree, cutoff)
        if len(subtrees) == 0:
            print("No tree with at least %s taxa" % (min_taxa,))
            continue

        count = 0
        sizes = []
        for subtree in subtrees:
            if count_taxa(subtree) < min_taxa:
                continue
            if subtree.nchildren == 2:  # fix bifurcating roots from cutting
                _, subtree = remove_kink(subtree, subtree)
            count += 1
            out_name = outDIR + fname.split(".")[0] + "_" + str(count) + \
                ".subtree"
            with open(out_name, "w") as outfile:
                outfile.write(newick3.tostring(subtree) + ";\n")
            sizes.append(len(subtree.leaves()))
        outsizes = "".join(str(size) + ", " for size in sizes)
        print("%s tree(s) wirtten. Sizes: %s" % (count, outsizes))
    assert filecount > 0, "No file end with " + file_ending + " in " + inDIR
Пример #6
0
def main(DIR, tree_file_ending, relative_cut, absolute_cut):
    """Trim every matching tree in ``DIR`` and write the result as ``.tt``.

    Each file ending with ``tree_file_ending`` is parsed (first line only)
    and passed to ``trim`` with the two cutoffs. When ``trim`` returns a
    tree it is written to ``<file>.tt`` in the same directory; a ``None``
    result writes nothing.

    Args:
        DIR: directory with the input trees.
        tree_file_ending: suffix selecting the trees to process.
        relative_cut: relative cutoff; converted with ``float``.
        absolute_cut: absolute cutoff; converted with ``float``.

    Raises:
        ValueError: if a cutoff cannot be converted to ``float``.
        AssertionError: if no file ends with ``tree_file_ending``.
    """
    if DIR[-1] != "/": DIR += "/"
    # Convert once, up front, so bad cutoffs fail before any file is read.
    relative_cut = float(relative_cut)
    absolute_cut = float(absolute_cut)
    filecount = 0
    for fname in os.listdir(DIR):
        if not fname.endswith(tree_file_ending):
            continue
        print(fname)
        filecount += 1
        with open(DIR + fname, "r") as infile:
            intree = newick3.parse(infile.readline())
        outtree = trim(intree, relative_cut, absolute_cut)
        if outtree is not None:
            with open(DIR + fname + ".tt", "w") as outfile:
                outfile.write(newick3.tostring(outtree) + ";\n")
    assert filecount > 0, \
     "No file end with "+tree_file_ending+" found in "+DIR
Пример #7
0
def main(DIR,tree_file_ending,relative_cut,absolute_cut1,absolute_cut2):
    """Trim every matching tree in ``DIR`` and write the result as ``.tt``.

    Each file ending with ``tree_file_ending`` is parsed (first line only)
    and passed to ``trim`` with the relative cutoff and the two absolute
    cutoffs. When ``trim`` returns a tree it is written to ``<file>.tt`` in
    the same directory; a ``None`` result writes nothing.

    Args:
        DIR: directory with the input trees.
        tree_file_ending: suffix selecting the trees to process.
        relative_cut: relative cutoff; converted with ``float``.
        absolute_cut1: first absolute cutoff; converted with ``float``.
        absolute_cut2: second absolute cutoff; converted with ``float``.

    Raises:
        ValueError: if a cutoff cannot be converted to ``float``.
        AssertionError: if no file ends with ``tree_file_ending``.
    """
    if DIR[-1] != "/": DIR += "/"
    # Convert once, up front, so bad cutoffs fail before any file is read.
    relative_cut = float(relative_cut)
    absolute_cut1 = float(absolute_cut1)
    absolute_cut2 = float(absolute_cut2)
    filecount = 0
    for fname in os.listdir(DIR):
        if not fname.endswith(tree_file_ending):
            continue
        print(fname)
        filecount += 1
        with open(DIR + fname, "r") as infile:
            intree = newick3.parse(infile.readline())
        outtree = trim(intree, relative_cut, absolute_cut1, absolute_cut2)
        if outtree is not None:
            with open(DIR + fname + ".tt", "w") as outfile:
                outfile.write(newick3.tostring(outtree) + ";\n")
    assert filecount > 0, \
        "No file end with "+tree_file_ending+" found in "+DIR
Пример #8
0
import phylo3,newick3,os,sys
from tree_utils import *

if __name__ == "__main__":
	# sys.argv holds the script name plus the four arguments read below,
	# so the expected length is 5.
	if len(sys.argv) != 5:
		print("python prune_paralogs_from_rooted_trees.py homoTreeDIR tree_file_ending minimal_taxa outDIR")
		sys.exit(0)

	inDIR = sys.argv[1]+"/"
	tree_file_ending = sys.argv[2]
	MIN_TAXA = int(sys.argv[3])
	outDIR = sys.argv[4]+"/"
	for i in os.listdir(inDIR):
		if not i.endswith(tree_file_ending): continue
		print(i)
		outID = outDIR+get_clusterID(i)
		# only the first line of each tree file is parsed
		with open(inDIR+i,"r") as infile:
			intree = newick3.parse(infile.readline())
		orthologs = get_ortho_from_rooted_inclade(intree)
		# count numbers the output files; it advances only when one is written
		count = 1
		for ortho in orthologs:
			# keep orthologs with at least MIN_TAXA distinct front names
			if len(set(get_front_names(ortho))) >= MIN_TAXA:
				with open(outID+".ortho"+str(count)+".tre","w") as outfile:
					outfile.write(newick3.tostring(ortho)+";\n")
				count += 1
	# NOTE(review): from here on sys.argv is read with a different meaning
	# than in the statements above (argv[2] is used as clnDIR here and
	# argv[4] as a y/n switch). This looks like the body of a second script
	# whose own header is not visible -- confirm before relying on it.
	# ALIGNMENT_FILE_ENDING and SeqIO are not defined in the visible code.
	treDIR = sys.argv[1]+"/"
	clnDIR = sys.argv[2]+"/"
	outDIR = sys.argv[3]+"/"
	# the fourth argument switches paraphyly masking on ("y") or off ("n")
	if sys.argv[4] == "y": mask_para = True
	elif sys.argv[4] == "n": mask_para = False
	else:
		print "mask_para? y/n"
		sys.exit()
	filecount = 0
	for i in os.listdir(treDIR):
		# NOTE(review): both comparisons test the same last three characters,
		# so the second one is always true whenever the first one is.
		if i[-3:] == ".tt" and i[-3:] != ".mm":
			# only the first line of the tree file is parsed
			with open(treDIR+i,"r") as infile:
				intree = newick3.parse(infile.readline())
			print i
			# the cluster id is the file name up to the first underscore
			clusterID = i.split("_")[0]
			filecount += 1
			unamb_chrDICT = {} #key is seqid, value is number of unambiguous chrs
			with open(clnDIR+clusterID+ALIGNMENT_FILE_ENDING) as handle:
				for record in SeqIO.parse(handle,"fasta"):
					seqid,seq = str(record.id), str(record.seq)
					for ch in ['-','X',"x","?","*"]:
						seq = seq.replace(ch,"") #ignore gaps, xs and Xs
					unamb_chrDICT[seqid] = len(seq)
			curroot = monophyly_masking(intree,unamb_chrDICT)
			if mask_para:
				curroot = paraphyly_masking(curroot,unamb_chrDICT)
			# the masked tree goes to outDIR under the input name plus ".mm"
			with open(outDIR+i+".mm","w") as outfile:
				outfile.write(newick3.tostring(curroot)+";\n")
	# NOTE(review): the loop above only accepts names ending in ".tt"; the
	# message below also mentions 'best' and 'fasttree'.
	if filecount == 0:
		print "No file name with 'best' or 'tt' or 'fasttree' found in the treDIR"
        nc = n
        while (not nc.istip) and len(nc.children) == 0:
            print "pruning an empty tip"
            np = nc.parent
            nc.prune()
            if np:
                nc = np
            else:
                break
        
    #    if not n.istip:
    #        if len(n.children) == 0:
    #            nodes_to_remove.insert(0,n)
    #        else:
    #            empty = True
    #            for c in n.children:
    #                if c not in nodes_to_remove:
    #                    empty = False
    #                    break
    #        if empty:
    #            nodes_to_remove.insert(0,n)

    #print ""
    #for dud in nodes_to_remove:
    #    print "removing an empty tip!"
    #    dud.parent.remove_child(dud)

    outfile = open(treefname.rsplit(".tre",1)[0] + ".renamed.tre","w")
    outfile.write(newick3.tostring(tree) + ";")
    outfile.close()
Пример #11
0
                tip.prune()

            # compress knuckle if there is one
    #        if len(parent.children) == 1:
    #            child = parent.children[0]
    #            if child.label != None:
    #                rightlabel = child.label
    #            else:
    #                rightlabel = ", ".join([leaf.label for leaf in child.leaves()])
    #            print "compressing a knuckle in the tree: " + leftlabel + " | " + rightlabel
    #            pp = parent.parent
    #            pp.remove_child(parent)
    #            pp.add_child(child)

    #nodes_to_remove = []
    for n in tree.descendants():

        nc = n
        while (not nc.istip) and len(nc.children) == 0:
            print "pruning an empty tip"
            np = nc.parent
            nc.prune()
            if np:
                nc = np
            else:
                break

    outfile = open(treefname.rsplit(".tre",1)[0] + ".pruned.tre","w")
    outfile.write(newick3.tostring(tree) + ";")
    outfile.close()
Пример #12
0
					if seqid != "":
						seqDICT[seqid] = seq
					seqid,seq = line[1:].replace("-","_"),""
				else: seq += line.strip()
		seqDICT[seqid] = seq #add the last record
		infile.close()

		#cut by the first cutoff
		print i,"Cutting branches longer than",cutoff1
		outfile = open(DIR+ccID+".cut1.trees","w")
		newtrees = [] #store trees in need of cutting for the next round
		count = 0
		while True:
			for tree in trees:
				if find_longest_internal_branch_length(tree) < cutoff1:
					outfile.write(newick3.tostring(tree) +";\n")
					#can be the original tree or cut tree. no need to cut tip here
				else:
					subtrees = cut_long_branches(tree,cutoff1)
					for subtree in subtrees:
						if count_ingroups(subtree) < MIN_INGROUP_TAXA: continue
						count += 1
						newname = DIR+ccID+".cut1-"+str(count)
						with open(newname+".cutbranch","w") as outfile1: #record the cut branch
							outfile1.write(newick3.tostring(subtree)+";\n")
						with open(newname+".fa","w") as outfile2: #output fasta
							for label in get_leaf_labels(subtree.leaves()):
								outfile2.write(">"+label+"\n"+seqDICT[label]+"\n")
						newaln = mafft_align(newname+".fa")
						newcln = phyutility_clean_alignment(newaln)
						newtreefile = fasttree(newcln)
Пример #13
0
		#check taxonIDs
		ingroup_names = []
		outgroup_names = []
		for name in all_names:
			if name in INGROUPS:
				ingroup_names.append(name)
			elif name in OUTGROUPS:
				outgroup_names.append(name)
			else:
				print name,"not in ingroups or outgroups"
				sys.exit()
		if len(set(ingroup_names)) < MIN_INGROUP_TAXA:
			print "not enough ingroup taxa in tree"
			continue
		if len(outgroup_names) == 0:
			print "No outgroup in tree"
			continue

		inclades = extract_rooted_ingroup_clades(curroot,INGROUPS,OUTGROUPS,MIN_INGROUP_TAXA)
		inclade_count = 0
		for inclade in inclades:
			inclade_count += 1
			with open(outDIR+treefile+"."+str(inclade_count),"w") as outfile:
				outfile.write(newick3.tostring(inclade)+";\n")
			for node in inclade.iternodes():
				if node.istip:
					node.label = get_name(node.label) # output multi-labeled tree for phyparts
			with open(phypartsDIR+treefile+"."+str(inclade_count),"w") as outfile:
				outfile.write(newick3.tostring(inclade)+";\n")
		print inclade_count,"clades extracted"
def mask_paraphyletic_tips(curroot,ignore=[]):
	"""Prune tips that share a taxon name with a tip sister of their parent.

	The tree is scanned repeatedly while the previous scan pruned something,
	curroot is not None and the tree still has at least four leaves. The
	taxon name compared is the second "_"-separated field of
	get_name(label).

	curroot -- root node of the tree to mask
	ignore  -- NOTE(review): accepted but never read in this function;
	           confirm whether taxa listed here were meant to be skipped.

	Returns the (possibly replaced) root, or None if curroot was None.
	"""
	going = True
	while going and curroot != None and len(curroot.leaves()) >= 4:
		going = False
		for node in curroot.iternodes(): #walk through nodes
			if not node.istip: continue #only look at tips
			name = get_name(node.label).split("_")[1]
			parent = node.parent
			if node == curroot or parent == curroot or parent == None:
				continue #no paraphyletic tips for the root
			# compare the tip with the sisters of its parent
			for para in parent.get_sisters():
				if para.istip and name==get_name(para.label).split("_")[1]: # mask
					# presumably prune() returns the node the tip was attached
					# to -- TODO confirm against the tree class
					node = para.prune()	
					if len(curroot.leaves()) >= 4:
						# a root left with two children or an internal node left
						# with one child is collapsed with remove_kink
						if (node==curroot and node.nchildren==2) or (node!=curroot and node.nchildren==1):
							node,curroot = remove_kink(node,curroot)
					# request another pass of the while loop; the break only
					# leaves the loop over sisters
					going = True
					break
	return curroot

if __name__ == "__main__":
    # Expect the script name, a tree file and a y/n switch.
    if len(sys.argv) != 3:
        print("usage: python "+sys.argv[0]+" treefile para(y/n)")
        sys.exit()

    # Only the first line of the tree file is parsed; the handle is closed
    # as soon as it has been read.
    with open(sys.argv[1], "r") as infile:
        intree = newick3.parse(infile.readline())
    masked = mask_monophyletic_tips(intree, ignore=[])
    if sys.argv[2] == "y":
        masked = mask_paraphyletic_tips(masked, ignore=[])
    print(newick3.tostring(masked)+";\n")
Пример #15
0
def RT(homoDIR, tree_file_eneding, outDIR, min_ingroup_taxa,
       taxon_code_file_file):
    """Extract rooted ingroup clades and orthologs from homolog trees.

    ``taxon_code_file_file`` is read as tab-separated lines ``IN<tab>id`` or
    ``OUT<tab>id``; lines shorter than three characters are skipped. For
    every file in ``homoDIR`` ending with ``tree_file_eneding``:

    * every front name must be a known ingroup or outgroup id, otherwise the
      program exits;
    * trees with fewer than ``min_ingroup_taxa`` distinct ingroup names are
      skipped;
    * with at least one outgroup, each clade from
      ``tree_utils.extract_rooted_ingroup_clades`` is written as
      ``<outID>.inclade<n>`` and each ortholog with at least
      ``min_ingroup_taxa`` front labels as ``<outID>.inclade<n>.ortho<m>.tre``;
    * without outgroups and without repeated taxa, the tree itself is
      written as ``<outID>.unrooted-ortho.tre``.

    Args:
        homoDIR: directory with the homolog trees.
        tree_file_eneding: suffix selecting the trees to process.
        outDIR: directory the output files are written to.
        min_ingroup_taxa: minimum ingroup taxa; converted with ``int``.
        taxon_code_file_file: path of the IN/OUT taxon table.

    Raises:
        SystemExit: on a malformed taxon table line, on ids listed as both
            ingroup and outgroup, or on a tree name found in neither list.
    """
    if homoDIR[-1] != "/": homoDIR += "/"
    if outDIR[-1] != "/": outDIR += "/"
    min_ingroup_taxa = int(min_ingroup_taxa)

    INGROUPS = []
    OUTGROUPS = []
    with open(taxon_code_file_file, "r") as infile:
        for line in infile:
            if len(line) < 3:
                continue
            spls = line.strip().split("\t")
            # A line without a second tab field is a format error too,
            # rather than an IndexError on spls[1].
            has_id = len(spls) > 1
            if has_id and spls[0] == "IN":
                INGROUPS.append(spls[1])
            elif has_id and spls[0] == "OUT":
                OUTGROUPS.append(spls[1])
            else:
                print("Check taxon_code_file file format")
                sys.exit()

    # Sets are used for membership tests; the lists keep the file order for
    # the messages and for the tree_utils calls.
    ingroup_set = set(INGROUPS)
    outgroup_set = set(OUTGROUPS)
    shared = ingroup_set & outgroup_set
    if len(shared) > 0:
        print("Taxon ID %s in both ingroups and outgroups" % (shared,))
        sys.exit(0)
    print("%s ingroup taxa and %s outgroup taxa read" %
          (len(INGROUPS), len(OUTGROUPS)))
    print("Ingroups: %s" % (INGROUPS,))
    print("Outgroups: %s" % (OUTGROUPS,))

    for treefile in os.listdir(homoDIR):
        if not treefile.endswith(tree_file_eneding):
            continue
        with open(homoDIR + treefile, "r") as infile:
            intree = newick3.parse(infile.readline())
        curroot = intree
        all_names = tree_utils.get_front_names(curroot)
        num_taxa = len(set(all_names))
        print(treefile)

        # check taxonIDs
        ingroup_names = []
        outgroup_names = []
        for name in all_names:
            if name in ingroup_set:
                ingroup_names.append(name)
            elif name in outgroup_set:
                outgroup_names.append(name)
            else:
                print("%s not in ingroups or outgroups" % (name,))
                sys.exit()
        if len(set(ingroup_names)) < min_ingroup_taxa:
            print("not enough ingroup taxa in tree")
            continue

        outID = outDIR + tree_utils.get_clusterID(treefile)
        if len(outgroup_names) > 0:
            # at least one outgroup present, root and cut inclades
            inclades = tree_utils.extract_rooted_ingroup_clades(
                curroot, INGROUPS, OUTGROUPS, min_ingroup_taxa)
            inclade_count = 0
            for inclade in inclades:
                inclade_count += 1
                inclade_name = outID + ".inclade" + str(inclade_count)
                with open(inclade_name, "w") as outfile:
                    outfile.write(newick3.tostring(inclade) + ";\n")
                orthologs = tree_utils.get_ortho_from_rooted_inclade(inclade)
                ortho_count = 0
                for ortho in orthologs:
                    if len(tree_utils.get_front_labels(ortho)) < \
                            min_ingroup_taxa:
                        continue
                    ortho_count += 1
                    ortho_name = inclade_name + ".ortho" + \
                        str(ortho_count) + ".tre"
                    with open(ortho_name, "w") as outfile:
                        outfile.write(newick3.tostring(ortho) + ";\n")

        elif len(all_names) == num_taxa:
            # only output ortho tree when there is no taxon repeats
            with open(outID + ".unrooted-ortho.tre", "w") as outfile:
                outfile.write(newick3.tostring(curroot) + ";\n")

        else:
            # do not attempt to infer direction of gene duplication
            # without outgroup info
            print("duplicated taxa in unrooted tree")
Пример #16
0
        print "python trim_tips.py treDIR tree_file_ending outDIR relative_cutoff absolute_cutoff1 absolute_cutoff2"
        sys.exit(0)

    treDIR = sys.argv[1] + "/"
    file_ending = sys.argv[2]
    outDIR = sys.argv[3] + "/"
    relative_cutoff = float(sys.argv[4])
    absolute_cutoff1 = float(sys.argv[5])
    absolute_cutoff2 = float(sys.argv[6])

    done = []  #record clusterIDs that are done
    for i in os.listdir(treDIR):
        if i[-3:] == ".tt":
            done.append(i.split(".")[0])
    print done

    filecount = 0
    l = len(file_ending)
    for i in os.listdir(treDIR):
        if file_ending in i:
            clusterID = i.split(".")[0]
            if clusterID in done: continue
            print i
            filecount += 1
            with open(treDIR + i, "r") as infile:
                intree = newick3.parse(infile.readline())
            with open(outDIR + i + ".tt", "w") as outfile:
                outfile.write(newick3.tostring(cut_long_tips(intree)) + ";\n")

    if filecount == 0:
        print "No file name with", file_ending, "found in the treDIR"
				if sister.istip and get_name(node.label)==get_name(sister.label):
					if node.length > sister.length:
						node = node.prune()			
					else: node = sister.prune()
					if len(curroot.leaves()) >= 4:
						if node.nchildren==1 or (node==curroot and node.nchildren==2):
							node,curroot = remove_kink(node,curroot)
							#no kink if the original node had more than 2 children
					going = True
					break
	return curroot

if __name__ == "__main__":
    # Exactly one argument is expected: the directory with the trees.
    if len(sys.argv) != 2:
        print("python mask_tips_by_taxonID_genomes.py DIR")
        sys.exit()

    DIR = sys.argv[1] + "/"
    filecount = 0
    for tree_name in os.listdir(DIR):
        # only mask trees that have tips trimmed
        if not tree_name.endswith(".tt"):
            continue
        with open(DIR + tree_name, "r") as infile:
            intree = newick3.parse(infile.readline())
        print(tree_name)
        filecount += 1
        # The output file is opened before masking runs, as before.
        with open(DIR + tree_name + ".mm", "w") as outfile:
            masked = monophyly_masking_by_bl(intree)
            outfile.write(newick3.tostring(masked) + ";\n")

    if filecount == 0:
        print("No file name with 'best' or 'tt' or 'fasttree' found in the treDIR")
Пример #18
0
			#check to make sure that the ingroup and outgroup names were set correctly
			for name in names:
				if name not in INGROUPS and name not in OUTGROUPS:
					print "check name",name
					sys.exit()
			outgroup_names = get_front_outgroup_names(curroot)
			
			#if no outgroup at all, do not attempt to resolve gene duplication
			if len(outgroup_names) == 0:
				print "duplicated taxa in unrooted tree"
				
			#skip the homolog if there are duplicated outgroup taxa
			elif len(outgroup_names) > len(set(outgroup_names)): 
				print "outgroup contains taxon repeats"
				
			else: #at least one outgroup present and there's no outgroup duplication
				if curroot.nchildren == 2: #need to reroot
					temp,curroot = remove_kink(curroot,curroot)
				curroot = reroot_with_monophyletic_outgroups(curroot)
				#only return one tree after prunning
				if curroot != None:
					with open(outID+".reroot","w") as outfile:
						outfile.write(newick3.tostring(curroot)+";\n")
					ortho = prune_paralogs_from_rerooted_homotree(curroot)
					if len(set(get_front_names(curroot))) >= MIN_TAXA:
						with open(outID+".ortho.tre","w") as outfile:
							outfile.write(newick3.tostring(ortho)+";\n")
					else: print "not enough taxa after pruning"
				else: print "outgroup non-monophyletic"
			
Пример #19
0
	
	filecount = 0
	for i in os.listdir(inDIR):
		if "best" not in i and i[-3:] != ".mm": continue
		filecount += 1
		if "best" in i:
			clusterID = i.split(".")[1]
		else: clusterID = i.split(".")[0]
		print clusterID
		with open(inDIR+i,"r") as infile: #only 1 tree in each file
			intree = newick3.parse(infile.readline())
		curroot = intree
		if count_ingroup_taxa(curroot) < min_ingroup_taxa: continue
		subtrees = cut_long_internal_branches(curroot,branch_len_cutoff)
		if len(subtrees) > 0:
			count = 1
			for subtree in subtrees:
				if count_ingroup_taxa(subtree)>=min_ingroup_taxa and count_outgroup_taxa(subtree)>=min_outgroup_taxa:
					#fix bifurcating roots from cutting
					if subtree.nchildren == 2:
						subtree,subtree = remove_kink(subtree,subtree)
					with open(outDIR+clusterID+"_subtree"+str(count)+".tre","w") as outfile:
						outfile.write(newick3.tostring(subtree)+";\n")
						count += 1
	
	if filecount == 0:
		print "No file end with",file_ending,"found"
			
		
		
Пример #20
0
		print "python trim_tips.py treDIR tree_file_ending outDIR relative_cutoff absolute_cutoff1 absolute_cutoff2"
		sys.exit(0)

	treDIR = sys.argv[1]+"/"
	file_ending = sys.argv[2]
	outDIR = sys.argv[3]+"/"
	relative_cutoff = float(sys.argv[4])
	absolute_cutoff1 = float(sys.argv[5])
	absolute_cutoff2 = float(sys.argv[6])
	
	done = [] #record clusterIDs that are done
	for i in os.listdir(treDIR):
		if i[-3:] == ".tt":
			done.append(i.split(".")[0])
	print done
	
	filecount = 0
	l = len(file_ending)
	for i in os.listdir(treDIR):
		if file_ending in i:
			clusterID = i.split(".")[0]
			if clusterID in done: continue
			print i
			filecount += 1
			with open(treDIR+i,"r") as infile:
				intree = newick3.parse(infile.readline())
			with open(outDIR+i+".tt","w") as outfile:
				outfile.write(newick3.tostring(cut_long_tips(intree))+";\n")
	
	if filecount == 0:
		print "No file name with",file_ending,"found in the treDIR"