Exemplos de parse em Python, exemplos de newick3.parse em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: cut_long_internal_branches.py Projeto: NatJWalker-Hale/DODA

def main(inDIR, file_ending, branch_len_cutoff, min_taxa, outDIR):
    """Cut long internal branches and write the surviving subtrees.

    Each file in ``inDIR`` whose name ends with ``file_ending`` is expected to
    hold one newick tree on its first line.  The tree is split with
    ``cut_long_internal_branches`` at ``branch_len_cutoff`` and every subtree
    with at least ``min_taxa`` taxa is written to
    ``outDIR/<file name up to its first ".">_<n>.subtree``.

    Trees that already have fewer than ``min_taxa`` taxa are reported and
    skipped.  Every subtree is written through ``newick3.tostring``; nothing
    is copied verbatim from the input file.

    Args:
        inDIR: directory holding the input trees.
        file_ending: file-name suffix selecting the input trees.
        branch_len_cutoff: branches longer than this are cut (anything
            ``float()`` accepts).
        min_taxa: minimum number of taxa per tree (anything ``int()`` accepts).
        outDIR: directory receiving the ``.subtree`` files.

    Raises:
        AssertionError: if no file in ``inDIR`` ends with ``file_ending``.
    """
    if inDIR[-1] != "/":
        inDIR += "/"
    if outDIR[-1] != "/":
        outDIR += "/"
    min_taxa = int(min_taxa)
    cutoff = float(branch_len_cutoff)
    filecount = 0
    print("cutting branches longer than %s" % (cutoff,))
    for i in os.listdir(inDIR):
        if not i.endswith(file_ending):
            continue
        print(i)
        filecount += 1
        with open(inDIR + i, "r") as infile:  # only 1 tree in each file
            intree = newick3.parse(infile.readline())

        num_taxa = count_taxa(intree)
        if num_taxa < min_taxa:
            print("Tree has %s less than %s taxa" % (num_taxa, min_taxa))
            continue

        current_size = len(get_front_labels(intree))
        # Size of the original .tre, for the log line only.  split() keeps the
        # whole name when ".tre" does not occur in it, whereas slicing with
        # find() would drop the last character in that case.
        raw_tree_path = inDIR + i.split(".tre")[0] + ".tre"
        try:
            with open(raw_tree_path, "r") as infile:
                raw_tree_size = len(
                    get_front_labels(newick3.parse(infile.readline())))
        except Exception:
            # Best effort: the original .tre is missing or unreadable (e.g. the
            # tree was not refined this round), so report the input tree's own
            # size.  Exception, unlike a bare except, lets Ctrl-C through.
            raw_tree_size = current_size

        print(".tre: %s tips; %s: %s tips" %
              (raw_tree_size, file_ending, current_size))
        subtrees = cut_long_internal_branches(intree, cutoff)
        if len(subtrees) == 0:
            print("No tree with at least %s taxa" % (min_taxa,))
            continue

        out_prefix = outDIR + i.split(".")[0] + "_"
        count = 0
        outsizes = ""
        for subtree in subtrees:
            if count_taxa(subtree) < min_taxa:
                continue
            if subtree.nchildren == 2:  # fix bifurcating roots from cutting
                _, subtree = remove_kink(subtree, subtree)
            count += 1
            with open(out_prefix + str(count) + ".subtree", "w") as outfile:
                outfile.write(newick3.tostring(subtree) + ";\n")
            outsizes += str(len(subtree.leaves())) + ", "
        print("%s tree(s) wirtten. Sizes: %s" % (count, outsizes))
    assert filecount > 0, "No file end with " + file_ending + " in " + inDIR

Exemplo n.º 2

0

Exibir arquivo

Arquivo: taxon_name_subst_rev.py Projeto: NatJWalker-Hale/DODA

def taxon_name_subst(original,table=sys.path[0]+"/reverse_taxon_table"):
    """Replace sequence acronyms by long names in a fasta or newick file.

    ``table`` is a tab-separated file; the first column (with "|" replaced by
    "_") is the lookup key and the second column the replacement name.  Lines
    with fewer than two columns are ignored.

    ``original`` is treated as fasta when its first line starts with ">",
    otherwise as a file with one newick tree on its first line.  The result is
    written to ``original + ".name"``.  In the tree case each old/new label
    pair is also printed.

    Args:
        original: path of the fasta or tree file to rename.
        table: path of the tab-separated lookup table.
    """
    name_map = {}  # key is seq acronym, value is full taxon name
    with open(table, "r") as infile:
        for line in infile:
            spls = line.strip().split("\t")
            if len(spls) > 1:
                name_map[spls[0].replace("|", "_")] = spls[1]

    # startswith() copes with an empty first line, where indexing would raise
    with open(original, "r") as infile:
        is_fasta = infile.readline().startswith(">")

    if is_fasta:  # for fasta files
        # with-blocks close both handles even if a lookup fails half-way
        with open(original, "r") as infile:
            with open(original + ".name", "w") as outfile:
                for line in infile:
                    if line.startswith(">"):
                        long_id = get_long_id(line.strip()[1:], name_map)
                        outfile.write(">" + long_id + "\n")
                    else:
                        outfile.write(line)
    else:  # tree file
        with open(original, "r") as infile:
            intree = newick3.parse(infile.readline())
        for leaf in intree.leaves():
            old_label = leaf.label
            leaf.label = get_long_id(old_label, name_map)
            print("%s %s" % (old_label, leaf.label))
        with open(original + ".name", "w") as outfile:
            outfile.write(newick3.tostring(intree) + ";\n")

Exemplo n.º 3

0

Exibir arquivo

def main(fasta, treDIR, tree_file_ending, outDIR):
    """Write one fasta file per tree, holding the sequences of its tips.

    All sequences of ``fasta`` are loaded into memory, keyed by name.  For
    every file in ``treDIR`` ending with ``tree_file_ending`` the first line
    is parsed as a newick tree and the sequences of its front labels are
    written to ``outDIR``.  The output name is the cluster ID followed by
    "_rr.fa" when the ID already ends in "rr", otherwise by "rr.fa".

    Raises:
        AssertionError: if no file in ``treDIR`` has the requested ending.
        KeyError: if a tree label has no sequence in ``fasta``.
    """
    if treDIR[-1] != "/":
        treDIR = treDIR + "/"
    if outDIR[-1] != "/":
        outDIR = outDIR + "/"

    print("Reading fasta file %s" % (fasta,))
    # key is seqID, value is seq
    seqDICT = dict((rec.name, rec.seq) for rec in read_fasta_file(fasta))

    print("Writing fasta files")
    filecount = 0
    for tree_name in os.listdir(treDIR):
        if not tree_name.endswith(tree_file_ending):
            continue
        print(tree_name)
        filecount += 1
        with open(treDIR + tree_name, "r") as infile:
            intree = newick3.parse(infile.readline())

        clusterID = tree_utils.get_clusterID(tree_name)
        suffix = "_rr.fa" if clusterID.endswith("rr") else "rr.fa"
        with open(outDIR + clusterID + suffix, "w") as outfile:
            for label in tree_utils.get_front_labels(intree):
                outfile.write(">" + label + "\n" + seqDICT[label] + "\n")
    assert filecount > 0,\
     "No file ends with "+tree_file_ending+" found in "+treDIR

Exemplo n.º 4

0

Exibir arquivo

Arquivo: mask_tips_by_taxonID_transcripts.py Projeto: jlanga/smsk_selection

def main(treDIR, clnDIR, para, intree_file_ending=INTREE_FILE_ENDING):
    """Mask tips of every tree in ``treDIR`` and write ``<tree file>.mm``.

    Cleaned alignments (``*.aln-cln`` in ``clnDIR``) are matched to trees by
    cluster ID.  For each tree, the number of characters left in every
    sequence after removing "-", "X", "x", "?" and "*" is handed to
    ``mask_monophyletic_tips`` and, when ``para`` is "y", also to
    ``mask_paraphyletic_tips``.

    Args:
        treDIR: directory with the input trees; output is written here too.
        clnDIR: directory with the ``.aln-cln`` alignments.
        para: "y" to also mask paraphyletic tips, "n" otherwise.
        intree_file_ending: suffix selecting the input trees.

    Raises:
        AssertionError: if ``para`` is neither "y" nor "n", if a cluster ID
            occurs twice in ``clnDIR``, or if no tree file was found.
    """
    if treDIR[-1] != "/":
        treDIR = treDIR + "/"
    if clnDIR[-1] != "/":
        clnDIR = clnDIR + "/"
    assert para in ("y", "n"), "mask paraphyletic tips? (y/n)"
    mask_para = (para == "y")

    filematch = {}  # key is clusterID, value is the .aln-cln file
    for cln_name in os.listdir(clnDIR):
        if not cln_name.endswith(".aln-cln"):
            continue
        clusterID = get_clusterID(cln_name)
        assert clusterID not in filematch, \
         "The clusterID "+clusterID+" repeats in "+clnDIR
        filematch[clusterID] = cln_name

    filecount = 0
    for tree_name in os.listdir(treDIR):
        if not tree_name.endswith(intree_file_ending):
            continue
        with open(treDIR + tree_name, "r") as infile:
            intree = newick3.parse(infile.readline())
        print(tree_name)
        clusterID = get_clusterID(tree_name)
        filecount += 1

        chrDICT = {}  # key is seqid, value is number of unambiguous chrs
        for rec in read_fasta_file(clnDIR + filematch[clusterID]):
            stripped = rec.seq
            for ch in ("-", "X", "x", "?", "*"):  # ignore gaps, xs and Xs
                stripped = stripped.replace(ch, "")
            rec.seq = stripped
            chrDICT[rec.name] = len(stripped)

        curroot = mask_monophyletic_tips(intree, chrDICT)
        if mask_para:
            curroot = mask_paraphyletic_tips(curroot, chrDICT)
        with open(treDIR + tree_name + ".mm", "w") as outfile:
            outfile.write(newick3.tostring(curroot) + ";\n")
    assert filecount > 0, \
     "No file ends with "+intree_file_ending+" found in "+treDIR

Exemplo n.º 5

0

Exibir arquivo

def main(treDIR,
         clnDIR,
         para,
         intree_file_ending=INTREE_FILE_ENDING,
         ignore=GENOMES):
    """Mask tips of every tree in ``treDIR`` and write ``<tree file>.mm``.

    Cleaned alignments (``*.aln-cln`` in ``clnDIR``) are matched to trees by
    cluster ID and passed, together with the parsed tree, to ``mask``.

    Args:
        treDIR: directory with the input trees; output is written here too.
        clnDIR: directory with the ``.aln-cln`` alignments.
        para: "y" to also mask paraphyletic tips, "n" otherwise.
        intree_file_ending: suffix selecting the input trees.
        ignore: forwarded to ``mask`` as its ``ignore`` argument.

    Raises:
        AssertionError: if ``para`` is neither "y" nor "n", if a cluster ID
            occurs twice in ``clnDIR``, or if no tree file was found.
    """
    if treDIR[-1] != "/":
        treDIR += "/"
    if clnDIR[-1] != "/":
        clnDIR += "/"
    assert para == "y" or para == "n", "mask paraphyletic tips? (y/n)"
    mask_para = para == "y"
    filecount = 0

    filematch = {}  # key is clusterID, value is the .aln-cln file
    for i in os.listdir(clnDIR):
        if i.endswith(".aln-cln"):
            clusterID = get_clusterID(i)
            assert clusterID not in filematch, \
             "The clusterID "+clusterID+" repeats in "+clnDIR
            filematch[clusterID] = i

    for i in os.listdir(treDIR):
        if not i.endswith(intree_file_ending):
            continue
        with open(treDIR + i, "r") as infile:
            intree = newick3.parse(infile.readline())
        print(i)
        clusterID = get_clusterID(i)
        filecount += 1
        # Forward the caller's ignore list; passing the GENOMES constant here
        # would make the parameter a no-op.
        curroot = mask(intree,
                       clnDIR + filematch[clusterID],
                       para=mask_para,
                       ignore=ignore)
        with open(treDIR + i + ".mm", "w") as outfile:
            outfile.write(newick3.tostring(curroot) + ";\n")
    assert filecount > 0, \
     "No file ends with "+intree_file_ending+" found in "+treDIR

Exemplo n.º 6

0

Exibir arquivo

Arquivo: write_alignments_from_orthologs.py Projeto: jlanga/smsk_selection

def ortho_to_aln(alndir, tredir, outdir, ortho_tree_file_ending=".tre"):
    """Write one alignment file per ortholog tree.

    For each file in ``tredir`` ending with ``ortho_tree_file_ending`` the
    alignment ``<name up to first ".">.fa.mafft.aln`` is read from ``alndir``
    and the sequences of the tree's front labels are written to ``outdir``.
    The output file name is the tree file name with the ending replaced by
    ".aln"; each header is ``tree_utils.get_name(label)`` (per the original
    docstring, this shortens the seq id to the taxon id).

    Raises:
        AssertionError: if no tree file with the requested ending is found.
        KeyError: if a tree label is missing from the alignment.
    """
    if alndir[-1] != "/":
        alndir = alndir + "/"
    if tredir[-1] != "/":
        tredir = tredir + "/"
    if outdir[-1] != "/":
        outdir = outdir + "/"

    filecount = 0
    for tree_name in os.listdir(tredir):
        if not tree_name.endswith(ortho_tree_file_ending):
            continue
        filecount += 1
        print(tree_name)

        # read the alignment into a dictionary: key is seqID, value is seq
        aln_path = alndir + tree_name.split(".")[0] + ".fa.mafft.aln"
        seqDICT = dict((rec.name, rec.seq) for rec in read_fasta_file(aln_path))

        # read in tree tips and write the output alignment
        with open(tredir + tree_name, "r") as infile:
            intree = newick3.parse(infile.readline())
        labels = tree_utils.get_front_labels(intree)

        out_path = outdir + tree_name.replace(ortho_tree_file_ending, ".aln")
        with open(out_path, "w") as outfile:
            for lab in labels:
                header = ">" + tree_utils.get_name(lab) + "\n"
                outfile.write(header + seqDICT[lab] + "\n")
    assert filecount > 0,\
     "No file ends with "+ortho_tree_file_ending+" was found in "+tredir

Exemplo n.º 7

0

Exibir arquivo

def get_121(indir, tree_file_ending, min_taxa, outdir, min_bootstrap=0.0):
    """Copy one-to-one ortholog trees from ``indir`` to ``outdir``.

    A tree qualifies when no taxon name occurs twice among its tips, it has
    at least ``min_taxa`` taxa and, when ``min_bootstrap`` is above zero,
    ``pass_boot_filter(tree, min_bootstrap)`` is true.  Qualifying files are
    copied as ``<second "."-separated field of the file name>.1to1ortho.tre``.

    Args:
        indir: directory holding the input trees.
        tree_file_ending: suffix selecting the input trees.
        min_taxa: minimum number of taxa (anything ``int()`` accepts).
        outdir: directory receiving the copies.
        min_bootstrap: bootstrap threshold (anything ``float()`` accepts);
            0 disables the filter.

    Raises:
        AssertionError: if no file in ``indir`` has the requested ending.
        IOError/OSError: if a qualifying tree cannot be copied.
    """
    import shutil  # local import: only this function needs it

    if indir[-1] != "/":
        indir += "/"
    if outdir[-1] != "/":
        outdir += "/"
    min_taxa = int(min_taxa)
    min_bootstrap = float(min_bootstrap)
    infile_count, outfile_count = 0, 0
    print("Filter one-to-one homologs with average bootstrap of at least %s" %
          (min_bootstrap,))
    for i in os.listdir(indir):
        if not i.endswith(tree_file_ending):
            continue
        infile_count += 1
        with open(indir + i, "r") as infile:  # only 1 tree in each file
            intree = newick3.parse(infile.readline())
        names = get_front_names(intree)
        num_tips, num_taxa = len(names), len(set(names))
        print("number of tips: %s number of taxa: %s" % (num_tips, num_taxa))
        if num_tips != num_taxa or num_taxa < min_taxa:
            continue
        if min_bootstrap > 0.0 and not pass_boot_filter(intree, min_bootstrap):
            continue
        outname = i.split(".")[1] + ".1to1ortho.tre"
        # shutil.copy works for names containing spaces or shell
        # metacharacters and raises on failure, so the counter below only
        # counts files that were really written.
        shutil.copy(indir + i, outdir + outname)
        print("%s written to out dir" % (i,))
        outfile_count += 1
    assert infile_count > 0,\
     "No file ends with "+tree_file_ending+" was found in "+indir
    print("%s files read, %s written to %s" %
          (infile_count, outfile_count, outdir))

Exemplo n.º 8

0

Exibir arquivo

Arquivo: refine_homolog.py Projeto: NatJWalker-Hale/DODA

def refine(query_fasta, start_fasta, deep_paralog_cutoff, num_cores):
    """Refine a homolog: build a tree, trim, mask, cut, and write subclusters.

    A tree is built from ``start_fasta`` with ``fasta_to_tree.fasta_to_tree``,
    its tips are trimmed (relative cutoff ``deep_paralog_cutoff``, absolute
    cutoff twice that) and masked, and the result is written to
    ``<tree file>.tt.mm``.  The tree is then cut at ``deep_paralog_cutoff``;
    every subtree with more than four distinct tip labels, at least one of
    which is a sequence name from ``query_fasta``, is written as
    ``<base>_<n>.subtree`` together with the matching ``<base>_<n>.fa``.

    Args:
        query_fasta: fasta file whose sequence names mark subtrees to keep.
        start_fasta: fasta file of the homolog to refine.
        deep_paralog_cutoff: branch-length cutoff (anything ``float()``
            accepts).
        num_cores: forwarded to ``fasta_to_tree.fasta_to_tree``.

    Returns:
        List of paths of the ``.fa`` files written; empty when no tree could
        be built or masking returned None.
    """
    gene_name = get_filename_from_path(query_fasta)[1].split(".")[0]
    # NOTE(review): outdir is concatenated directly with file names below, so
    # it presumably ends with a path separator -- confirm against
    # get_filename_from_path.
    outdir, fasta = get_filename_from_path(start_fasta)
    deep_paralog_cutoff = float(deep_paralog_cutoff)
    query_ids = set(s.name for s in seq.read_fasta_file(query_fasta))
    new_fasta = []  # list of output refined fasta files
    print("%s %s" % (outdir, fasta))

    # make a tree from the start_fasta
    tree_path = fasta_to_tree.fasta_to_tree(outdir, fasta, num_cores, "aa")
    if tree_path is None:
        return []
    with open(tree_path, "r") as infile:
        intree = newick3.parse(infile.readline())
    root = trim_tips.trim(intree,
                          relative_cutoff=deep_paralog_cutoff,
                          absolute_cutoff=deep_paralog_cutoff * 2)
    if os.path.exists(outdir + fasta + ".pasta.aln-cln"):
        clnfile = outdir + fasta + ".pasta.aln-cln"
    else:
        clnfile = outdir + fasta + ".mafft.aln-cln"
    root = mask_tips_by_taxonID_transcripts.mask(root,
                                                 clnfile=clnfile,
                                                 para="y",
                                                 ignore=GENOMES)
    if root is None:
        return new_fasta

    with open(tree_path + ".tt.mm", "w") as outfile:
        # terminate the newick string with ";" like every other tree written
        # by this pipeline
        outfile.write(newick3.tostring(root) + ";\n")
    subtrees = cut_long_internal_branches.cut_long_internal_branches(
        root, cutoff=deep_paralog_cutoff)

    base_name = fasta.split(".")[0]
    seqDICT = {}  # key is seqid, value is seq
    for s in seq.read_fasta_file(start_fasta):
        seqDICT[s.name] = s.seq

    count = 0
    for subtree in subtrees:
        if subtree is None:
            continue
        labels = tree_utils.get_front_labels(subtree)
        label_set = set(labels)
        if len(label_set) <= 4 or not (label_set & query_ids):
            continue
        count += 1
        out_prefix = outdir + base_name + "_" + str(count)
        with open(out_prefix + ".subtree", "w") as outfile:
            outfile.write(newick3.tostring(subtree) + ";\n")
        with open(out_prefix + ".fa", "w") as outfile:
            for seqid in labels:
                # only a missing sequence is tolerated; I/O errors propagate
                try:
                    sequence = seqDICT[seqid]
                except KeyError:
                    print("%s not found in fasta file" % (seqid,))
                    continue
                outfile.write(">" + seqid + "\n" + sequence + "\n")
        new_fasta.append(out_prefix + ".fa")

    return new_fasta

Exemplo n.º 9

0

Exibir arquivo

Arquivo: trim_tips.py Projeto: NatJWalker-Hale/DODA

def main(DIR, tree_file_ending, relative_cut, absolute_cut):
    """Trim tips of every matching tree in ``DIR`` and write ``<file>.tt``.

    The first line of each file ending with ``tree_file_ending`` is parsed as
    a newick tree and passed to ``trim`` with both cutoffs converted to
    float.  Nothing is written for a tree when ``trim`` returns None.

    Raises:
        AssertionError: if no file in ``DIR`` has the requested ending.
    """
    if DIR[-1] != "/":
        DIR = DIR + "/"

    filecount = 0
    for tree_name in os.listdir(DIR):
        if not tree_name.endswith(tree_file_ending):
            continue
        print(tree_name)
        filecount += 1
        tree_path = DIR + tree_name
        with open(tree_path, "r") as infile:
            intree = newick3.parse(infile.readline())
        outtree = trim(intree, float(relative_cut), float(absolute_cut))
        if outtree != None:  # comparison operator kept as in the original
            with open(tree_path + ".tt", "w") as outfile:
                outfile.write(newick3.tostring(outtree) + ";\n")
    assert filecount > 0, \
     "No file end with "+tree_file_ending+" found in "+DIR

Exemplo n.º 10

0

Exibir arquivo

Arquivo: trim_tips_module.py Projeto: wum5/JaltPhylo

def main(DIR,tree_file_ending,relative_cut,absolute_cut1,absolute_cut2):
    """Trim tips of every matching tree in ``DIR`` and write ``<file>.tt``.

    Same driver as the two-cutoff variant, but ``trim`` receives one relative
    and two absolute cutoffs, each converted to float.  Nothing is written
    for a tree when ``trim`` returns None.

    Raises:
        AssertionError: if no file in ``DIR`` has the requested ending.
    """
    if DIR[-1] != "/":
        DIR = DIR + "/"

    filecount = 0
    for fname in os.listdir(DIR):
        if not fname.endswith(tree_file_ending):
            continue
        print(fname)
        filecount += 1
        with open(DIR + fname, "r") as infile:
            intree = newick3.parse(infile.readline())
        outtree = trim(intree,
                       float(relative_cut),
                       float(absolute_cut1),
                       float(absolute_cut2))
        if outtree != None:  # comparison operator kept as in the original
            with open(DIR + fname + ".tt", "w") as outfile:
                outfile.write(newick3.tostring(outtree) + ";\n")
    assert filecount > 0, \
        "No file end with "+tree_file_ending+" found in "+DIR

Exemplo n.º 11

0

Exibir arquivo

Arquivo: paint_branches.py Projeto: chinchliff/physcripts

            if first_line:
                first_line = False
                column_labels = parts
                continue
            
            if len(parts) > 1:
#                print parts[1:]
                data[parts[0]] = dict(zip(column_labels[1:],[float(p) for p in parts[1:]]))
        args.node_values[0].close()

    # load the tree
    tree = None
    while tree == None and line != "":
        line = args.tree[0].readline()
        try:
            tree = newick3.parse(StringIO(line))
        except AttributeError:
            continue
    if tree == None:
        sys.exit("Could not find a tree in: " + args.tree[0].name)
    args.tree[0].close()
    
    # now we will paint the branches
    if args.node_values != None and args.label != None: # use values from the node-values file
        
        for column_label in args.label:

            for node in tree.iternodes():
                if node.label in data:
                    this_bin = get_bin(data[node.label][column_label], color_bins)
                    if this_bin != None:

Exemplo n.º 12

0

Exibir arquivo

Arquivo: subsample_alignment_phylogenetic.py Projeto: chinchliff/physcripts

        help='the tree.')

    parser.add_argument('-q', '--partitions', type=open, required=True, \
        help='the location of the raxml partitions file corresponding to the alignment to be subsampled.')
            
    parser.add_argument('-x', '--random-seed', type=int, required=False, \
        help='an integer seed for the random number generator function')
    
    parser.add_argument('-f', '--reduction-factor', type=float, required=False, \
        help='a decimal value specifying how sparse to make the subsampling. the number of taxa that will be subsampled will be reduced proportionally to this value, thus a value of f 0.5 leads to a reduction by (approximately) half')
    
    parser.add_argument('-n', '--output-label', required=False, default='', \
        help='a label to be attached to output files')

    args = parser.parse_args()
    
    a = Alignment(args.alignment, args.partitions)
    t = newick3.parse(args.tree)
    f = args.reduction_factor if 'reduction_factor' in args else 1
    
    s = PhylogeneticSubsampler(alignment=a, tree=t, rates=args.rates, reduction_factor=f)
    
    s.subsample()

    s.write_subsampled_output(args.output_label)
    
    s.report_sampled_partitions()
    
    print('files have been written to:\n' + s.output_label + '.sampling_matrix.txt\n' + s.output_label + '.phy\n' + s.output_label + '.partitions.txt\n' \
              'sampling proportion is ' + str(s.get_sampling_proportion()))

Exemplo n.º 13

0

Exibir arquivo

                first_line = False
                column_labels = parts
                continue

            if len(parts) > 1:
                #                print parts[1:]
                data[parts[0]] = dict(
                    zip(column_labels[1:], [float(p) for p in parts[1:]]))
        args.node_values[0].close()

    # load the tree
    tree = None
    while tree == None and line != "":
        line = args.tree[0].readline()
        try:
            tree = newick3.parse(StringIO(line))
        except AttributeError:
            continue
    if tree == None:
        sys.exit("Could not find a tree in: " + args.tree[0].name)
    args.tree[0].close()

    # now we will paint the branches
    if args.node_values != None and args.label != None:  # use values from the node-values file

        for column_label in args.label:

            for node in tree.iternodes():
                if node.label in data:
                    this_bin = get_bin(data[node.label][column_label],
                                       color_bins)

Exemplo n.º 14

0

Exibir arquivo

Arquivo: strip_node_labels.py Projeto: chinchliff/physcripts

#!/usr/bin/env python

if __name__ == '__main__':
    # Print every tree of a newick file (one tree per line) with its node
    # labels stripped.

    import newick3, phylo3, sys

    if len(sys.argv) < 2:
        print("usage: strip_node_labels.py <treefile>")
        sys.exit()

    treefname = sys.argv[1]

    # the with-block closes the file even if a line fails to parse
    with open(treefname, "r") as treefile:
        for line in treefile:
            tree = newick3.parse(line)
            print(newick3.to_string(tree, use_node_labels=False))

Exemplo n.º 15

0

Exibir arquivo

Arquivo: create_genus_tree_by_masking.py Projeto: NatJWalker-Hale/alignment_and_tree_tools

def mask_paraphyletic_tips(curroot,ignore=[]):
	"""Prune tips whose name matches a tip among their parent's sisters.

	The "name" compared is the second "_"-separated field of
	get_name(label).  Passes over the tree are repeated until a pass prunes
	nothing, curroot becomes None, or fewer than four leaves remain.

	curroot -- root node of the tree to mask
	ignore  -- accepted for interface compatibility; not read in this body

	Returns the (possibly replaced) root node.
	"""
	going = True
	# another pass is made only if the previous one pruned something
	while going and curroot != None and len(curroot.leaves()) >= 4:
		going = False
		for node in curroot.iternodes(): #walk through nodes
			if not node.istip: continue #only look at tips
			name = get_name(node.label).split("_")[1]
			parent = node.parent
			if node == curroot or parent == curroot or parent == None:
				continue #no paraphyletic tips for the root
			for para in parent.get_sisters():
				if para.istip and name==get_name(para.label).split("_")[1]: # mask
					# prune() returns a node that is rebound to `node`
					# (presumably the pruned tip's former parent -- confirm)
					node = para.prune()	
					if len(curroot.leaves()) >= 4:
						# a root left with two children, or an inner node left
						# with one, is handed to remove_kink, which may also
						# return a new root
						if (node==curroot and node.nchildren==2) or (node!=curroot and node.nchildren==1):
							node,curroot = remove_kink(node,curroot)
					going = True
					# leaves only the sister loop; the walk over iternodes()
					# continues on the modified tree
					break
	return curroot

if __name__ == "__main__":
    # Command-line driver: mask monophyletic tips of the tree in argv[1] and,
    # when argv[2] is "y", paraphyletic tips as well; print the result.
    if len(sys.argv) != 3:
        print("usage: python "+sys.argv[0]+" treefile para(y/n)")
        sys.exit()

    # read the single newick line inside a with-block so the handle is closed
    with open(sys.argv[1], "r") as infile:
        intree = newick3.parse(infile.readline())
    masked = mask_monophyletic_tips(intree,ignore=[])
    if sys.argv[2] == "y":
        masked = mask_paraphyletic_tips(masked,ignore=[])
    print(newick3.tostring(masked)+";\n")

Exemplo n.º 16

0

Exibir arquivo

Arquivo: fill_tips_from_taxonomy.py Projeto: chinchliff/physcripts

        db_cursor.execute("SELECT name FROM taxonomy WHERE left_value > ? AND " \
            "right_value < ? AND name_class == 'scientific name' AND node_rank LIKE ?", (left_value, right_value, rank.strip()))
        return [row[0] for row in db_cursor.fetchall()]
    #    for row in db_cursor.fetchall():
    #        print row
    
    #    return []

    if len(sys.argv) < 4:
        print("usage: fill_tips_from_taxonomy.py <treefile> <taxonomydb> <outfile>")
        sys.exit(0)

    tree = None
    with open(sys.argv[1],"r") as intree_file:
        tree = newick3.parse(intree_file.readline())
    
        conn = sqlite3.connect(sys.argv[2])
        c = conn.cursor()
    
        for tip in tree.leaves():
            for child_name in get_child_taxa(tip.label, "genus", c):
                child = phylo3.Node()
                child.label = child_name
                child.istip = True
    #            print child.label
                tip.add_child(child)
    #            print([t.label for t in tip.children])
    
    with open(sys.argv[3],"w") as outfile:
    #    print(newick3.to_string(tree)+";")

Exemplo n.º 17

0

Exibir arquivo

Arquivo: RUN.py Projeto: chinchliff/taxonjackknife

def simulate_random_rates(tree_label, tree_function, branch_lengths_function):
    """Simulate a ten-partition nucleotide alignment on a random tree.

    A tree string is drawn with ``tree_function(branch_lengths_function)``,
    written to ``<tree_label>.tre``, and an INDELible control file
    (``control.txt`` in the current directory) is generated with ten GTR
    models, ten copies of the tree with decreasing tree lengths and ten
    partitions of 500 sites each.  The ``indelible`` executable is then run;
    the whole step is repeated with a fresh tree for as long as its output
    contains "ERROR in [TREE] block".

    The per-partition files ``<tree_label>_part_<k>_TRUE.phy`` are finally
    concatenated into ``<tree_label>_combined_aln.phy`` and a partitions file
    ``<tree_label>_combined_part.txt`` is written.

    The alignment header uses ``n_tips_per_tree``, which is not defined in
    this function and must be available from the enclosing scope.

    Args:
        tree_label: prefix for every file produced.
        tree_function: callable returning a newick string when called with
            ``branch_lengths_function``.
        branch_lengths_function: passed through to ``tree_function``.

    Returns:
        Tuple ``(tree file name, alignment file name, partitions file name,
        model_rates)`` where ``model_rates`` maps ``"p<j>"`` to the sum of
        the partition's six GTR values times its scaled tree length.
    """
    indelible_control_file_text = """\
[TYPE] NUCLEOTIDE 2	//  nucleotide simulation using algorithm from method 2.

[MODEL]    gtr1
  [submodel]  GTR {models[0]}
  [statefreq] {statefreqs[0]}

[MODEL]    gtr2
  [submodel]  GTR {models[1]}
  [statefreq] {statefreqs[1]}

[MODEL]    gtr3
  [submodel]  GTR {models[2]}
  [statefreq] {statefreqs[2]}

[MODEL]    gtr4
  [submodel]  GTR {models[3]}
  [statefreq] {statefreqs[3]}

[MODEL]    gtr5
  [submodel]  GTR {models[4]}
  [statefreq] {statefreqs[4]}

[MODEL]    gtr6
  [submodel]  GTR {models[5]}
  [statefreq] {statefreqs[5]}

[MODEL]    gtr7
  [submodel]  GTR {models[6]}
  [statefreq] {statefreqs[6]}

[MODEL]    gtr8
  [submodel]  GTR {models[7]}
  [statefreq] {statefreqs[7]}

[MODEL]    gtr9
  [submodel]  GTR {models[8]}
  [statefreq] {statefreqs[8]}

[MODEL]    gtr10
  [submodel]  GTR {models[9]}
  [statefreq] {statefreqs[9]}

[TREE] tree1 {tree}
[treelength] {tree_lengths[0]}
[TREE] tree2 {tree}
[treelength] {tree_lengths[1]}
[TREE] tree3 {tree}
[treelength] {tree_lengths[2]}
[TREE] tree4 {tree}
[treelength] {tree_lengths[3]}
[TREE] tree5 {tree}
[treelength] {tree_lengths[4]}
[TREE] tree6 {tree}
[treelength] {tree_lengths[5]}
[TREE] tree7 {tree}
[treelength] {tree_lengths[6]}
[TREE] tree8 {tree}
[treelength] {tree_lengths[7]}
[TREE] tree9 {tree}
[treelength] {tree_lengths[8]}
[TREE] tree10 {tree}
[treelength] {tree_lengths[9]}

[PARTITIONS] part1   [tree1 gtr1 {part_length}]
[PARTITIONS] part2   [tree2 gtr2 {part_length}]
[PARTITIONS] part3   [tree3 gtr3 {part_length}]
[PARTITIONS] part4   [tree4 gtr4 {part_length}]
[PARTITIONS] part5   [tree5 gtr5 {part_length}]
[PARTITIONS] part6   [tree6 gtr6 {part_length}]
[PARTITIONS] part7   [tree7 gtr7 {part_length}]
[PARTITIONS] part8   [tree8 gtr8 {part_length}]
[PARTITIONS] part9   [tree9 gtr9 {part_length}]
[PARTITIONS] part10   [tree10 gtr10 {part_length}]

[EVOLVE]
  part1 1 {tree_label}_part_1   //  1 replicate generated from partition 'part1' in file '{tree_label}_part_1.fas'
  part2 1 {tree_label}_part_2   //  1 replicate generated from partition 'part2' in file '{tree_label}_part_2.fas'
  part3 1 {tree_label}_part_3   //  1 replicate generated from partition 'part3' in file '{tree_label}_part_3.fas'
  part4 1 {tree_label}_part_4   //  1 replicate generated from partition 'part4' in file '{tree_label}_part_4.fas'
  part5 1 {tree_label}_part_5   //  1 replicate generated from partition 'part5' in file '{tree_label}_part_5.fas'
  part6 1 {tree_label}_part_6   //  1 replicate generated from partition 'part6' in file '{tree_label}_part_6.fas'
  part7 1 {tree_label}_part_7   //  1 replicate generated from partition 'part7' in file '{tree_label}_part_7.fas'
  part8 1 {tree_label}_part_8   //  1 replicate generated from partition 'part8' in file '{tree_label}_part_8.fas'
  part9 1 {tree_label}_part_9   //  1 replicate generated from partition 'part9' in file '{tree_label}_part_9.fas'
  part10 1 {tree_label}_part_10 //  1 replicate generated from partition 'part10' in file '{tree_label}_part_10.fas'
"""

    # simulation parameters
    part_length = 500
    n_parts = 10  # the template above is written out for exactly ten parts
    aln_length = part_length * n_parts

    # repeat until we get an acceptable tree
    while True:

        # randomly generate a tree
        tree_string = tree_function(branch_lengths_function)
        t = newick3.parse(StringIO(tree_string))

        models = []
        tree_lengths = []
        model_rates = {}
        statefreqs = []
        for j in range(n_parts):

            # partition j+1 evolves on a tree 1/(j+1) as long as the original
            slowdown_scalar = 1
            scaled_length = t.subtree_length / ((j+1) * slowdown_scalar)
            tree_lengths.append(scaled_length)

            # same six GTR values for every partition, in random order
            gtr_values = [0.3,0.4,0.5,0.7,0.9,1]
            random.shuffle(gtr_values)
            models.append(" ".join([str(v) for v in gtr_values]))
            model_rates['p'+str(j)] = sum(gtr_values) * scaled_length

            # same four state frequencies for every partition, in random order
            freq_values = [0.15,0.2,0.3,0.35]
            random.shuffle(freq_values)
            statefreqs.append(" ".join([str(v) for v in freq_values]))

        # save the tree topology to a file
        with open(tree_label + ".tre", "w") as tree_file:
            tree_file.write(tree_string)

        # write a control file for indelible
        with open("control.txt","w") as control_file:
            control_file.write(indelible_control_file_text.format(
                tree='('+tree_string.strip(';')+');', models=models,
                statefreqs=statefreqs, part_length=part_length,
                tree_label=tree_label, tree_lengths=tree_lengths))

        # simulate data on the tree
        proc = subprocess.Popen("indelible", stdout=subprocess.PIPE)
        stdout_data = proc.communicate()[0]
        if "ERROR in [TREE] block" not in str(stdout_data):
            # indelible accepted the tree, so move on
            break

    # combine the alignments and produce a partitions file
    aln = {}
    for j in range(n_parts):
        part_file_name = tree_label+"_part_"+str(j+1)+"_TRUE.phy"
        with open(part_file_name,"r") as part_file:
            data = read_phylip(part_file)
        for name, seq in data.items():
            aln[name] = aln.get(name, "") + seq

    alignment_file_name = "%s_combined_aln.phy" % tree_label
    with open(alignment_file_name,"w") as alignment_file:
        alignment_file.write(str(n_tips_per_tree) + " " + str(aln_length) + "\n")
        for name, seq in aln.items():
            alignment_file.write(name + " " + seq + "\n")

    partitions_file_name = "%s_combined_part.txt" % tree_label
    with open(partitions_file_name,"w") as partitions_file:
        for j in range(n_parts):
            begin = j*part_length + 1
            end = j*part_length + part_length
            partitions_file.write("DNA, p{j} = {begin}-{end}\n".format(j=j, begin=begin, end=end))

    # return the tree file, alignment file, and partitions file names
    return (tree_label + ".tre", alignment_file_name, partitions_file_name, model_rates)

Exemplo n.º 18

0

Exibir arquivo


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print "python cut_long_branches_iter.py inDIR outDIR >log"
        sys.exit(0)

    DIR = sys.argv[1] + "/"
    for i in os.listdir(DIR):  #go through fasta files in the input directory
        if i[-9:] != ".fasttree": continue
        fasta_file = DIR + i.replace(".aln-cln.fasttree", "")
        tree_file = DIR + i
        with open(tree_file, "r") as infile:
            first_line = infile.readline()  #there's only 1 tree in each file
        if first_line.strip() == "": continue  #empty file after trimming
        intree = newick3.parse(first_line.replace("-", "_"))
        if count_ingroups(intree) < MIN_INGROUP_TAXA:
            continue  #skip trees with few ingroups
        ccID = i.split(".")[0]  #looks like cc9
        trees = [intree]

        #if intree has no long branches at all jus use the original fasta and alignment
        longest_branch_length = 0.0
        for node in intree.iternodes():
            if node != intree:
                longest_branch_length = max(longest_branch_length, node.length)
        if longest_branch_length < cutoff2:
            os.system("cp " + fasta_file + " " +
                      fasta_file.replace(".fa", ".to_prank.fa"))
            os.system("cp " + fasta_file + ".aln-cln " +
                      fasta_file.replace(".fa", ".to_prank.fa.aln-cln"))

Exemplo n.º 19

0

Exibir arquivo

Arquivo: root_tree_against_master.py Projeto: ldutoit/physcripts

    if (len(bipart0[0].intersection(bipart1[0])) > 0 and len(bipart1[1].intersection(bipart0[0])) == 0 and \
        len(bipart0[1].intersection(bipart1[1])) > 0 and len(bipart1[0].intersection(bipart0[1])) == 0) or \
       (len(bipart0[0].intersection(bipart1[1])) > 0 and len(bipart1[0].intersection(bipart0[0])) == 0 and \
        len(bipart0[1].intersection(bipart1[0])) > 0 and len(bipart1[1].intersection(bipart0[1])) == 0):
        return True
    else:
        return False


if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit(
            "usage: root_tree_against_master.py <treetoroot> <mastertree>")

    with open(sys.argv[1], "r") as target_file:
        target = newick3.parse(target_file)
    if len(target.leaves()) < 3:
        sys.exit(
            "error: cannot perform rooting on a tree with fewer than three tips"
        )

    with open(sys.argv[2], "r") as master_file:
        master = newick3.parse(master_file)
    if len(master.leaves()) < 3:
        sys.exit(
            "error: cannot perform rooting against a master tree with fewer than three tips"
        )

    labels_missing_from_master = get_labels(target.leaves()).difference(
        get_labels(master.leaves()))
    if len(labels_missing_from_master) > 0:

Exemplo n.º 20

0

Exibir arquivo

#!/usr/bin/env python

if __name__ == '__main__':

    import newick3, phylo3, numpy, sys

    if len(sys.argv) < 3:
        print "usage: remove_badtips.py <treefile> <maxdevfactor> [keep=\"<tax1, tax2, ...>\"]"
        sys.exit()

    treefname = sys.argv[1]
    treefile = open(treefname, "r")
    tree = newick3.parse(treefile.readline())
    maxdevfactor = int(sys.argv[2])

    if len(sys.argv) > 3:
        keepnames_str = sys.argv[3].split("keep=",1)[1]
        keepnames = [n.strip() for n in keepnames_str.split(",")]
    else:
        keepnames = []

    lengths = [t.length for t in tree.leaves()]

    avg = numpy.mean(lengths)

    for tip in tree.leaves():
    
        if tip.parent == tree:
            continue

        if tip.length > avg * maxdevfactor:

Exemplo n.º 21

0

Exibir arquivo

Arquivo: cut_long_branches_iter.py Projeto: wum5/JaltPhylo

	return cleaned_alignment+".tre"

if __name__ == "__main__":
	if len(sys.argv) != 3:
		print "python cut_long_branches_iter.py inDIR outDIR >log"
		sys.exit(0)
	
	DIR = sys.argv[1]+"/"
	for i in os.listdir(DIR): #go through fasta files in the input directory
		if i[-9:] != ".fasttree": continue
		fasta_file = DIR+i.replace(".aln-cln.fasttree","")
		tree_file = DIR+i
		with open(tree_file,"r") as infile:
			first_line = infile.readline() #there's only 1 tree in each file
		if first_line.strip() == "": continue #empty file after trimming
		intree = newick3.parse(first_line.replace ("-","_"))
		if count_ingroups(intree) < MIN_INGROUP_TAXA:
			continue #skip trees with few ingroups
		ccID = i.split(".")[0] #looks like cc9
		trees = [intree]
		
		#if intree has no long branches at all jus use the original fasta and alignment
		longest_branch_length = 0.0
		for node in intree.iternodes():
			if node != intree:
				longest_branch_length = max(longest_branch_length,node.length)
		if longest_branch_length < cutoff2:
			os.system("cp "+fasta_file+" "+fasta_file.replace(".fa",".to_prank.fa"))
			os.system("cp "+fasta_file+".aln-cln "+fasta_file.replace(".fa",".to_prank.fa.aln-cln"))
			continue

Exemplo n.º 22

0

Exibir arquivo

Arquivo: add_species_to_chronogram.py Projeto: chinchliff/physcripts

        required=True, help='The list of names to be added to the tree.')

    # allow a min branch length to be specified. it must be parseable as a float
    parser.add_argument('-b', '--min-branch-length', type=float, nargs=1, \
        required=False, help='The minimum branch length to be used.')

    # record a boolean value of True if this argument is set
    parser.add_argument('-s', '--include-stem', action='store_true', \
        required=False, help='Pass this argument to allow newly added species to be '
                             'attached to the root of the tree.')

    args = parser.parse_args()
    
    # attempt to parse the input tree
    try:
        tree = newick3.parse(args.input_tree[0])
    except Exception as e:
        print("There was a problem parsing the input tree: " + e.message)
        exit(1)
    
    # extract the names from the names file
    names = [n.strip() for n in args.names[0]]

    # use the user-specified min branch length if specified
    min_branch_length = args.min_branch_length[0] if args.min_branch_length is not None \
                                                  else MIN_BRANCH_LENGTH

    # assign the function that will be used to gather the nodes--the iternodes function
    # will include the root, but the descendants function will not
    get_nodes = phylo3.Node.iternodes if args.include_stem else phylo3.Node.descendants

Exemplo n.º 23

0

Exibir arquivo

if __name__ == '__main__':

    import newick3, phylo3, sys

    if len(sys.argv) < 2:
        print "usage: excise_knuckles.py <treefile>"
        sys.exit(0)

    treefname = sys.argv[1]
    treefile = open(treefname, "r")

    logfile = open("excise_knuckles.log", "w")

    for line in treefile:

        tree = newick3.parse(line)

        while len(tree.children) < 2:
            # prune knuckles at the root of the tree if necessary
            only_child = tree.children[0]
            only_child.parent = None
            only_child.isroot = True
            tree = only_child

        # cannot edit tree while traversing, so just record knuckles as we go
        knuckles = []

        # first find the knuckles
        for parent in tree.iternodes(phylo3.PREORDER):

            if parent.istip:

Exemplo n.º 24

0

Exibir arquivo

Arquivo: ortholog_occupancy_stats.py Projeto: jlanga/smsk_selection

	return [get_name(i) for i in labels]
	
if __name__ == "__main__":
	# Summarise taxon occupancy over the ortholog trees in a directory.
	# Writes two files in the current working directory:
	#   ortho_stats - one line per ortholog tree with its number of tips
	#   taxon_stats - one line per taxon with the number and fraction of
	#                 ortholog trees that contain it
	if len(sys.argv) != 2:
		print("usage: python ortholog_occupancy_stats.py ortho_treDIR")
		sys.exit(0)
	DIR = sys.argv[1]+"/"
	DICT = {} #key is taxon name, value is how many orthologs it is in
	total_ortho = 0
	# the context manager closes ortho_stats even when a tree fails to parse
	with open("ortho_stats","w") as outfile:
		for i in os.listdir(DIR):
			if not (i.endswith(file_ending) and "ortho" in i):
				continue
			print(i)
			total_ortho += 1
			with open(DIR+i,"r") as infile:
				#only the first line of each tree file is parsed
				intree = newick3.parse(infile.readline())
			names = get_front_names(intree)
			for taxon in names:
				DICT[taxon] = DICT.get(taxon,0) + 1
			outfile.write(str(len(names))+"\n")
	print("number of taxa in each ortholog written to ortho_stats")
	
	with open("taxon_stats","w") as outfile:
		outfile.write("taxonID\tnum_ortho\t%ortho_out_of_total_"+str(total_ortho)+"\n")
		#DICT is only populated when total_ortho > 0, so the division is safe
		for taxon in DICT:
			outfile.write(taxon+"\t"+str(DICT[taxon])+"\t"+str(float(DICT[taxon])/total_ortho)+"\n")
	print("number of ortholog for each taxon written to taxon_stats")

Exemplo n.º 25

0

Exibir arquivo

        elif argname == "include":
            cladenames = [n.strip() for n in argval.split(",")]

        elif argname == "includefile":
            includenamesfile = open(argval, "r")
            cladenames = [n.strip() for n in includenamesfile.readlines()]
            includenamesfile.close()

#        elif argname == "exclude":
#            exclude_names = [n.strip() for n in argval.split(",")]

    assert (len(cladenames) > 0)
    assert (target_rank != "")

    test_tree = newick3.parse(open(treefname, "r"))

    print "will assess monophyly for taxa of rank '" + target_rank + "'"

    con = sqlite3.connect(dbname)
    cur = con.cursor()

    included_taxa = {}
    for name in cladenames:
        cur.execute(
            "SELECT name, left_value, right_value FROM taxonomy WHERE name_class == 'scientific name' AND name LIKE ?",
            (name, ))
        for curname, leftval, rightval in cur.fetchall():
            print "including " + name
            included_taxa[name] = (leftval, rightval)

Exemplo n.º 26

0

Exibir arquivo

Arquivo: prune_paralogs_RT.py Projeto: jlanga/smsk_selection

def RT(homoDIR, tree_file_eneding, outDIR, min_ingroup_taxa,
       taxon_code_file_file):
    """Write ingroup clades and orthologs for every homolog tree in a directory.

    homoDIR -- directory holding the input tree files
    tree_file_eneding -- only files whose name ends with this are processed
    outDIR -- directory that receives the output files
    min_ingroup_taxa -- minimum number of distinct ingroup taxa (int or
        anything int() accepts) a tree or ortholog needs to be kept
    taxon_code_file_file -- tab-separated file with one "IN<tab>taxonID" or
        "OUT<tab>taxonID" entry per line

    For each tree with at least one outgroup tip, the rooted ingroup clades
    are written to <clusterID>.inclade<N> and the orthologs extracted from
    each clade to <clusterID>.inclade<N>.ortho<M>.tre. A tree without
    outgroups is written to <clusterID>.unrooted-ortho.tre only when no
    taxon is repeated in it.

    The process exits (via sys.exit) when the taxon code file is malformed,
    when a taxon is listed as both ingroup and outgroup, or when a tip name
    is in neither list.
    """
    if homoDIR[-1] != "/":
        homoDIR += "/"
    if outDIR[-1] != "/":
        outDIR += "/"
    min_ingroup_taxa = int(min_ingroup_taxa)

    INGROUPS = []
    OUTGROUPS = []
    with open(taxon_code_file_file, "r") as infile:
        for line in infile:
            if len(line) < 3:
                continue
            spls = line.strip().split("\t")
            # a tag without a taxon ID (e.g. "IN<tab>") used to crash with an
            # IndexError; report it like any other malformed line instead
            if len(spls) < 2 or spls[0] not in ("IN", "OUT"):
                print("Check taxon_code_file file format")
                sys.exit()
            if spls[0] == "IN":
                INGROUPS.append(spls[1])
            else:
                OUTGROUPS.append(spls[1])

    # sets give constant-time membership tests for the per-tip checks below;
    # the original lists are still what gets passed on to tree_utils
    ingroup_set = set(INGROUPS)
    outgroup_set = set(OUTGROUPS)
    shared = ingroup_set & outgroup_set
    if len(shared) > 0:
        print("Taxon ID %s in both ingroups and outgroups" % (shared, ))
        sys.exit(0)
    print("%d ingroup taxa and %d outgroup taxa read" %
          (len(INGROUPS), len(OUTGROUPS)))
    print("Ingroups: %s" % (INGROUPS, ))
    print("Outgroups: %s" % (OUTGROUPS, ))

    for treefile in os.listdir(homoDIR):
        if not treefile.endswith(tree_file_eneding):
            continue
        with open(homoDIR + treefile, "r") as infile:
            # only the first line of each tree file is parsed
            intree = newick3.parse(infile.readline())
        curroot = intree
        all_names = tree_utils.get_front_names(curroot)
        num_taxa = len(set(all_names))
        print(treefile)

        # check taxonIDs: every tip must be a known ingroup or outgroup
        ingroup_names = []
        outgroup_names = []
        for name in all_names:
            if name in ingroup_set:
                ingroup_names.append(name)
            elif name in outgroup_set:
                outgroup_names.append(name)
            else:
                print("%s not in ingroups or outgroups" % (name, ))
                sys.exit()
        if len(set(ingroup_names)) < min_ingroup_taxa:
            print("not enough ingroup taxa in tree")
            continue

        outID = outDIR + tree_utils.get_clusterID(treefile)
        if len(outgroup_names) > 0:
            # at least one outgroup present, root and cut inclades
            inclades = tree_utils.extract_rooted_ingroup_clades(
                curroot, INGROUPS, OUTGROUPS, min_ingroup_taxa)
            inclade_count = 0
            for inclade in inclades:
                inclade_count += 1
                inclade_name = outID + ".inclade" + str(inclade_count)
                with open(inclade_name, "w") as outfile:
                    outfile.write(newick3.tostring(inclade) + ";\n")
                orthologs = tree_utils.get_ortho_from_rooted_inclade(inclade)
                ortho_count = 0
                for ortho in orthologs:
                    if len(tree_utils.get_front_labels(
                            ortho)) < min_ingroup_taxa:
                        continue
                    ortho_count += 1
                    ortho_name = (inclade_name + ".ortho" +
                                  str(ortho_count) + ".tre")
                    with open(ortho_name, "w") as outfile:
                        outfile.write(newick3.tostring(ortho) + ";\n")

        elif len(all_names) == num_taxa:
            # only output ortho tree when there is no taxon repeats
            with open(outID + ".unrooted-ortho.tre", "w") as outfile:
                outfile.write(newick3.tostring(curroot) + ";\n")

        else:
            # do not attempt to infer direction of gene duplication without
            # outgroup info
            print("duplicated taxa in unrooted tree")

Exemplo n.º 27

0

Exibir arquivo

    parser.add_argument('-q', '--partitions', type=open, required=True, \
        help='the location of the raxml partitions file corresponding to the alignment to be subsampled.')

    parser.add_argument('-x', '--random-seed', type=int, required=False, \
        help='an integer seed for the random number generator function')

    parser.add_argument('-f', '--reduction-factor', type=float, required=False, \
        help='a decimal value specifying how sparse to make the subsampling. the number of taxa that will be subsampled will be reduced proportionally to this value, thus a value of f 0.5 leads to a reduction by (approximately) half')

    parser.add_argument('-n', '--output-label', required=False, default='', \
        help='a label to be attached to output files')

    args = parser.parse_args()

    a = Alignment(args.alignment, args.partitions)
    t = newick3.parse(args.tree)

    # argparse always stores the attribute (as None when -f is omitted), so a
    # membership test on the namespace can never select the default; compare
    # against None to fall back to a reduction factor of 1
    f = args.reduction_factor if args.reduction_factor is not None else 1

    s = PhylogeneticSubsampler(alignment=a,
                               tree=t,
                               rates=args.rates,
                               reduction_factor=f)

    s.subsample()

    s.write_subsampled_output(args.output_label)

    s.report_sampled_partitions()

    # report where the output went, followed by the achieved sampling proportion
    print('files have been written to:\n' + s.output_label + '.sampling_matrix.txt\n' + s.output_label + '.phy\n' + s.output_label + '.partitions.txt\n' \
              'sampling proportion is ' + str(s.get_sampling_proportion()))

Exemplo n.º 28

0

Exibir arquivo

Arquivo: test_monophyly_against_tree.py Projeto: chinchliff/physcripts

        elif argname == "include":
            cladenames = [n.strip() for n in argval.split(",")]

        elif argname == "includefile":
            includenamesfile = open(argval,"r")
            cladenames = [n.strip() for n in includenamesfile.readlines()]
            includenamesfile.close()

#        elif argname == "exclude":
#            exclude_names = [n.strip() for n in argval.split(",")]

    assert(len(cladenames) > 0)
    assert(target_rank != "")

    test_tree = newick3.parse(open(treefname,"r"))

    print "will assess monophyly for taxa of rank '" + target_rank + "'"

    con = sqlite3.connect(dbname)
    cur = con.cursor()

    included_taxa = {}
    for name in cladenames:
        cur.execute("SELECT name, left_value, right_value FROM taxonomy WHERE name_class == 'scientific name' AND name LIKE ?",(name,))
        for curname, leftval, rightval in cur.fetchall():
            print "including " + name
            included_taxa[name] = (leftval, rightval)
    
    out_prefix = "_".join(included_taxa)
    if len(out_prefix) > 40:

Exemplo n.º 29

0

Exibir arquivo

Arquivo: find_names_missing_from_tree.py Projeto: chinchliff/physcripts

#!/usr/bin/env python

if __name__ == '__main__':
    # Print the first field of every multi-field line in <datafile> whose
    # first field is not the label of a leaf in the tree read from <treefile>.

    import newick3
    import os
    import sys

    if len(sys.argv) < 3:
        sys.exit("usage: find_names_missing_from_tree.py <datafile> <treefile>")

    data_file_name, tree_file_name = sys.argv[1], sys.argv[2]

    # only the first line of the tree file is parsed
    with open(tree_file_name, "r") as treefile:
        tree = newick3.parse(treefile.readline())

    names = {leaf.label for leaf in tree.leaves()}

    with open(data_file_name, "r") as datafile:
        for line in datafile:
            fields = line.split()
            # lines with fewer than two fields are ignored
            if len(fields) <= 1 or fields[0] in names:
                continue
            print(fields[0])

Exemplo n.º 30

0

Exibir arquivo

Arquivo: subsample_edge_quartets.py Projeto: ldutoit/physcripts

        if len(parts) > 1:
            aln[parts[0]] = parts[1]
    args.alignment[0].close()

    # get the tree to subsample
    tree = None
    treefile = args.tree[0]
    print("reading tree from " + treefile.name)
    line = None
    while line != "":
        line = treefile.readline()
        if len(line.strip()) < 1:
            continue
#        print(line)
#        try:
        tree = newick3.parse(StringIO(line))
        #        print('found tree')
        #        print(tree)
        break
#        except AttributeError:
#            pass
    if tree == None:
        sys.exit("Could not find a tree in the treefile: " + treefile.name)
    args.tree[0].close()
    leaves = tree.leaves()

    calc_stop_k = args.stop_node_number[
        0] if args.stop_node_number != None else len(tree.leaves()) + 100
    if calc_stop_k < calc_start_k:
        sys.exit(
            "The start node number is higher than the stop node number, designating no nodes for processing."

Exemplo n.º 31

0

Exibir arquivo

Arquivo: subsample_edge_quartets.py Projeto: chinchliff/physcripts

        if len(parts) > 1:
            aln[parts[0]] = parts[1]
    args.alignment[0].close() 

    # get the tree to subsample
    tree = None
    treefile = args.tree[0]
    print("reading tree from " + treefile.name)
    line = None
    while line != "":
        line = treefile.readline()
        if len(line.strip()) < 1:
            continue
#        print(line)
#        try:
        tree = newick3.parse(StringIO(line))
#        print('found tree')
#        print(tree)
        break
#        except AttributeError:
#            pass
    if tree == None:
        sys.exit("Could not find a tree in the treefile: " + treefile.name)
    args.tree[0].close()
    leaves = tree.leaves()

    calc_stop_k = args.stop_node_number[0] if args.stop_node_number != None else len(tree.leaves())+100
    if calc_stop_k < calc_start_k:
        sys.exit("The start node number is higher than the stop node number, designating no nodes for processing.")

    if args.verbose:

Exemplo n.º 32

0

Exibir arquivo

Arquivo: make_png_from_sampling_matrix.py Projeto: chinchliff/physcripts

#!/usr/bin/env python

import newick3, os, phylo3, png, sys

size_scalar = 4
breakup = 8

if __name__ == "__main__":

    if len(sys.argv) != 4:
        print "usage: make_matrix_fig.py infile.csv infile.tre outfile.png"
        sys.exit(0)

    infile = open(sys.argv[1],"r")
    infiletree = open(sys.argv[2],"r")
    tree = newick3.parse(infiletree.readline())

    order_map = {}
    count = 0
    for i in tree.leaves():
        order_map[i.label] = count
        count += 1

    print str(len(order_map)) + " tips in tree"
    infiletree.close()

    first = True
    sampling_array = ['']*len(order_map)

    # set colors
    palette=[(0xff,0xff,0xff), (0x00,0x00,0x00)]

Exemplo n.º 33

0

Exibir arquivo

        required=True, help='The list of names to be added to the tree.')

    # allow a min branch length to be specified. it must be parseable as a float
    parser.add_argument('-b', '--min-branch-length', type=float, nargs=1, \
        required=False, help='The minimum branch length to be used.')

    # record a boolean value of True if this argument is set
    parser.add_argument('-s', '--include-stem', action='store_true', \
        required=False, help='Pass this argument to allow newly added species to be '
                             'attached to the root of the tree.')

    args = parser.parse_args()

    # attempt to parse the input tree
    try:
        tree = newick3.parse(args.input_tree[0])
    except Exception as e:
        print("There was a problem parsing the input tree: " + e.message)
        exit(1)

    # extract the names from the names file
    names = [n.strip() for n in args.names[0]]

    # use the user-specified min branch length if specified
    min_branch_length = args.min_branch_length[0] if args.min_branch_length is not None \
                                                  else MIN_BRANCH_LENGTH

    # assign the function that will be used to gather the nodes--the iternodes function
    # will include the root, but the descendants function will not
    get_nodes = phylo3.Node.iternodes if args.include_stem else phylo3.Node.descendants

Exemplo n.º 34

0

Exibir arquivo

Arquivo: RUN.py Projeto: chinchliff/taxonjackknife

def run_single_tree(base_dir, simulation_function, tree_function, tree_number, branch_lengths_function=None, subsampling_function=None):
    """Simulate one tree, score its branches by taxon jackknife and bootstrap, and append the results.

    Parameters:
        base_dir: prefix of the model directory; the tree count, tip count and
            (if subsampling) the subsampling function's __doc__ are appended to
            it. The resulting directory must already exist.
        simulation_function: called as
            simulation_function(tree_label, tree_function, branch_lengths_function);
            must return a sequence whose items are, in order, the tree file
            name, the alignment file name and optionally the partitions file
            name and the model rates (all relative to the working tree dir).
        tree_function: passed through to simulation_function.
        tree_number: identifier of this replicate; used for the "tree<N>"
            working directory and as the first column of the output rows.
        branch_lengths_function: passed through to simulation_function.
        subsampling_function: optional; called as
            subsampling_function(alignment, partitions, model_rates, tree) and
            expected to return the name of the subsampled alignment file.

    Side effects: changes the working directory, (re)creates the per-tree
    working directory, empties and recreates temp_dir, runs the external
    subsample_edge_quartets.py, paint_branches.py, RAxML and pxbp programs, and
    appends to ALL.times.csv and ALL.scores.csv under <base_dir>/all_scores.

    Relies on module-level settings (n_random_trees, n_tips_per_tree,
    n_reps_taxon_jackknife, n_threads, temp_dir, raxml_executable,
    raxml_pthreads_executable, n_reps_bootstrap, n_threads_bootstrap,
    max_bs_branches_to_process, score_file_column_labels).

    Exits via sys.exit if the model directory does not exist.
    """

    base_dir += "_" + str(n_random_trees) + "_trees_of_" + str(n_tips_per_tree) + "_tips"

    if subsampling_function is not None:
        base_dir += "_SUBSAMPLED_" + subsampling_function.__doc__

    if not os.path.exists(base_dir):
        sys.exit("need to initialize the model dir first")

    final_scores_dir = base_dir + "/all_scores"
    all_scores_file_name = final_scores_dir + "/ALL.scores.csv"
    all_times_file_name = final_scores_dir + "/ALL.times.csv"

    os.chdir(base_dir)

    # start from a clean working directory for this tree
    tree_label = "tree" + str(tree_number)
    if os.path.exists(tree_label):
        shutil.rmtree(tree_label)
    working_tree_dir = os.getcwd() + "/" + tree_label
    os.mkdir(working_tree_dir)
    os.chdir(working_tree_dir)

    # run the tree and alignment simulation, get the results
    simulation_results = simulation_function(tree_label, tree_function, branch_lengths_function)
    tree_file_name = working_tree_dir + "/" + simulation_results[0]
    alignment_file_name = working_tree_dir + "/" + simulation_results[1]
    partitions_file_name = working_tree_dir + "/" + simulation_results[2] if len(simulation_results) > 2 else None
    model_rates = simulation_results[3] if len(simulation_results) > 3 else None

    # subsample the alignment if necessary
    if subsampling_function is not None:
        alignment_file_name = subsampling_function(alignment_file_name, partitions_file_name, model_rates, tree_file_name)

    # calculate support for the original tree based on the data, support values will be stored in node_scores.csv
    subsample_args = ["python3", os.path.expanduser("~/scripts/subsample_edge_quartets.py"),
        "-t", tree_file_name,
        "-n", alignment_file_name,
        "-#", n_reps_taxon_jackknife,
        "-T", n_threads,
        "-e", temp_dir,
        "-o", working_tree_dir,
        "-X", raxml_executable]

    if partitions_file_name is not None:
        subsample_args += ["-q", partitions_file_name]

    start = time.time()
    subprocess.call(subsample_args)
    jackknife_time = time.time() - start

    print("using: " + temp_dir + " for temp files")

    # empty the temp dir before it is reused further down
    shutil.rmtree(temp_dir)
    os.mkdir(temp_dir)

    print("\ntime for taxon jackknife: %.2f seconds" % jackknife_time)
    with open(all_times_file_name,"a") as timefile:
        timefile.write("%s,%.2f," % (tree_number, jackknife_time))

    # get the node scores from the taxon jackknife
    node_scores = {}
    print(os.path.abspath("."))
    with open("node_scores.csv","r") as node_scores_file:
        first_line = True
        for line in node_scores_file:
            # the first line is skipped (treated as a header)
            if first_line:
                first_line = False
                continue
            parts = [p.strip() for p in line.split(",")]
            if len(parts) < 3:
                continue
            node_scores[parts[0]] = {"j_freq":parts[1],"j_ica":parts[2],"in_true_tree":"TRUE"}

    # make a painted tree for reviewing the taxon jackknife results
    paint_args = ["paint_branches.py",
        "-t", "RESULT.labeled.tre",
        "-c", os.path.expanduser("~/scripts/figtree_color_palettes/blue_gray_red_1_to_neg_1.csv"),
        "-l", "ica",
        "-d", "node_scores.csv",
        "-n", tree_label]
    subprocess.call(paint_args)

    # calculate bootstrap values for the original tree based on the data
    raxml_bootstrap_args = [raxml_pthreads_executable,
        "-n", tree_label,
        "-s", alignment_file_name,
        "-#", n_reps_bootstrap,
        "-x", "123",
        "-p", "123",
        "-m", "GTRCAT",
        "-T", n_threads_bootstrap]

    # the partitions option belongs on the bootstrap command built just above;
    # it used to be appended to subsample_args, which had already been executed
    if partitions_file_name is not None:
        raxml_bootstrap_args += ["-q", partitions_file_name]

    start = time.time()
    subprocess.call(raxml_bootstrap_args)
    bootstrap_time = time.time() - start

    print("\ntime for bootstrap: %.2f seconds" % bootstrap_time)
    with open(all_times_file_name,"a") as timefile:
        timefile.write("%.2f\n" % bootstrap_time)

    # call pxbp to generate bipart frequencies from bootstrap replicates
    pxbp_args = ["pxbp", "-t", "RAxML_bootstrap." + tree_label]
    p = subprocess.Popen(pxbp_args, stdout=subprocess.PIPE)
    results = p.communicate()[0]

    # parse pxbp results: tab-separated lines keyed by the sorted ingroup labels
    bootstrap_biparts = {}
    for line in results.decode().split('\n'):
        parts = [p.strip() for p in line.split("\t")]
        if len(parts) < 2:
            continue
        ingroup_labels = tuple(sorted(parts[0].split()))
        freq = None
        ica = None
        if parts[1].strip() == "1":
            freq = "1"
            ica = "1"
        else:
            freq = parts[2]
            ica = parts[4]
        bootstrap_biparts[ingroup_labels] = (freq, ica)

    # TODO: call mrbayes to generate a bayesian posterior distribution

    # load RESULT.labeled.tre
    labeled_tree = None
    true_tree_biparts = set()
    with open("RESULT.labeled.tre","r") as labeled_tree_file:
        labeled_tree = newick3.parse(labeled_tree_file.readline())

    # for each node in the labeled tree
    leaves = labeled_tree.leaves()
    for n in labeled_tree.iternodes():

        # skip the root and the tip nodes
        if n.parent is None or n.istip:
            continue

        # special case: if root position is adjacent to a tip
        if n.label not in node_scores and n.parent.parent is None:
            # in this case, the bipart is always defined--if you remove the root then the
            # bipart is the tip vs. the rest of the tree, so the taxon jackknife won't score
            # it (and its label won't show up in the node scores).
            continue

        # the sorted tip labels below this node identify its bipartition
        s = tuple(sorted([t.label for t in n.leaves()]))
        true_tree_biparts.add(s)

        # record the branch length
        node_scores[n.label]["length"] = n.length

        # record the depth: summed branch lengths along first children down to a tip
        c = n
        d = 0
        while True:
            c = c.children[0]
            d += c.length
            if c.istip:
                break
        node_scores[n.label]["depth"] = d

        # see if this node's bipart is represented in the bootstraps, if so get the frequency and the ica score
        if s in bootstrap_biparts:
            node_scores[n.label]["b_freq"] = bootstrap_biparts[s][0]
            node_scores[n.label]["b_ica"] = bootstrap_biparts[s][1]
        else:
            node_scores[n.label]["b_freq"] = "0"
            node_scores[n.label]["b_ica"] = "NA"

    # process all bootstrap trees, recording info for all branches not in the true tree, so we can calc ica for them
    bootstrap_trees = []
    with open("RAxML_bootstrap." + tree_label, "r") as bootstrap_file:
        for line in bootstrap_file:
            bootstrap_trees.append(newick3.parse(line))

    # pull out all the taxon quartets defined by branches in all bootstrap trees
    bootstrap_quartets = set()
    leaves = labeled_tree.leaves()
    all_taxon_labels = set([l.label for l in leaves])
    for bootstrap_tree in bootstrap_trees:

        # label of the root-adjacent branch already handled in this tree; it was
        # previously read before ever being assigned (UnboundLocalError)
        root_bipart_label = None

        for node in bootstrap_tree.iternodes():

            # skip tips and the root node itself
            if node.istip or node.parent is None:
                continue

            # only branches that are absent from the true tree (in either
            # orientation of the bipartition) are processed any further
            ingroup_labels = tuple(sorted([t.label for t in node.leaves()]))
            if ingroup_labels in true_tree_biparts:
                continue
            remaining_labels = tuple(sorted(all_taxon_labels - set(ingroup_labels)))
            if remaining_labels in true_tree_biparts:
                continue

            ########### modified from subsample_edge_quartets

            # get leaf sets for the four connected subtrees

            # two daughter subtrees
            r1 = set([node.children[0].label,] if node.istip else [l.label for l in node.children[0].leaves()])
            r2 = set([node.children[1].label,] if node.istip else [l.label for l in node.children[1].leaves()])

            # sibling/parent subtrees
            is_other_side_of_root = False # used when we hit the root for the second time
            skip_tip_child_of_root = False # used when one of the children of the root node is a tip
            skip_malformed_sibling = False # used when the sibling across the root is not bifurcating
            tip_child_label = None
            for sib in node.parent.children:
                if sib != node:

                    # if one of the subtrees is the root, skip over it
                    if len(sib.leaves()) + len(node.leaves()) == len(leaves):

                        # if we already processed this bipart (on other side of the root), don't do it again
                        if root_bipart_label is not None:
                            is_other_side_of_root = True
                            break

                        # get the subtrees opposite the root
                        if len(sib.children) == 2:
                            l1 = set([sib.children[0].label,] if sib.children[0].istip else [l.label for l in sib.children[0].leaves()])
                            l2 = set([sib.children[1].label,] if sib.children[1].istip else [l.label for l in sib.children[1].leaves()])
                        elif len(sib.children) == 0:
                            skip_tip_child_of_root = True
                            tip_child_label = sib.label
                        else:
                            # the message used to reference an undefined name, and the
                            # branch was not actually skipped afterwards
                            print("Node %s does not have exactly 2 children. It will be skipped." % sib.label)
                            skip_malformed_sibling = True
                            break

                        # remember that we've already done the root, so we can skip it when we hit the other side
                        root_bipart_label = node.label

                    # otherwise not at root, all connected subtrees have children
                    else:

                        # sibling subtree
                        l1 = set([l.label for l in sib.leaves()])

                        # the rest of the tree
                        l2 = set()
                        for label in [l.label for l in leaves]:
                            if label not in r1 and \
                               label not in r2 and \
                               label not in l1:
                                    l2.add(label)

            # no l1/l2 were computed for this node in either of these cases, so
            # it must not fall through to the quartet construction below
            if is_other_side_of_root or skip_malformed_sibling:
                continue

            if skip_tip_child_of_root:
                print("not evaluating tip child '" + tip_child_label + "' of the root (ica is 1.0, as for all tips).")
                continue

            ######## end modified from subsample_edge_quartets.py

            q = (frozenset((tuple(sorted(l1)),tuple(sorted(l2)))),frozenset((tuple(sorted(r1)),tuple(sorted(r2)))))

            # make sure this quartet isn't already in the set in reverse before adding it
            if (q[1],q[0]) not in bootstrap_quartets:
                bootstrap_quartets.add(q)

    print("Found %s bootstrap quartets not in the original tree" % len(bootstrap_quartets))

    # set the sampling interval so we only process max_bs_branches_to_process, evenly distributed across the input set
    sample_freq = len(bootstrap_quartets)/float(max_bs_branches_to_process)
    max_br = min(len(bootstrap_quartets),max_bs_branches_to_process)
    cur_br = 1

    os.chdir(temp_dir)

    for p, q in enumerate(bootstrap_quartets):

        # skip branches that don't fall within our sample based on max_bs_branches_to_process
        if not p % sample_freq < 1:
            continue

        print("on bootstrap quartet %s out of %s" % (cur_br, max_br))
        bad_branch_label = str(p) + "b"

        os.mkdir(bad_branch_label)

        cur_br += 1

        # write a four-clade topology for this quartet so it can be scored on its own
        tip_label_sets = []
        with open(bad_branch_label+".tre","w") as topo_file:
            clades = []
            for j in q:
                for k in j:
                    tip_label_sets.append(k)
            for ls in tip_label_sets:
                text = ",".join(ls)
                if len(ls) > 1:
                    text = "(" + text + ")"
                clades.append(text)
            newick = "((" + clades[0] + "," + clades[1] + "),(" + clades[2] + "," + clades[3] + "));"
            topo_file.write(newick)

        # calculate support for the bootstrap-inferred FALSE branch, support values will be stored in node_scores.csv
        subsample_args = ["python3", os.path.expanduser("~/scripts/subsample_edge_quartets.py"),
            "-t", bad_branch_label+".tre",
            "-n", alignment_file_name,
            "-#", n_reps_taxon_jackknife,
            "-T", n_threads,
            "-e", temp_dir + "/" + bad_branch_label,
            "-p", "1", # specify a stop node number that we only want to process the first (i.e. the root) node
            "-X", raxml_executable]
        subprocess.call(subsample_args)

        # extract the ica score for this node. we only care about the first bipart, so we read the second line in the file
        with open("node_scores.csv","r") as node_scores_file:
            parts = [t.strip() for t in node_scores_file.readlines()[1].split(",")]
            node_scores[bad_branch_label] = {"j_freq":parts[1],"j_ica":parts[2]}

        # the bootstrap table may key this branch by either side of the bipartition
        try:
            ingroup_labels = tuple(sorted(tip_label_sets[0] + tip_label_sets[1]))
            node_scores[bad_branch_label]["b_freq"] = bootstrap_biparts[ingroup_labels][0]
        except KeyError:
            ingroup_labels = tuple(sorted(tip_label_sets[2] + tip_label_sets[3]))
            node_scores[bad_branch_label]["b_freq"] = bootstrap_biparts[ingroup_labels][0]

        node_scores[bad_branch_label]["b_ica"] = bootstrap_biparts[ingroup_labels][1]
        node_scores[bad_branch_label]["in_true_tree"] = "FALSE"
        node_scores[bad_branch_label]["length"] = "NA"
        node_scores[bad_branch_label]["depth"] = "NA"

        shutil.rmtree(temp_dir + "/" + bad_branch_label)

    os.chdir(working_tree_dir)

    # write scores to file and prepare for next iteration
    with open(all_scores_file_name,"a") as scores_file:
        for node_label, values in node_scores.items():
            scores_file.write(str(tree_number)+","+node_label)
            for c in score_file_column_labels:
                scores_file.write(","+str(values[c]))
            scores_file.write("\n")