def tree_map(tree_root, bipart_list):
    """Relabel species-tree nodes with the number of biparts that hit them.

    Every entry of ``bipart_list`` is tallied under the string form of its
    ``species_node``; the node found for each tallied key then gets that
    count (as a string) as its label.

    WARNING: this function *replaces* the labels in the provided tree; it
    does not build a new tree with different labels. Be careful when you
    call it.

    Args:
        tree_root: Root handed to ``read_trees.node_finder`` for each lookup.
        bipart_list: Iterable of objects exposing a ``species_node``
            attribute.

    Returns:
        None. The tree is modified in place.
    """
    # Count the entries at each node in a single pass; keys stay in
    # first-seen order, so labels are applied in the same order as before.
    counts = {}
    for bipart in bipart_list:
        key = str(bipart.species_node)
        counts[key] = counts.get(key, 0) + 1

    # Add the numbers to the tree.
    for key, count in counts.items():
        node = read_trees.node_finder(tree_root, key)
        node.label = str(count)
def tree_map2(tree_root, rel_list, label):
    """Stamp ``label`` onto each gene-tree node referenced by ``rel_list``.

    The node for every distinct ``ortholog_node`` (compared by its string
    form) is looked up with ``read_trees.node_finder`` and has its label
    overwritten with ``label``. The tree is modified in place.
    """
    # When the given node has a parent, search from that parent instead.
    search_root = tree_root.parent if tree_root.parent else tree_root

    # One entry per distinct node key, kept in first-seen order.
    wanted_keys = dict.fromkeys(str(rel.ortholog_node) for rel in rel_list)

    for node_key in wanted_keys:
        target = read_trees.node_finder(search_root, node_key)
        target.label = label
def tree_map3(tree_root, rel_list, outname, multi_info):
    """Label nodes with the number of distinct genes showing a relationship.

    Entries of ``rel_list`` are grouped by the string form of their
    ``species_node``. A node is only credited once per gene name; each
    further occurrence of the same (node, gene) pair is recorded separately
    as a duplicate.

    Two text files are written:

    * ``multi_info``: one line per duplicated (node, gene) pair, formatted
      as ``node,gene;gene;...`` with the gene name repeated once for every
      occurrence of the pair.
    * ``outname``: one line per node, formatted as ``node,gene1;gene2;...``
      listing the distinct gene names in first-seen order.

    WARNING: labels in the provided tree are replaced in place.

    Args:
        tree_root: Root handed to ``read_trees.node_finder`` for each lookup.
        rel_list: Iterable of objects exposing ``gene_name`` and
            ``species_node`` attributes.
        outname: Path of the per-node gene list file (overwritten).
        multi_info: Path of the duplicate report file (overwritten).

    Returns:
        None.
    """
    genes_by_node = {}
    repeats = {}

    # A species node can only be informed once per gene tree in this case.
    for rel in rel_list:
        gene_name = str(rel.gene_name)
        key = str(rel.species_node)

        genes = genes_by_node.setdefault(key, [])
        if gene_name not in genes:
            genes.append(gene_name)
            continue

        # The same gene tree shows the relationship at this node again.
        pair = (key, gene_name)
        if pair in repeats:
            repeats[pair].append(gene_name)
        else:
            # First repeat noticed: record the name twice so the initial
            # occurrence, which was not recorded here, is accounted for.
            repeats[pair] = [gene_name, gene_name]

    with open(multi_info, "w") as multi_out:
        for (key, _gene), names in repeats.items():
            multi_out.write(key + "," + ";".join(names) + "\n")

    # Add the numbers to the tree. The context manager guarantees the file
    # is closed even if a node lookup or label assignment fails part-way.
    with open(outname, "w") as out:
        for key, genes in genes_by_node.items():
            out.write(key + "," + ";".join(genes) + "\n")
            node = read_trees.node_finder(tree_root, key)
            node.label = str(len(genes))
def conflict_stats(conflicts_dict, tree, outfile):
    """Write a per-node table of conflicts, ordered most to least common.

    Takes a dictionary from sort_conflicts. For every node, its named
    conflict groups are sorted (descending, keyed by
    ``length_of_2nd_entry``) and one comma-separated row per group is
    written to ``outfile`` after a single header row. Each row holds: the
    node id, the node's ``bipart_proper``, the ``ortholog_bipart`` of the
    group's first conflict, the group's alternative conflicts, the group
    size, the group's share of all conflicts at that node as a percentage,
    and the gene names returned by ``get_gene_names``.

    The alternative-conflicts column is only filled when the first conflict
    of the group has an ``alt_conflict``; it then lists every distinct
    alternative in the group once, joined with `` : ``.

    Args:
        conflicts_dict: Mapping of node id to a mapping of conflict name to
            the list of conflict objects filed under that name. Each object
            is read for ``ortholog_bipart`` and ``alt_conflict``.
        tree: Tree handed to ``read_trees.node_finder`` to locate each node.
        outfile: Writable object; only its ``write`` method is used.

    Returns:
        None.
    """
    # The input is a dictionary because that was easier to build, but the
    # groups need a defined order, so turn each node's mapping into a list
    # of [name, conflicts] pairs that can be sorted.
    stats_dict = {
        node: [[name, members] for name, members in by_name.items()]
        for node, by_name in conflicts_dict.items()
    }

    outfile.write(
        "node_id,species_bipart,ortholog_bipart,alternative_conflicts,"
        "number_of_conflicts,percentage,genes\n"
    )

    for node, groups in stats_dict.items():
        node_on_tree = read_trees.node_finder(tree, node)
        node_bipart = read_trees.postorder3(node_on_tree)

        # Order all the conflicts within each node from most to least
        # common.
        groups.sort(reverse=True, key=length_of_2nd_entry)

        # Get the total so we can calculate percentages.
        total = sum(len(group[1]) for group in groups)

        # NOTE(review): these running tallies are not read anywhere in the
        # visible part of this function; kept unchanged from the original.
        counter = 0
        cumulative_percent = 0

        for _name, members in groups:
            how_common = len(members)
            percent = float(how_common) / total * 100
            first = members[0]

            output = [
                str(node),
                ";".join(node_bipart.bipart_proper),
                ";".join(first.ortholog_bipart),
            ]

            # Alternative conflicts should be included where they exist.
            if first.alt_conflict:
                alternatives = [";".join(sorted(first.alt_conflict))]
                for rel in members:
                    if not rel.alt_conflict:
                        continue
                    alt = ";".join(sorted(rel.alt_conflict))
                    # List each distinct alternative once. Comparing
                    # against all listed alternatives (not just any one of
                    # them) keeps repeats out once two or more are present.
                    if alt not in alternatives:
                        alternatives.append(alt)
                output.append(" : ".join(alternatives))
            else:
                output.append("")

            output.append(str(how_common))
            output.append(str(percent))
            output.append(get_gene_names(members))

            outfile.write(",".join(output) + "\n")

            percent = round(percent, 2)
            cumulative_percent += percent
            counter += 1