def count_func_cats(file):
    """Count eggNOG functional categories across orthogroups (OGs).

    For every OG returned by ``group.count_ogs(file)``, all lines of ``file``
    that contain the OG name are inspected and each functional category
    letter found on them is recorded once for that OG, however many of its
    sequences carry the letter. OGs without any category hit (eggNOG
    non-hits) contribute nothing. The function works regardless of whether
    the eggNOG output has been refined (i.e. parsed out) or is raw, because
    lines are only matched by OG substring and by the category regex.

    Args:
        file: Path to the eggNOG output file.

    Returns:
        dict mapping each category letter to the number of OG entries it was
        recorded for, with keys in order of first appearance.
    """
    # Local import: the file-level import block is not part of this function.
    from collections import Counter

    og_list = group.count_ogs(file)

    # Each seq is assigned up to 3 cats, so the regex must account for this.
    cat_regex = re.compile(r"\s([A-Z]{1,3})\s")

    # Read and parse the file once instead of re-opening it for every OG.
    # Lines without a category match can never contribute, so they are dropped.
    annotated_lines = []
    with open(file) as f:
        for line in f:
            res = cat_regex.search(line.strip())
            if res:
                annotated_lines.append((line, res.group(1)))

    func_cat_counts = Counter()
    for og in og_list:
        # The cat_list is rebuilt for each OG in turn.
        cat_list = []
        for line, func_cat in annotated_lines:
            if og in line:
                # Iterate over each letter (multiple cats may be assigned).
                for cat in func_cat:
                    if cat not in cat_list:
                        cat_list.append(cat)
        # An empty cat_list (eggNOG non-hit) adds nothing to the counts.
        func_cat_counts.update(cat_list)

    # Return a plain dict, as the original implementation did.
    return dict(func_cat_counts)
# Run this script from the main /OG_arb-fal dir; it reads the total proteome data of groups in /gr_outputs.
# Pass 3 sys.args (different eukaryote groups) to get a 3-way venn diagram, saved as a pdf.
# The diagram shows the total proteome overlap of the 3 eukaryote groups.
# NOTE: this does not produce a venn diagram for uniquely shared OGs!
# NOTE: /gr_outputs currently only contains data for the original 11-way split (needs updating!).
from matplotlib_venn import venn3, venn3_circles, venn3_unweighted
from matplotlib import pyplot as plt
import group
import sys


def _proteome_ogs(arg_pos):
    # Total proteome (as a set of OG numbers) of the group named by sys.argv[arg_pos].
    return set(group.count_ogs("gr_outputs/gr_" + sys.argv[arg_pos] + ".txt"))


# sys.args must be valid eukaryote group names in the dataset (include caps).
# The generator is consumed in order, so the groups are loaded as argv[1], argv[2], argv[3].
group1, group2, group3 = (_proteome_ogs(pos) for pos in (1, 2, 3))

# Swap venn3 for venn3_unweighted to balance the resulting circles (regardless of data).
venn3([group1, group2, group3], tuple(sys.argv[1:4]))
plt.savefig("venn3_euk_groups.pdf")
# In principle should be applicable to all outputs, providing sp labels are suffixed with "_OGxxxxxxx". # Important not to discard raw .emapper.annotations files since GOs, kegg etc. are not transferred. # Script also parses out all ogs that have no hits in eggnog to a separate file. # Script now also parses bacterial and non-bacterial OGs using a customisable cutoff point. # These different processes are incorporated here since they all make use of the same og_list. # Summary stats now added, and query name requested for convenience. import re import group # Ask user for query taxa. # Append to a list all the uniquely shared OGs between the query taxa using count_ogs. while True: try: query = input("Enter query taxa (e.g. Metamonads_Discoba): \n") preanno_full_list = group.count_ogs( "/mnt/c/Users/scamb/Documents/uob_msc/Genome_data/OG_arb-fal/new_outputs/" + query + "_output.txt") if preanno_full_list is not None: break except FileNotFoundError as e: print(f'{e} not found. Please enter a valid query.') # Define code map. ortho = group.OrthogroupSearch('codes_alt_18') output = open(query + "_refined_eg.txt", "w") og_list = [] # Split up the first entry of output file (these are not tab-separated). sp_label_regex = r"(\w{,8})_\w*[tn|gn]\d+_(\w+\-\w+)_(OG\d+)" with open(query + ".emapper.annotations") as f:
# Run this script from the main /OG_arb-fal dir; it reads the total proteome data of groups in /gr_outputs.
# Pass 3 sys.args (different eukaryote groups) to get a 3-way venn diagram, saved as a pdf.
# The diagram shows the total proteome overlap of the 3 eukaryote groups.
# NOTE: this does not produce a venn diagram for uniquely shared OGs!
# NOTE: /gr_outputs currently only contains data for the original 11-way split (needs updating!).
from matplotlib_venn import venn3, venn3_circles, venn3_unweighted
from matplotlib import pyplot as plt
import group
import sys

path = "/mnt/c/Users/scamb/Documents/uob_msc/Genome_data/OG_arb-fal/gr_outputs/gr_"

# Build one set per group holding its total proteome (identified by OG numbers).
# sys.args must be valid eukaryote group names in the dataset (include caps).
# The comprehension runs in order, so the groups are loaded as argv[1], argv[2], argv[3].
group1, group2, group3 = [
    set(group.count_ogs(f"{path}{sys.argv[arg_pos]}.txt"))
    for arg_pos in (1, 2, 3)
]

# Swap venn3 for venn3_unweighted to balance the resulting circles (regardless of data).
group_labels = tuple(sys.argv[1:4])
venn3([group1, group2, group3], group_labels)
plt.savefig("venn3_euk_groups.pdf")
# Copies the .fal file of every OG listed in stram_ogs into one destination directory.
# The .fal files are looked up relative to the current working directory.
import os
from shutil import copy

import group

stram_ogs = "new_outputs/Stramenopiles_Stramenopiles_output.txt"
# Named dest_dir (not dir) so that the builtin dir() is not shadowed.
dest_dir = "stramenopile_fal"

ogs = group.count_ogs(stram_ogs)

# shutil.copy treats a non-existent destination as a file name, so without this
# every OG would be copied over one single file called "stramenopile_fal".
os.makedirs(dest_dir, exist_ok=True)

# Using copy (rather than copyfile), since 2nd arg can be a dir.
for og in ogs:
    copy(og + ".fal", dest_dir)
# This script prints out the groups present in certain OGs.
# It is only a temporary script - just needed to see the constituent taxa of 'Other' in certain OGs.
# The OGs in question are from 4-way ventral groove-bearing eukaryote sets.
import group
import re

vg = "setquery_outputs/ventral_groove/ventral_groove_minus_one.txt"

# Build the .fal file names with a list comprehension.
# Note: a plain for loop doing og + ".fal" would not keep the changes.
og_list = [og + ".fal" for og in group.count_ogs(vg)]
to_parse = og_list

# Maps a species code to its group.
# NOTE(review): assumed to behave like a dict keyed by species code - confirm against group.alt_codes_18.
code_map = group.alt_codes_18()

# Lifted from find_group.
for falfile in to_parse:
    groups_present = []
    with open(falfile) as f:
        for line in f:
            # Only header lines (starting with '>') carry a species code.
            if not line.startswith('>'):
                continue
            fields = re.split('_', line)  # Separates sp. code
            species_code = fields[0][1:]  # Removes '>'
            # Single lookup linking sp. code to group; the lookup does not
            # depend on any loop over code_map, so none is needed.
            # An unknown species code raises KeyError.
            gr = code_map[species_code]
            if gr not in groups_present:  # Adding to group list if new group
                groups_present.append(gr)
    print(falfile, groups_present)