예제 #1
0
def count_func_cats(file):
    """Count eggNOG functional categories across the OGs found in *file*.

    The OG identifiers come from ``group.count_ogs(file)``. For each OG,
    every line of *file* containing that identifier is searched for the
    first whitespace-delimited run of one to three capital letters, and each
    letter of that run is treated as one functional category. A category is
    counted at most once per OG, however many of that OG's lines carry it.
    OGs with no matching category (eggNOG non-hits) contribute nothing.

    No distinction is made between raw and refined (parsed out) eggNOG
    output; the same pattern is applied to every line.

    Args:
        file: Path to the eggNOG output file.

    Returns:
        A dict mapping each category letter to the number of OGs in which
        it was seen.
    """
    og_list = group.count_ogs(file)

    # Read the file once instead of re-opening it for every OG.
    with open(file) as f:
        lines = f.readlines()

    # Each seq is assigned up to 3 cats, so the pattern allows 1-3 letters.
    cat_pattern = re.compile(r"\s([A-Z]{1,3})\s")

    func_cat_freq_dict = {}
    for og in og_list:
        # Distinct categories seen for this OG, in order of first appearance.
        og_cats = []
        for line in lines:
            if og not in line:
                continue
            res = cat_pattern.search(line.strip())
            if not res:
                continue
            # Iterate over each letter (for cases where multiple cats are assigned).
            for cat in res.group(1):
                if cat not in og_cats:
                    og_cats.append(cat)
        # Non-hits leave og_cats empty and therefore add nothing.
        for cat in og_cats:
            func_cat_freq_dict[cat] = func_cat_freq_dict.get(cat, 0) + 1

    return func_cat_freq_dict
예제 #2
0
# Run this script from the main /OG_arb-fal directory; it draws on the total proteome data of groups in /gr_outputs.
# Three command-line arguments (different eukaryote groups) must be specified; a 3-way Venn diagram is produced and saved as a PDF.
# The Venn diagram shows the total proteome overlap of the three eukaryote groups.
# NOTE: this does not produce a Venn diagram of uniquely shared OGs!
# NOTE: /gr_outputs currently only contains data for the original 11-way split (needs updating!).

from matplotlib_venn import venn3, venn3_circles, venn3_unweighted
from matplotlib import pyplot as plt
import group
import sys

# Three eukaryote group names are required on the command line.
# They must be valid group names in the dataset (include caps).
if len(sys.argv) < 4:
    sys.exit(f"Usage: {sys.argv[0]} <group1> <group2> <group3>")

# Only the first three arguments are used; any extras are ignored.
group_names = tuple(sys.argv[1:4])

# Creates sets of each group's total proteome (identified by OG numbers).
group1, group2, group3 = (
    set(group.count_ogs("gr_outputs/gr_" + name + ".txt"))
    for name in group_names
)

# Add _unweighted to venn3 function to balance the resulting circles (regardless of data).
venn3([group1, group2, group3], group_names)

plt.savefig("venn3_euk_groups.pdf")

예제 #3
0
# In principle should be applicable to all outputs, providing sp labels are suffixed with "_OGxxxxxxx".
# Important not to discard raw .emapper.annotations files since GOs, kegg etc. are not transferred.
# Script also parses out all ogs that have no hits in eggnog to a separate file.
# Script now also parses bacterial and non-bacterial OGs using a customisable cutoff point.
# These different processes are incorporated here since they all make use of the same og_list.
# Summary stats now added, and query name requested for convenience.
import re
import group

# Ask user for query taxa.
# Append to a list all the uniquely shared OGs between the query taxa using count_ogs.
# The prompt repeats until count_ogs returns a non-None value for the query's
# "_output.txt" file; a missing file prints a message and prompts again.
while True:
    try:
        query = input("Enter query taxa (e.g. Metamonads_Discoba): \n")
        preanno_full_list = group.count_ogs(
            "/mnt/c/Users/scamb/Documents/uob_msc/Genome_data/OG_arb-fal/new_outputs/"
            + query + "_output.txt")
        # A None result falls through and re-prompts without any message.
        if preanno_full_list is not None:
            break
    except FileNotFoundError as e:
        print(f'{e} not found. Please enter a valid query.')

# Define code map.
ortho = group.OrthogroupSearch('codes_alt_18')
# NOTE(review): this handle is opened without a context manager and is not
# closed anywhere in the lines visible here - confirm it is closed later.
output = open(query + "_refined_eg.txt", "w")
og_list = []

# Split up the first entry of output file (these are not tab-separated).
# Captures three groups: up to 8 word characters, a hyphenated token, and an "OG<digits>" id.
# NOTE(review): "[tn|gn]" is a character class matching ONE of t, n, g or "|",
# not the alternatives "tn"/"gn"; "(?:tn|gn)" may have been intended - confirm.
sp_label_regex = r"(\w{,8})_\w*[tn|gn]\d+_(\w+\-\w+)_(OG\d+)"

with open(query + ".emapper.annotations") as f:
예제 #4
0
# Run this script from the main /OG_arb-fal directory; it draws on the total proteome data of groups in /gr_outputs.
# Three command-line arguments (different eukaryote groups) must be specified; a 3-way Venn diagram is produced and saved as a PDF.
# The Venn diagram shows the total proteome overlap of the three eukaryote groups.
# NOTE: this does not produce a Venn diagram of uniquely shared OGs!
# NOTE: /gr_outputs currently only contains data for the original 11-way split (needs updating!).

from matplotlib_venn import venn3, venn3_circles, venn3_unweighted
from matplotlib import pyplot as plt
import group
import sys

# Common prefix of every per-group proteome file; the group name and ".txt" are appended.
path = "/mnt/c/Users/scamb/Documents/uob_msc/Genome_data/OG_arb-fal/gr_outputs/gr_"

# Three eukaryote group names are required on the command line.
# They must be valid group names in the dataset (include caps).
if len(sys.argv) < 4:
    sys.exit(f"Usage: {sys.argv[0]} <group1> <group2> <group3>")

# Only the first three arguments are used; any extras are ignored.
group_names = tuple(sys.argv[1:4])

# Creates sets of each group's total proteome (identified by OG numbers).
group1, group2, group3 = (
    set(group.count_ogs(path + name + ".txt"))
    for name in group_names
)

# Add _unweighted to venn3 function to balance the resulting circles (regardless of data).
venn3([group1, group2, group3], group_names)

plt.savefig("venn3_euk_groups.pdf")

예제 #5
0
from shutil import copy
import group

# Copies "<og>.fal" for every OG returned by group.count_ogs into one directory.
stram_ogs = "new_outputs/Stramenopiles_Stramenopiles_output.txt"
# Named dest_dir rather than dir so the builtin dir() is not shadowed.
dest_dir = "stramenopile_fal"
ogs = group.count_ogs(stram_ogs)

# Using copy (rather than copyfile), since 2nd arg can be a dir.
for og in ogs:
    copy(og + ".fal", dest_dir)
# This script will print out the groups present in certain OGs.
# Is only a temporary script - just needed to see the constituent taxa of 'Other' in certain OGs.
# The OGs in question are from 4-way ventral groove-bearing eukaryote sets.
import group
import re

vg = "setquery_outputs/ventral_groove/ventral_groove_minus_one.txt"

# Build the list of file names by appending ".fal" to each OG from count_ogs.
# Note: a for loop og + .fal will not keep the changes.
og_list = [og + ".fal" for og in group.count_ogs(vg)]
to_parse = og_list

code_map = group.alt_codes_18()

# Lifted from find_group.
# For each file, print the distinct groups of its '>' header lines,
# in order of first appearance.
for falfile in to_parse:
    groups_present = []
    with open(falfile) as f:
        for line in f:
            if not line.startswith('>'):
                continue
            # Species code is the text between the leading '>' and the first '_'.
            species_code = line[1:].split('_', 1)[0]
            # Link sp. code to group; an unknown code raises KeyError.
            gr = code_map[species_code]
            if gr not in groups_present:  # Add to group list if new group
                groups_present.append(gr)
    print(falfile, groups_present)