Example #1
pfam_initial_filtered_output_file = pre.pfam_repeat_stats_initial_filtered_file

#Report OGs that have repeats in fewer than 4 proteins; optionally exclude them.
flag_report_lt4 = False
pfam_exclude_ogs_file = pre.root + "exclude_ogs_lt4_pfam.lst"
meme_exclude_ogs_file = pre.root + "exclude_ogs_lt4_meme.lst"

## Generate PFAM summary
pfam_summary = {}
pfam_file_list = glob.glob(pre.pfam_treefix_path + '*.treefix.nhx.tree')

excluded_ogs = []
for pfam_file in pfam_file_list:
    og_hit_id = pfam_file[len(pre.pfam_treefix_path):-len('.treefix.nhx.tree')]
    pfam_tblout = pre.pfam_tblout_path + og_hit_id + '.tblout'
    if not pre.file_notempty(pfam_tblout): continue
    repeats = pre.parse_domtblout_stats(pfam_tblout)

    domain = '_'.join(og_hit_id.split('_')[2:])

    if len(repeats[domain]['orthologs_dict']) < 4:
        excluded_ogs.append(og_hit_id)
        continue

    pfam_summary[og_hit_id] = repeats

print("Completed Pfam output: ", len(pfam_file_list), " entries in stats: ",
      len(pfam_summary))

with open(pre.root + pfam_output_file, 'w') as output:
    output.write(json.dumps(pfam_summary))
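
# Note (an assumption, not taken from pipeline_methods): the lookup
# repeats[domain]['orthologs_dict'] above implies that pre.parse_domtblout_stats()
# returns a nested dict keyed by domain name, roughly of this shape:
#
#   repeats = {
#       'WD40': {                        # hypothetical domain key
#           'orthologs_dict': {          # one entry per protein with repeat hits
#               '<protein_id>': {...},   # per-protein repeat statistics
#           },
#       },
#   }
#
# Only the domain-level keying and the 'orthologs_dict' entry are relied on here.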
Example #2
def generate_label_string_genetree(orthologs_dict):
	species_mapping = pre.get_species_mapping_full(pre.species_mapping_file)
	
	template_string = ''
	for protein_id,orth_val in orthologs_dict.items():			
		ensembl_id = ''.join(filter(str.isalpha, str(protein_id)))  # alphabetic prefix identifies the species
		if ensembl_id in species_mapping: 
			name = species_mapping[ensembl_id]['common_name']
		else: name = ensembl_id
			
		template_string += protein_id+','+name+'\n'	
			
	return(template_string)


if not pre.file_notempty(itol_template_path+og_id+'_domains.txt'):
	template_string = 'DATASET_DOMAINS\nSEPARATOR COMMA\nDATASET_LABEL,domains\nCOLOR,#ff0000\nDATA\n'

	if og_id in pfam_summary:
		for domain, dom_val in pfam_summary[og_id].items():
			label = domain
			template_string += generate_domain_string(dom_val['orthologs_dict'], label)

	if og_id in meme_summary:
		label = ''
		template_string += generate_domain_string(meme_summary[og_id]['motif']['orthologs_dict'], label)			

	if og_id not in pfam_summary and og_id not in meme_summary: print("Unable to find ",og_id)
	
	with open(itol_template_path+og_id+'_domains.txt','w') as output:
		output.write(template_string)
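
# For reference: iTOL's DATASET_DOMAINS data lines list a leaf id, the protein
# length, and one or more domain segments as SHAPE|START|END|COLOR|LABEL, e.g.
# (illustrative values only):
#   <protein_id>,450,RE|10|120|#0000ff|WD40
# generate_domain_string() (defined elsewhere in this script) is assumed to emit
# lines of roughly this form for each ortholog.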
#Add identifier that has both meme and pfam annotation
combined_summary_df['identifier'] = np.where(
    combined_summary_df['og_id_domain'].isna(), combined_summary_df['og_id'],
    combined_summary_df['og_id_domain'])

combined_summary_df['genetree_id'] = combined_summary_df[
    'identifier'].str.split('_').str[0]
combined_summary_df['gene_id'] = combined_summary_df['identifier'].str.split(
    '_').str[1]
combined_summary_df['og_id'] = combined_summary_df[
    'genetree_id'] + '_' + combined_summary_df['gene_id']

#Add gene symbol if available (necessary for ExAC)
combined_summary_df['gene_symbol'] = ''
if pre.file_notempty(human_gene_symbol_file):
    with open(human_gene_symbol_file, 'r') as gene_data:
        mapping_gene_symbol = {}
        for line in gene_data:
            cols = line.strip().split('\t')
            mapping_gene_symbol[cols[0]] = cols[1]

    combined_summary_df['gene_symbol'] = combined_summary_df['gene_id'].map(
        mapping_gene_symbol)

# Calculate PRD score
mean_netto_dup = combined_summary_df['netto_dup'].mean()
combined_summary_df['PRD_score'] = (
    combined_summary_df['netto_dup'] -
    mean_netto_dup) / combined_summary_df['orthologs_cnt']
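
# Worked example with made-up numbers: an OG with netto_dup = 12, a dataset-wide
# mean_netto_dup of 4 and orthologs_cnt = 8 gets
#   PRD_score = (12 - 4) / 8 = 1.0
# i.e. the score is the excess of net duplications over the dataset mean,
# normalised by the number of orthologs in the group.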
Example #4
padding = 0

hmmr_tbl = str(sys.argv[1])  #hmmer output
repeats = {}
pfam_hits = {}
fasta_file = str(sys.argv[2])  #orthologs file
fasta = {}

if hmmr_tbl_input_path in hmmr_tbl:
    outfile_path = hmmr_tbl.replace(hmmr_tbl_input_path, fasta_output_path)
    outfile_path = outfile_path.replace(input_ext, output_ext)
    gene_name = hmmr_tbl[len(hmmr_tbl_input_path):-(len(input_ext))]
else:
    quit('Need valid .tblout from full pfam scan')

if pre.file_notempty(results_file):
    with open(results_file, 'r') as log:
        results_dict = json.load(log)
else:
    results_dict = {}
if gene_name not in results_dict: results_dict[gene_name] = {}

#init log
#init excluded file
if pre.file_notempty(excluded_ogs_file):
    with open(excluded_ogs_file, 'r') as log:
        excluded_ogs = json.load(log)
else:
    excluded_ogs = {}  # no existing exclusion log yet
    excluded_ogs['settings'] = {
        'repeat_threshold': repeat_threshold,
        'species_threshold': species_threshold
Example #5
from SPARQLWrapper import SPARQLWrapper, JSON
import json
import pipeline_methods as pre

orthologs_output_file = pre.orthologs_file

reference_species = 9606  #human

sparql = SPARQLWrapper('https://www.ebi.ac.uk/rdf/services/sparql')
sparql.setReturnFormat(JSON)

mapping = {}  # for each gene, save the species-ortholog combination; if there are multiple orthologs, append them to a list
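# Illustrative sketch (an assumption, not taken from the pipeline) of the
# structure described above: one entry per reference gene, mapping each species
# to its ortholog(s), with several orthologs collected in a list:
#
#   mapping['<gene_uri>'] = {
#       '<taxon_id>': ['<ortholog_id>'],
#       '<other_taxon_id>': ['<ortholog_id_1>', '<ortholog_id_2>'],
#   }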

if pre.file_notempty(pre.orthologs_file):
    print("Orthologs already downloaded, delete ", pre.orthologs_file,
          " to redo")
    quit()

species_list = pre.get_taxon_list('', [reference_species])
species_str = 'taxon:' + ' taxon:'.join([str(x) for x in species_list])

for sp in species_list:

    sparql.setQuery("""
		PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
		PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
		PREFIX owl: <http://www.w3.org/2002/07/owl#>
		PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
		PREFIX foaf: <http://xmlns.com/foaf/0.1/>
Example #6
    outfile_path = meme_html.replace(pre.meme_output_path,
                                     pre.denovo_meme_repeats_path)
    outfile_path = outfile_path.replace(input_ext, output_ext)

    gene_name = meme_html[len(pre.meme_output_path):-(len(input_ext))]
    print(gene_name)
else:
    quit('Need valid MEME output file')

with open(meme_html, 'r') as meme:
    meme_html = meme.read()
    json_text = re.compile('var data = ({.*?});', re.DOTALL)
    matches = json_text.search(meme_html)
    meme_json = json.loads(matches.group(1))

if pre.file_notempty(log_file):
    with open(log_file, 'r') as log:
        meme_results = json.load(log)
else:
    meme_results = {}
if gene_name not in meme_results:
    meme_results[gene_name] = {
        'consensus': meme_json['motifs'][0]['id'],
        'nsites': meme_json['motifs'][0]['nsites']
    }

with open(log_file, 'w') as log:
    log.write(json.dumps(meme_results))

#minimum motifs to make a tree? 10?
if meme_json['motifs'][0]['nsites'] < 10: quit()
Example #7
from SPARQLWrapper import SPARQLWrapper, JSON
import json
import pipeline_methods as pre

filtered_ogs_file = pre.orthologs_filtered_file
log_file = pre.root + 'log_retrieve_genetrees.txt'

species_list = pre.get_taxon_list('', [])
species_str = '&prune_taxon='.join([str(x) for x in species_list])

ensembl_server = "http://rest.ensembl.org"
sparql = SPARQLWrapper('https://www.ebi.ac.uk/rdf/services/sparql')
sparql.setReturnFormat(JSON)

genes_to_genetrees = {}

if pre.file_notempty(filtered_ogs_file):
    with open(filtered_ogs_file, 'r') as filtered_ogs:
        mapping = json.load(filtered_ogs)
else:
    print("Requires input file ", pre.orthologs_filtered_file)
    quit()

#reset log
with open(log_file, 'w') as log:
    log.write('')

#Retrieve gene trees for OGs
# extract subtree containing all orthologs
for gene_uri in mapping:

    #json contains raw input of REST
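
    # A minimal sketch (assumptions only; the loop body is truncated here) of how
    # a pruned gene tree could be fetched from the Ensembl REST API:
    #
    #   gene_id = gene_uri.rsplit('/', 1)[-1]   # assumes URIs like .../ensembl/ENSG...
    #   url = (ensembl_server + '/genetree/member/id/' + gene_id
    #          + '?content-type=application/json&prune_taxon=' + species_str)
    #   genetree_json = requests.get(url).json()  # would also need `import requests`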
Example #8
padding = 5

hmmr_tbl = str(sys.argv[1]) #hmmer output
repeats = {}
pfam_hits = {}
fasta_file = str(sys.argv[2]) #orthologs file
fasta = {}

if hmmr_tbl_input_path in hmmr_tbl:
	outfile_path = hmmr_tbl.replace(hmmr_tbl_input_path, fasta_output_path) 
	outfile_path = outfile_path.replace(input_ext, output_ext) 
	gene_name = hmmr_tbl[ len(hmmr_tbl_input_path):-(len(input_ext)) ]
else: 
	quit('Need valid .tblout from full pfam scan')

if pre.file_notempty(results_file):
	with open(results_file,'r') as log:
		results_dict = json.load(log)
else:
	results_dict = {}	
if gene_name not in results_dict: results_dict[gene_name]={}

#read clans
pfam_clans = {}

#read fasta
fasta = pre.read_fasta(fasta_file)

#hmm data
repeats, pfam_hits = pre.parse_domtblout(hmmr_tbl, 'iterative', spacing)
Example #9
## Also writes fasta files for orthologous groups

import requests, sys, json, os.path
import pipeline_methods as pre

ensembl_path = pre.ensembl_path
genetree_path = pre.genetree_path
fasta_path = pre.fasta_path
filtered_ogs_file = pre.orthologs_filtered_file
log_file = pre.log_parse_genetree_file

gene_uri_prefix = 'http://rdf.ebi.ac.uk/resource/ensembl/'
gene_id = ''  #human gene id
genetree_json_file = sys.argv[1]  #api output

if ensembl_path in genetree_json_file and pre.file_notempty(
        genetree_json_file):
    gene_id = genetree_json_file[len(ensembl_path):-len('.json')]
    with open(genetree_json_file, 'r') as gt_file:
        genetree_json = json.load(gt_file)
else:
    with open(log_file, mode='a') as log:
        log.write("Could not parse: " + genetree_json_file + "\n")
    quit()

if pre.file_notempty(filtered_ogs_file):
    with open(filtered_ogs_file, 'r') as filtered_ogs:
        mapping = json.load(filtered_ogs)
else:
    quit()

#Extract subtree containing all orthologs