# Collect per-OG Pfam repeat statistics for every treefix tree that has a
# non-empty .tblout file, and dump them as JSON.
pfam_initial_filtered_output_file = pre.pfam_repeat_stats_initial_filtered_file

# OGs that have repeats in fewer than 4 proteins can be reported and, if
# desired, excluded. The flag and the two list files below are only assigned
# here; nothing in this part of the script reads them.
flag_report_lt4 = False
pfam_exclude_ogs_file = pre.root + "exclude_ogs_lt4_pfam.lst"
meme_exclude_ogs_file = pre.root + "exclude_ogs_lt4_meme.lst"

## Generate PFAM summary
pfam_summary = {}
excluded_ogs = []
pfam_file_list = glob.glob(pre.pfam_treefix_path + '*.treefix.nhx.tree')

for pfam_file in pfam_file_list:
    # The OG/hit id is the tree path minus the directory prefix and suffix.
    og_hit_id = pfam_file[len(pre.pfam_treefix_path):-len('.treefix.nhx.tree')]
    pfam_tblout = pre.pfam_tblout_path + og_hit_id + '.tblout'

    if pre.file_notempty(pfam_tblout):
        repeats = pre.parse_domtblout_stats(pfam_tblout)
        # Everything after the second '_' of the id is used as the domain key.
        domain = '_'.join(og_hit_id.split('_')[2:])

        if len(repeats[domain]['orthologs_dict']) >= 4:
            pfam_summary[og_hit_id] = repeats
        else:
            # Fewer than 4 proteins carry the repeat: leave the OG out.
            excluded_ogs.append(og_hit_id)

print("Completed Pfam output: ", len(pfam_file_list), " entries in stats: ",
      len(pfam_summary))

# NOTE(review): pfam_output_file is not assigned in this part of the script;
# presumably it is defined earlier - confirm it is not meant to be
# pfam_initial_filtered_output_file.
with open(pre.root + pfam_output_file, 'w') as output:
    output.write(json.dumps(pfam_summary))
def generate_label_string_genetree(orthologs_dict):
    """Return one ``protein_id,name`` line per ortholog.

    The alphabetic characters of each protein id are used as the key into the
    species mapping loaded from ``pre.species_mapping_file``. When the key is
    present, its ``'common_name'`` is used as the label; otherwise the
    alphabetic key itself is used.

    Args:
        orthologs_dict: mapping whose keys are protein ids (the values are
            not used).

    Returns:
        str: the concatenated lines, each terminated by a newline; an empty
        string when ``orthologs_dict`` is empty.
    """
    species_mapping = pre.get_species_mapping_full(pre.species_mapping_file)
    label_lines = []
    for protein_id in orthologs_dict:
        # filter() returns a lazy iterator on Python 3, which never matches a
        # mapping key and cannot be concatenated to a str; build the string
        # explicitly so the lookup works on both Python 2 and 3.
        ensembl_id = ''.join(ch for ch in str(protein_id) if ch.isalpha())
        if ensembl_id in species_mapping:
            name = species_mapping[ensembl_id]['common_name']
        else:
            name = ensembl_id
        label_lines.append(protein_id + ',' + name + '\n')
    return ''.join(label_lines)


# Write the domains template for og_id unless a non-empty one already exists.
domains_template_file = itol_template_path + og_id + '_domains.txt'
if not pre.file_notempty(domains_template_file):
    template_string = 'DATASET_DOMAINS\nSEPARATOR COMMA\nDATASET_LABEL,domains\nCOLOR,#ff0000\nDATA\n'

    # Pfam hits: one set of domain lines per domain, labelled with its name.
    if og_id in pfam_summary:
        for domain, dom_val in pfam_summary[og_id].items():
            label = domain
            template_string += generate_domain_string(dom_val['orthologs_dict'], label)

    # MEME motif: added without a label.
    if og_id in meme_summary:
        label = ''
        template_string += generate_domain_string(meme_summary[og_id]['motif']['orthologs_dict'], label)

    if og_id not in pfam_summary and og_id not in meme_summary:
        print("Unable to find ",og_id)

    # The file is written even when no annotation was found; it then holds
    # only the header.
    with open(domains_template_file, 'w') as output:
        output.write(template_string)
# Add an identifier which covers both MEME and Pfam annotation: rows that have
# an 'og_id_domain' use it, all other rows fall back to 'og_id'.
combined_summary_df['identifier'] = np.where(
    combined_summary_df['og_id_domain'].isna(),
    combined_summary_df['og_id'],
    combined_summary_df['og_id_domain'])

# The identifier is '_'-separated: field 0 is stored as the gene tree id and
# field 1 as the gene id. 'og_id' is rebuilt from these two fields, so any
# further suffix of the identifier is dropped from it.
identifier_parts = combined_summary_df['identifier'].str.split('_')
combined_summary_df['genetree_id'] = identifier_parts.str[0]
combined_summary_df['gene_id'] = identifier_parts.str[1]
combined_summary_df['og_id'] = (
    combined_summary_df['genetree_id'] + '_' + combined_summary_df['gene_id'])

# Add gene symbol if available (necessary for ExAC)
combined_summary_df['gene_symbol'] = ''
if pre.file_notempty(human_gene_symbol_file):
    mapping_gene_symbol = {}
    with open(human_gene_symbol_file, 'r') as gene_data:
        for line in gene_data:
            cols = line.strip().split('\t')
            if len(cols) < 2:
                # Blank line, or a gene without a symbol (strip() removes the
                # trailing tab): skip it instead of failing with IndexError.
                continue
            mapping_gene_symbol[cols[0]] = cols[1]
    # Gene ids that are missing from the mapping end up as NaN.
    combined_summary_df['gene_symbol'] = combined_summary_df['gene_id'].map(
        mapping_gene_symbol)

# Calculate PRD score: deviation of 'netto_dup' from its mean over all rows,
# divided by the row's 'orthologs_cnt'.
mean_netto_dup = combined_summary_df['netto_dup'].mean()
combined_summary_df['PRD_score'] = (
    combined_summary_df['netto_dup'] - mean_netto_dup
) / combined_summary_df['orthologs_cnt']
padding = 0 hmmr_tbl = str(sys.argv[1]) #hammer output repeats = {} pfam_hits = {} fasta_file = str(sys.argv[2]) #orthologs file fasta = {} if hmmr_tbl_input_path in hmmr_tbl: outfile_path = hmmr_tbl.replace(hmmr_tbl_input_path, fasta_output_path) outfile_path = outfile_path.replace(input_ext, output_ext) gene_name = hmmr_tbl[len(hmmr_tbl_input_path):-(len(input_ext))] else: quit('Need valid .tblout from full pfam scan') if pre.file_notempty(results_file): with open(results_file, 'r') as log: results_dict = json.load(log) else: results_dict = {} if gene_name not in results_dict: results_dict[gene_name] = {} #init log #init excluded file if pre.file_notempty(excluded_ogs_file): with open(excluded_ogs_file, 'r') as log: excluded_ogs = json.load(log) else: excluded_ogs['settings'] = { 'repeat_threshold': repeat_threshold, 'species_threshold': species_threshold
from SPARQLWrapper import SPARQLWrapper, JSON import json import pipeline_methods as pre orthologs_output_file = pre.orthologs_file reference_species = 9606 #human sparql = SPARQLWrapper('https://www.ebi.ac.uk/rdf/services/sparql') sparql.setReturnFormat(JSON) mapping = { } #for each gene, save species - ortholog combination, if multiple append to list. if pre.file_notempty(pre.orthologs_file): print("Orthologs already downloaded, delete ", pre.orthologs_file, " to redo") quit() species_list = pre.get_taxon_list('', [reference_species]) species_str = 'taxon:' + ' taxon:'.join([str(x) for x in species_list]) for sp in species_list: sparql.setQuery(""" PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> PREFIX owl: <http://www.w3.org/2002/07/owl#> PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> PREFIX foaf: <http://xmlns.com/foaf/0.1/>
outfile_path = meme_html.replace(pre.meme_output_path, pre.denovo_meme_repeats_path) outfile_path = outfile_path.replace(input_ext, output_ext) gene_name = meme_html[len(pre.meme_output_path):-(len(input_ext))] print(gene_name) else: quit('Need valid MEME output file') with open(meme_html, 'r') as meme: meme_html = meme.read() json_text = re.compile('var data = ({.*?});', re.DOTALL) matches = json_text.search(meme_html) meme_json = json.loads(matches.group(1)) if pre.file_notempty(log_file): with open(log_file, 'r') as log: meme_results = json.load(log) else: meme_results = {} if gene_name not in meme_results: meme_results[gene_name] = { 'consensus': meme_json['motifs'][0]['id'], 'nsites': meme_json['motifs'][0]['nsites'] } with open(log_file, 'w') as log: log.write(json.dumps(meme_results)) #minimum motifs to make a tree? 10? if meme_json['motifs'][0]['nsites'] < 10: quit()
from SPARQLWrapper import SPARQLWrapper, JSON import pipeline_methods as pre filtered_ogs_file = pre.orthologs_filtered_file log_file = pre.root + 'log_retrieve_genetrees.txt' species_list = pre.get_taxon_list('', []) species_str = '&prune_taxon='.join([str(x) for x in species_list]) ensembl_server = "http://rest.ensembl.org" sparql = SPARQLWrapper('https://www.ebi.ac.uk/rdf/services/sparql') sparql.setReturnFormat(JSON) genes_to_genetrees = {} if (pre.file_notempty(filtered_ogs_file)): with open(filtered_ogs_file, 'r') as filtered_ogs: mapping = json.load(filtered_ogs) else: print("Requires input file ", pre.orthologs_filtered_file) quit() #reset log with open(log_file, 'w') as log: log.write('') #Retrieve gene trees for OGs # extract subtree containing all orthologs for gene_uri in mapping: #json contains raw input of REST
# Set-up for parsing one HMMER .tblout file: resolve the command-line inputs,
# derive the output path and gene name, load earlier results, then parse.
# NOTE(review): padding is not used in this part of the script, while the
# parser call at the bottom passes `spacing`, which is not assigned here -
# confirm both are intended.
padding = 5

hmmr_tbl = str(sys.argv[1])  # hmmer table output (first argument)
fasta_file = str(sys.argv[2])  # orthologs file (second argument)

# Placeholders; all three are replaced further down.
repeats = {}
pfam_hits = {}
fasta = {}

# Only tables located under the expected input path are accepted.
if hmmr_tbl_input_path not in hmmr_tbl:
    quit('Need valid .tblout from full pfam scan')

# Same name under the output path, with the output extension.
outfile_path = hmmr_tbl.replace(
    hmmr_tbl_input_path, fasta_output_path
).replace(input_ext, output_ext)
# The gene name is the table path minus the input path prefix and extension.
gene_name = hmmr_tbl[len(hmmr_tbl_input_path):-len(input_ext)]

# Start from the stored results when there are any, else from scratch.
results_dict = {}
if pre.file_notempty(results_file):
    with open(results_file, 'r') as log:
        results_dict = json.load(log)
if gene_name not in results_dict:
    results_dict[gene_name] = {}

# read clans
pfam_clans = {}

# read fasta
fasta = pre.read_fasta(fasta_file)

# hmm data
repeats, pfam_hits = pre.parse_domtblout(hmmr_tbl, 'iterative', spacing)
## Also writes fasta file for orthologous groups
import json
import os.path
import sys

import requests

import pipeline_methods as pre

# Locations and file names taken from the shared pipeline settings.
ensembl_path = pre.ensembl_path
genetree_path = pre.genetree_path
fasta_path = pre.fasta_path
filtered_ogs_file = pre.orthologs_filtered_file
log_file = pre.log_parse_genetree_file

gene_uri_prefix = 'http://rdf.ebi.ac.uk/resource/ensembl/'
gene_id = ''  # human gene id
genetree_json_file = sys.argv[1]  # api output

# The input must be a non-empty file whose path contains ensembl_path; the
# gene id is the path minus the ensembl_path prefix and the '.json' suffix.
if ensembl_path in genetree_json_file and pre.file_notempty(
        genetree_json_file):
    gene_id = genetree_json_file[len(ensembl_path):-len('.json')]
    with open(genetree_json_file, 'r') as gt_file:
        genetree_json = json.load(gt_file)
else:
    # Append in text mode: the message is a str, and writing a str to a file
    # opened in binary mode ('ab') raises TypeError on Python 3.
    with open(log_file, mode='a') as log:
        log.write("Could not parse: " + genetree_json_file + "\n")
    # sys.exit() rather than the interactive-only quit() helper; the exit
    # status is unchanged.
    sys.exit()

# The filtered orthologs mapping is required; stop when it is missing or empty.
if pre.file_notempty(filtered_ogs_file):
    with open(filtered_ogs_file, 'r') as filtered_ogs:
        mapping = json.load(filtered_ogs)
else:
    sys.exit()

#Extract subtree containing all orthologs