# NOTE: these snippets assume "import tempfile as tmp" at module level; write_to_tempfile,
# generate_msa, generate_hmm, get_phmm_score, run_exonerate etc. are project utilities
# defined elsewhere.
def compute_full_phmm(self, location):
    """Realigns the FASTA sequences stored in this object and builds the pHMM at
    a different location."""
    msa_string = self.generate_msa_string(self.fasta_hash.keys())
    with tmp.NamedTemporaryFile() as msa_tmp:
        write_to_tempfile(msa_tmp.name, msa_string)
        generate_hmm(location, msa_tmp.name)
    return location
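# write_to_tempfile and generate_hmm are project helpers; the following is a minimal
# sketch of what they are assumed to do (the real implementations live elsewhere in
# the repo): write_to_tempfile writes a string to an already-created named temp file
# so external tools can read it, and generate_hmm wraps HMMER's hmmbuild.
import subprocess


def write_to_tempfile(path, string):
    # overwrite the named temp file and close it so the data is on disk before
    # external tools read it
    with open(path, "w") as handle:
        handle.write(string)


def generate_hmm(hmm_path, msa_path):
    # hmmbuild <hmmfile_out> <msafile>; requires HMMER on the PATH
    subprocess.run(["hmmbuild", hmm_path, msa_path],
                   check=True, stdout=subprocess.DEVNULL)
    return hmm_path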
def best_exonerate_prediction2(region_fasta, query_fasta, dir_path, hmm):
    with tmp.NamedTemporaryFile() as reg_file:
        write_to_tempfile(reg_file.name, region_fasta)
        # first pass: non-exhaustive protein2genome alignment of all query
        # proteins against the candidate region (written to a temp file)
        ex_obj = run_exonerate("-m p2g -E no", "{}.exon_p2g", dir_path,
                               reg_file.name, query_fasta)
        if ex_obj:
            all_proteins = all_proteins_to_fasta_string(ex_obj)
            TP_scores = markov_model_scoring2(all_proteins, hmm)
            if TP_scores:
                print(TP_scores)
                max_score = max(TP_scores.values())
                # keep every query whose score is within 90% of the best score
                max_score_header = set(
                    header.split(";")[0]
                    for header, score in TP_scores.items()
                    if score >= max_score * 0.90
                )
                fasta_hash = hash_fasta(query_fasta)  # do that before
                with tmp.NamedTemporaryFile() as q_file:
                    max_val_fasta = "\n".join(
                        "{}\n{}".format(header, fasta_hash[header])
                        for header in max_score_header
                    )
                    write_to_tempfile(q_file.name, max_val_fasta)
                    ex_name = "{}.exon".format(q_file.name)
                    # second pass: realign only the best-scoring queries
                    ex_obj = run_exonerate("-m p2g -E no", ex_name, dir_path,
                                           reg_file.name, q_file.name)
                    return ex_obj
    return None
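# A minimal usage sketch for the two-pass prediction above; the file names are
# hypothetical. region_fasta is the FASTA string of a candidate genomic region,
# query_fasta and hmm are paths to the cluster's protein FASTA and pHMM:
#
#     region_seq = ">contig1_1200_3400\nATGGCT..."
#     best = best_exonerate_prediction2(region_seq, "cluster42_proteins.fa",
#                                       "/tmp/run_dir", "cluster42.hmm")
#     if best is not None:
#         ...  # inspect the predictions of the best-scoring query proteins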
def markov_model_scoring2(fasta_string, hmm):
    if hmm:
        with tmp.NamedTemporaryFile() as ex_file:
            write_to_tempfile(ex_file.name, fasta_string)
            score_hash = get_phmm_score(hmm, ex_file.name)
            if score_hash:
                return score_hash
    return None
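# get_phmm_score is assumed to wrap HMMER's hmmsearch and return a dict mapping
# ">header" to the best full-sequence bit score (None when nothing hits). A minimal
# sketch under that assumption; in --tblout output the bit score is the sixth
# whitespace-separated column:
import subprocess
import tempfile as tmp


def get_phmm_score(hmm_path, fasta_path):
    scores = {}
    with tmp.NamedTemporaryFile() as tbl:
        subprocess.run(["hmmsearch", "--noali", "--tblout", tbl.name,
                        hmm_path, fasta_path],
                       check=True, stdout=subprocess.DEVNULL)
        with open(tbl.name) as table:
            for line in table:
                if line.startswith("#"):
                    continue
                fields = line.split()
                header = ">" + fields[0]
                score = float(fields[5])
                # keep the best score per target sequence
                if score > scores.get(header, float("-inf")):
                    scores[header] = score
    return scores or None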
def generate_msa_string(self, rest_prot):
    """Returns an MSA as a string in FASTA format, derived from a list of headers."""
    seq_list = [
        "{}\n{}".format(header, self.fasta_hash[header]) for header in rest_prot
    ]
    with tmp.NamedTemporaryFile() as r_tmp:
        write_to_tempfile(r_tmp.name, "\n".join(seq_list))
        list_msa = generate_msa(r_tmp.name)
    return "\n".join(list_msa)
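# generate_msa is assumed to run an external aligner and return the aligned FASTA
# as a list of lines; a minimal sketch using MAFFT (any aligner that writes aligned
# FASTA to stdout would work the same way):
import subprocess


def generate_msa(fasta_path):
    result = subprocess.run(["mafft", "--auto", "--quiet", fasta_path],
                            check=True, capture_output=True, text=True)
    return result.stdout.splitlines()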
def bulk_hmm_scoring_model(seq_header_list, cluster_hash):
    # seq_header_list: headers of the cluster members; cluster_hash: header -> sequence
    with tempfile.NamedTemporaryFile() as seq_tmp:
        seq_list = []
        for header in seq_header_list:
            seq_list.append(header)
            seq_list.append(cluster_hash[header])
        write_to_tempfile(seq_tmp.name, "\n".join(seq_list))
        # align the cluster, build a pHMM from the alignment and score the
        # cluster's own members against it
        msa = "\n".join(generate_msa(seq_tmp.name))
        with tempfile.NamedTemporaryFile() as msa_tmp:
            write_to_tempfile(msa_tmp.name, msa)
            with tempfile.NamedTemporaryFile() as tmp_hmm:
                hmm = generate_hmm(tmp_hmm.name, msa_tmp.name)
                score_list = many_hmm_scores(hmm, seq_tmp.name)
    return score_list
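# many_hmm_scores is assumed to behave like get_phmm_score but to return the plain
# list of bit scores of all cluster members against their own pHMM, e.g. to derive
# the per-cluster score cutoff used in find_best_exonerate_result below. A sketch
# under that assumption:
def many_hmm_scores(hmm_path, fasta_path):
    score_hash = get_phmm_score(hmm_path, fasta_path)
    return list(score_hash.values()) if score_hash else []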
def find_best_exonerate_result(region_tuple, region_fasta, group, cluster, dir_path):
    """First aligns all proteins in p2g mode, scores them against the HMM and
    extracts the highest-scoring predictions. The corresponding proteins are then
    re-aligned in p2g mode with -E yes. The returned exonerate object holds those
    (few, best) predictions, which need further filtering."""
    query_fasta = data_base.group_by_cluster_to_fasta_file[group][cluster]
    query_hash = data_base.group_by_cluster_to_fasta_hash[group][cluster]
    hmm = data_base.group_by_cluster_to_hmm[group][cluster]
    cutoff = data_base.group_by_cluster_to_score_cutoff[group][cluster]
    length_range = data_base.group_by_cluster_to_length_range[group][cluster]
    with tmp.NamedTemporaryFile() as reg_file:
        write_to_tempfile(reg_file.name, region_fasta)
        ex_obj = run_exonerate("-m p2g -E no", "{}.exon_p2g", dir_path,
                               reg_file.name, query_fasta)
        try:
            TP_scores = markov_model_scoring(
                all_proteins_to_fasta_string(ex_obj), hmm)
            if not quick:  # "quick" is assumed to be a module- or outer-scope flag
                max_score = max(TP_scores.values())
                max_header = set(
                    header.split(";")[0]
                    for header, score in TP_scores.items()
                    if score == max_score
                )
                with tmp.NamedTemporaryFile() as q_file:
                    max_val_fasta = "\n".join(
                        "{}\n{}".format(header, query_hash[header])
                        for header in max_header
                    )
                    write_to_tempfile(q_file.name, max_val_fasta)
                    ex_obj = run_exonerate("-m p2g -E yes",
                                           "{}.exon".format(q_file.name),
                                           dir_path, reg_file.name, q_file.name)
                    TP_scores = markov_model_scoring(
                        all_proteins_to_fasta_string(ex_obj), hmm)
            best_pred = []
            for key_tuple in ex_obj.target_dna:
                score_id = ">" + key_tuple.query + ";{}".format(key_tuple.idx)
                pred_obj = PredictionObject(region_tuple, TP_scores[score_id],
                                            cluster, cutoff, length_range)
                pred_obj.infer_data_from_exonerate_obj(ex_obj, key_tuple)
                best_pred.append(pred_obj)
            filtered, passed = isolate_overlapping_predictions(
                sorted(best_pred, key=lambda x: x.score, reverse=True))
            return ex_obj, passed
        except (AttributeError, TypeError, KeyError):
            return None, None
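# isolate_overlapping_predictions is expected to split the score-sorted prediction
# list into overlapping (dropped) and non-overlapping (passed) predictions. A greedy
# sketch, assuming hypothetical contig/start/end attributes on PredictionObject:
def isolate_overlapping_predictions(predictions_by_score):
    passed, filtered = [], []
    for pred in predictions_by_score:
        overlaps = any(pred.contig == kept.contig
                       and pred.start <= kept.end
                       and kept.start <= pred.end
                       for kept in passed)
        (filtered if overlaps else passed).append(pred)
    return filtered, passed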
def iterative_score_computation(self):
    # leave-one-out: score each protein against a pHMM built from all other members
    for idx in range(len(self.fasta_hash.keys())):
        rest_prot = list(self.fasta_hash.keys())
        query = rest_prot.pop(idx)
        with tmp.NamedTemporaryFile() as q_tmp:
            write_to_tempfile(q_tmp.name,
                              query.split()[0] + "\n" + self.fasta_hash[query])
            msa_string = self.generate_msa_string(rest_prot)
            with tmp.NamedTemporaryFile() as msa_tmp:
                write_to_tempfile(msa_tmp.name, msa_string)
                with tmp.NamedTemporaryFile() as hmm_tmp:
                    generate_hmm(hmm_tmp.name, msa_tmp.name)
                    try:
                        score_dict = get_phmm_score(hmm_tmp.name, q_tmp.name)
                    except IndexError:
                        continue
                    self.score_dict.update(score_dict)
    return self.score_dict
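# A usage sketch, assuming the methods above belong to a hypothetical cluster object
# that keeps its member proteins in fasta_hash and collects results in score_dict:
#
#     cluster.score_dict = {}
#     leave_one_out_scores = cluster.iterative_score_computation()
#
# Each protein is scored against a pHMM built from all remaining members, giving an
# empirical score distribution for the cluster.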