예제 #1
0
 def compute_full_phmm(self, location):
     """Align every stored sequence and hand the alignment to ``generate_hmm``.

     All headers of ``self.fasta_hash`` are passed to
     ``self.generate_msa_string``; the resulting alignment text is written
     to a temporary file, and ``generate_hmm`` is called with ``location``
     and that temporary file's path (``location`` is presumably where the
     HMM ends up -- confirm against ``generate_hmm``).

     Returns ``location`` unchanged.
     """
     alignment_text = self.generate_msa_string(self.fasta_hash.keys())
     with tmp.NamedTemporaryFile() as alignment_file:
         alignment_path = alignment_file.name
         write_to_tempfile(alignment_path, alignment_text)
         # The temporary alignment only has to live for this call.
         generate_hmm(location, alignment_path)
     return location
예제 #2
0
def best_exonerate_prediction2(region_fasta, query_fasta, dir_path, hmm):
    """Run exonerate twice and return the result for the top-scoring queries.

    The region sequence text is written to a temporary file and aligned
    against ``query_fasta`` with ``-m p2g -E no``. The resulting proteins are
    scored with ``markov_model_scoring2``; every query whose score is at
    least 90% of the maximum is written to a second temporary file and
    aligned against the region again.

    Args:
        region_fasta: FASTA text of the target region (it is written to a
            temporary file, so it must be content, not a path).
        query_fasta: path of the query FASTA file; also read by
            ``hash_fasta``.
        dir_path: forwarded unchanged to ``run_exonerate``.
        hmm: forwarded unchanged to ``markov_model_scoring2``.

    Returns:
        Whatever the second ``run_exonerate`` call returns, or ``None`` when
        the first run or the scoring yields nothing.

    Raises:
        KeyError: if a selected header is missing from the hashed
            ``query_fasta``.
    """
    with tmp.NamedTemporaryFile() as reg_file:
        write_to_tempfile(reg_file.name, region_fasta)
        # Fix: hand exonerate the temp file that was just written. The
        # original passed the raw ``region_fasta`` text and never used
        # ``reg_file``.
        # NOTE(review): the "{}.exon_p2g" template is passed unformatted;
        # presumably ``run_exonerate`` fills it in -- confirm.
        ex_obj = run_exonerate("-m p2g -E no", "{}.exon_p2g", dir_path,
                               reg_file.name, query_fasta)
        if not ex_obj:
            return None

        all_proteins = all_proteins_to_fasta_string(ex_obj)
        tp_scores = markov_model_scoring2(all_proteins, hmm)
        if not tp_scores:
            return None

        print(tp_scores)
        max_score = max(tp_scores.values())
        # Keep the part of each header before the first ";".
        # NOTE(review): if ``max_score`` is negative, ``max_score * 0.90`` is
        # larger than ``max_score`` and no header passes -- confirm scores
        # are non-negative.
        max_score_header = {
            header.split(";")[0]
            for header, score in tp_scores.items()
            if score >= max_score * 0.90
        }
        fasta_hash = hash_fasta(query_fasta)
        with tmp.NamedTemporaryFile() as q_file:
            max_val_fasta = "\n".join(
                "{}\n{}".format(header, fasta_hash[header])
                for header in max_score_header
            )
            write_to_tempfile(q_file.name, max_val_fasta)
            ex_name = "{}.exon".format(q_file.name)
            # Both temp files must still exist while exonerate runs, so the
            # call stays inside both ``with`` blocks.
            return run_exonerate("-m p2g -E no ", ex_name, dir_path,
                                 reg_file.name, q_file.name)
예제 #3
0
def markov_model_scoring2(fasta_string, hmm):
    """Score ``fasta_string`` against ``hmm`` via ``get_phmm_score``.

    The text is written to a temporary file whose path is passed to
    ``get_phmm_score`` together with ``hmm``.

    Returns the score mapping when it is truthy; returns ``None`` when
    ``hmm`` is falsy or the scoring result is empty/falsy.
    """
    if not hmm:
        return None
    with tmp.NamedTemporaryFile() as protein_file:
        write_to_tempfile(protein_file.name, fasta_string)
        scores = get_phmm_score(hmm, protein_file.name)
    # Normalise every falsy result (empty dict, None, ...) to None.
    return scores if scores else None
예제 #4
0
 def generate_msa_string(self, rest_prot):
     """Return a multiple sequence alignment (FASTA text) for the given headers.

     ``rest_prot`` is an iterable of headers that must all be keys of
     ``self.fasta_hash``. Each header and its sequence are joined into one
     text block, written to a temporary file and passed to
     ``generate_msa``; the lines it returns are joined with newlines.

     Raises ``KeyError`` if a header is not in ``self.fasta_hash``.
     """
     # Build the unaligned records before any temp file is created.
     records = [
         "{}\n{}".format(name, self.fasta_hash[name]) for name in rest_prot
     ]
     unaligned_text = "\n".join(records)
     with tmp.NamedTemporaryFile() as unaligned_file:
         write_to_tempfile(unaligned_file.name, unaligned_text)
         aligned_lines = generate_msa(unaligned_file.name)
     return "\n".join(aligned_lines)
예제 #5
0
def bulk_hmm_scoring_model():
    """Build an HMM from the current sequences and score those sequences with it.

    Reads ``seq_header_list`` and ``cluster_hash`` from the enclosing scope.
    The header/sequence pairs are written to a temporary file, aligned with
    ``generate_msa``, turned into a model by ``generate_hmm`` and finally
    scored through ``many_hmm_scores`` against the same sequence file.

    Returns whatever ``many_hmm_scores`` returns.
    """
    with tempfile.NamedTemporaryFile() as seq_file:
        records = []
        for name in seq_header_list:
            records.extend((name, cluster_hash[name]))
        write_to_tempfile(seq_file.name, "\n".join(records))

        alignment_lines = generate_msa(seq_file.name)
        alignment_text = "\n".join(alignment_lines)
        with tempfile.NamedTemporaryFile() as alignment_file:
            write_to_tempfile(alignment_file.name, alignment_text)
            with tempfile.NamedTemporaryFile() as model_file:
                model = generate_hmm(model_file.name, alignment_file.name)
                # Score while all three temp files still exist.
                return many_hmm_scores(model, seq_file.name)
예제 #6
0
def find_best_exonerate_result(region_tuple, region_fasta, group, cluster,
                               dir_path):
    """Align a cluster's proteins to a region and return the surviving predictions.

    Steps:
      1. Look up the cluster's query FASTA file, FASTA hash, HMM, score
         cutoff and length range in ``data_base`` (keyed by ``group`` then
         ``cluster``).
      2. Write ``region_fasta`` to a temporary file and run exonerate with
         ``-m p2g -E no`` against the query FASTA file.
      3. Score the predicted proteins with ``markov_model_scoring``.
      4. Unless ``quick`` is truthy, keep only the queries whose score equals
         the maximum, write them to a second temporary file, re-run
         exonerate with ``-m p2g -E yes`` and score again.
      5. Wrap every entry of ``ex_obj.target_dna`` in a ``PredictionObject``
         and pass them, sorted by descending score, to
         ``isolate_overlapping_predictions``.

    Returns:
        ``(ex_obj, passed)``: the last exonerate result and the second value
        returned by ``isolate_overlapping_predictions``. Returns
        ``(None, None)`` if an ``AttributeError``, ``TypeError`` or
        ``KeyError`` is raised during steps 3-5.

    Raises:
        KeyError: if ``group``/``cluster`` is missing from one of the
            ``data_base`` mappings (these lookups sit outside the ``try``).

    NOTE(review): ``quick`` is not defined in this block; presumably a
    module-level flag -- confirm.
    """
    query_fasta = data_base.group_by_cluster_to_fasta_file[group][cluster]
    query_hash = data_base.group_by_cluster_to_fasta_hash[group][cluster]
    hmm = data_base.group_by_cluster_to_hmm[group][cluster]
    cutoff = data_base.group_by_cluster_to_score_cutoff[group][cluster]
    length_range = data_base.group_by_cluster_to_length_range[group][cluster]
    with tmp.NamedTemporaryFile() as reg_file:
        write_to_tempfile(reg_file.name, region_fasta)
        # NOTE(review): the "{}.exon_p2g" template is passed unformatted;
        # presumably run_exonerate fills it in -- confirm.
        ex_obj = run_exonerate("-m p2g -E no", "{}.exon_p2g", dir_path,
                               reg_file.name, query_fasta)
        # Everything below relies on the except clause as control flow: any
        # AttributeError/TypeError/KeyError becomes a (None, None) result.
        try:
            TP_scores = markov_model_scoring(
                all_proteins_to_fasta_string(ex_obj), hmm)
            if not quick:
                max_score = max(list(TP_scores.values()))
                # Score keys are split at the first ";" to get the query
                # header used as key in query_hash.
                max_header = set([
                    header.split(";")[0]
                    for header, score in TP_scores.items()
                    if score == max_score
                ])
                with tmp.NamedTemporaryFile() as q_file:
                    max_val_fasta = "\n".join([
                        "{}\n{}".format(header, query_hash[header])
                        for header in max_header
                    ])
                    write_to_tempfile(q_file.name, max_val_fasta)
                    # Second pass: only the top-scoring queries, -E yes.
                    ex_obj = run_exonerate("-m p2g -E yes",
                                           "{}.exon".format(q_file.name),
                                           dir_path, reg_file.name,
                                           q_file.name)
                    # Re-score so TP_scores matches the new ex_obj.
                    TP_scores = markov_model_scoring(
                        all_proteins_to_fasta_string(ex_obj), hmm)
            best_pred = []
            for key_tuple in ex_obj.target_dna:
                # Rebuild the score key as ">" + query + ";" + idx.
                score_id = ">" + key_tuple.query + ";{}".format(key_tuple.idx)
                pred_obj = PredictionObject(region_tuple, TP_scores[score_id],
                                            cluster, cutoff, length_range)
                pred_obj.infer_data_from_exonerate_obj(ex_obj, key_tuple)
                best_pred.append(pred_obj)
            # Highest score first; `filtered` is not used afterwards.
            filtered, passed = isolate_overlapping_predictions(
                sorted(best_pred, key=lambda x: x.score, reverse=True))
            return ex_obj, passed
        except (AttributeError, TypeError, KeyError):
            return None, None
예제 #7
0
 def iterative_score_computation(self):
     """Score each stored sequence against an HMM built from all the others.

     For every header in ``self.fasta_hash`` the sequence is written to a
     temporary query file (its header is cut at the first whitespace). The
     remaining headers are aligned with ``self.generate_msa_string``, the
     alignment is passed to ``generate_hmm``, and ``get_phmm_score`` is
     called with the HMM file and the query file. Each result is merged
     into ``self.score_dict``. A header whose scoring raises ``IndexError``
     is skipped.

     Returns ``self.score_dict``.
     """
     for position in range(len(self.fasta_hash)):
         headers = list(self.fasta_hash.keys())
         held_out = headers[position]
         others = headers[:position] + headers[position + 1:]
         with tmp.NamedTemporaryFile() as query_file:
             query_record = "\n".join(
                 (held_out.split()[0], self.fasta_hash[held_out]))
             write_to_tempfile(query_file.name, query_record)
             alignment_text = self.generate_msa_string(others)
             with tmp.NamedTemporaryFile() as alignment_file:
                 write_to_tempfile(alignment_file.name, alignment_text)
                 with tmp.NamedTemporaryFile() as model_file:
                     generate_hmm(model_file.name, alignment_file.name)
                     try:
                         fresh_scores = get_phmm_score(
                             model_file.name, query_file.name)
                     except IndexError:
                         # No usable score for this sequence; move on.
                         continue
         # Merge only after all temp files of this round are closed.
         self.score_dict.update(fresh_scores)
     return self.score_dict