Example #1
def best_exonerate_prediction2(region_fasta, query_fasta, dir_path, hmm):
    # write the region sequence to a temp file so exonerate gets a file path
    with tmp.NamedTemporaryFile() as reg_file:
        write_to_tempfile(reg_file.name, region_fasta)
        ex_obj = run_exonerate("-m p2g -E no", "{}.exon_p2g".format(reg_file.name),
                               dir_path, reg_file.name, query_fasta)
        if ex_obj:
            all_proteins = all_proteins_to_fasta_string(ex_obj)
            TP_scores = markov_model_scoring2(all_proteins, hmm)
            if TP_scores:
                # keep every query scoring within 10% of the best HMM hit
                max_score = max(TP_scores.values())
                max_score_header = {
                    header.split(";")[0]
                    for header, score in TP_scores.items()
                    if score >= max_score * 0.90
                }
                fasta_hash = hash_fasta(query_fasta)
                # second, restricted exonerate run with only the best queries
                with tmp.NamedTemporaryFile() as q_file:
                    max_val_fasta = "\n".join(
                        "{}\n{}".format(header, fasta_hash[header])
                        for header in max_score_header)
                    write_to_tempfile(q_file.name, max_val_fasta)
                    ex_name = "{}.exon".format(q_file.name)
                    ex_obj = run_exonerate("-m p2g -E no", ex_name, dir_path,
                                           reg_file.name, q_file.name)
                    return ex_obj
    return None
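
The decisive step above is the score filter: only exonerate proteins whose HMM score reaches at least 90% of the best score make it into the second, restricted exonerate run. A standalone sketch of that filter with made-up scores (the dict mimics what markov_model_scoring2 returns above):

TP_scores = {">q1;1-300": 95.0, ">q2;5-290": 88.0, ">q3;2-310": 40.0}
max_score = max(TP_scores.values())
best_headers = {
    header.split(";")[0]               # drop the coordinate suffix
    for header, score in TP_scores.items()
    if score >= max_score * 0.90       # cutoff here is 85.5
}
print(best_headers)  # {'>q1', '>q2'} - 40.0 falls below the cutoff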
Example #2
class TestHashFasta(unittest.TestCase):

    fa_hash = shared_code_box.hash_fasta(os.path.join(test_data, file_name))

    @patch("build_models.hash_fasta", return_value={">a": 1, ">b": 2})
    @patch("build_models.logger_Filtered", return_value=None)
    def test_check_and_hash_fasta(self, logger, fasta_hash):
        fasta_hash = build_models.check_and_hash_fasta(os.path.join(test_data, file_name), "filename")
        self.assertEqual(fasta_hash, None)

    @patch("build_models.hash_fasta", return_value={})
    @patch("build_models.logger_Filtered", return_value=None)
    def test_check_and_hash_fasta(self, logger, fasta_hash):
        fasta_hash = build_models.check_and_hash_fasta(os.path.join(test_data, file_name), "filename")
        self.assertEqual(fasta_hash, None)

    @patch("build_models.hash_fasta", return_value={">a": 1, ">b": 2, ">c": 3})
    @patch("build_models.logger_Filtered", return_value=None)
    def test_check_and_hash_fasta(self, logger, fasta_hash):
        fasta_hash = build_models.check_and_hash_fasta(os.path.join(test_data, file_name), "filename")
        self.assertEqual(fasta_hash, {">a": 1, ">b": 2, ">c": 3})

    def test_hash_fasta_equal_number_header_seq(self):
        # trivially true for any dict; effectively asserts the file parsed cleanly
        number_seq = len(self.fa_hash.keys())
        number_header = len(self.fa_hash.values())
        self.assertEqual(number_header, number_seq)

    def test_hash_fasta_number_of_entries(self):
        number_entries = len(self.fa_hash.keys())
        self.assertEqual(number_entries, 190)

    def test_hash_fasta_correct_seq_length(self):
        single_seq = len(self.fa_hash[">AMELL.GB42352-PA"])
        self.assertEqual(single_seq, 942)
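
A side note on the stacked @patch decorators above: unittest.mock applies them bottom-up, so the decorator nearest the method (logger_Filtered here) supplies the first mock argument. A minimal self-contained illustration of that ordering:

import os
from unittest.mock import patch

@patch("os.path.exists", return_value=True)   # top decorator -> last mock argument
@patch("os.path.isfile", return_value=False)  # bottom decorator -> first mock argument
def demo(mock_isfile, mock_exists):
    assert os.path.isfile("anything") is False
    assert os.path.exists("anything") is True

demo()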
Example #3
def check_and_hash_fasta(fasta_file, file_name):
    """Wrapper around hash_fasta that rejects hashes with too few entries."""
    fasta_dict = hash_fasta(fasta_file)
    # an empty hash or one with 2 or fewer sequences is not usable downstream
    if not fasta_dict or len(fasta_dict) <= 2:
        logger_Filtered.warning(
            "INPUT ERROR: '{}' - has 2 or fewer sequences\n".format(file_name))
        return None
    return fasta_dict
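
hash_fasta itself is not shown in these examples, but the tests in Example #2 pin down its contract: a dict mapping each ">header" line to its concatenated sequence. A minimal sketch under that assumption (not the project's actual implementation):

def hash_fasta_sketch(fasta_file):
    # hypothetical stand-in for hash_fasta: ">header" -> sequence string
    parts = {}
    header = None
    with open(fasta_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):
                header = line.split()[0]  # first token only, e.g. ">AMELL.GB42352-PA"
                parts[header] = []
            elif header:
                parts[header].append(line)
    return {h: "".join(seq) for h, seq in parts.items()}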
Example #4
    true_negative_file = args['--orthofinder_files']
    check_programs("hmmsearch", "hmmemit", "hmmbuild", "mafft", "trimal")

    print("\n{}\n# GenePS #\n{}\n\nPreparing Files...\n".format(
        "#" * 10, "#" * 10))
    blast_specID_protID_hitList = defaultdict(lambda: defaultdict(list))
    idPair_2_namePair, namePair_2_idPair = {}, {}
    all_protein_fasta_dict = {}

    # only filled in if true-negative translation files were provided
    if true_negative_file:
        tn_args = parse_true_negative_arg(true_negative_file)
        blast_path, blast_file_set = get_blast_files(tn_args["blast_dir"])
        idPair_2_namePair, namePair_2_idPair = hash_sequence_translation_file(
            tn_args["sequenceIDs"])
        all_protein_fasta_dict = hash_fasta(tn_args["proteins"])
        species_ids = hash_species_translation_file(tn_args["speciesIDs"])
        number_blast_files = hash_all_blast_files(species_ids)
        if number_blast_files != len(species_ids):
            print("\t[!] FATAL ERROR: Not all Blast files could be hashed\n")
            sys.exit(1)
    if keep:
        keep_dir = get_outdir(output_dir, add_dir="intermediate_files")
    log_path = os.path.join(output_dir, "LOG.txt")
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M',
        filename=log_path,
        filemode='w')
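
blast_specID_protID_hitList above is a two-level defaultdict, so blast hits can be appended per species and per protein without creating the intermediate dicts first. A quick illustration of that structure:

from collections import defaultdict

blast_hits = defaultdict(lambda: defaultdict(list))
# both levels are created on first access - no KeyError, no setdefault
blast_hits["species_1"]["prot_A"].append("hit_1")
blast_hits["species_1"]["prot_A"].append("hit_2")
blast_hits["species_2"]["prot_B"].append("hit_3")
print(blast_hits["species_1"]["prot_A"])  # ['hit_1', 'hit_2']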
Example #5
    def load_data_and_initialize_global_variables(self):
        def mod_next():
            # read the next line of the open .GenePS file and split "key: value"
            return next(mg).strip().split(":")

        cluster_count = 0
        input_scope = 0
        error_list = []
        for subdir, dirs, files in os.walk(self.gene_ps_results):
            for file_path_str in files:
                if not file_path_str.endswith(".GenePS"):
                    continue
                group_file = os.path.join(subdir, file_path_str)
                with open(group_file) as mg:
                    # file header: group name and group size
                    group_name = mg.readline().split(":")[1].strip()
                    self.group_names.append(group_name)
                    self.group_to_consensus_file[group_name] = os.path.join(
                        self.gene_ps_results, group_name + ".fa.consensus")
                    self.group_to_original_path[group_name] = group_file
                    self.group_to_group_size[group_name] = int(
                        mg.readline().split(":")[1].strip())
                    input_scope += self.group_to_group_size[group_name]
                    for line in mg:
                        if not line.startswith("#name:"):
                            continue
                        # "#name:" opens a cluster block: name, score cutoff, length range
                        cluster = line.split(":")[1].strip()
                        cluster_count += 1
                        score = mod_next()[1].strip().split(",")
                        self.group_by_cluster_to_score_cutoff[group_name][cluster] = float(score[0])
                        self.group_by_cluster_to_length_range[group_name][cluster] = (
                            mod_next()[1].strip().split(","))
                        tn_hmm_path = os.path.join(self.gene_ps_results, cluster + ".TN_hmm")
                        self.group_by_cluster_to_TN_hmm[group_name][cluster] = (
                            tn_hmm_path if os.path.exists(tn_hmm_path) else None)
                        files_not_found = self.validate_path_files(cluster)
                        if files_not_found:
                            error_list += files_not_found
                            continue
                        self.group_by_cluster_to_hmm[group_name][cluster] = os.path.join(
                            self.gene_ps_results, cluster + ".hmm")
                        fasta_path = os.path.join(self.gene_ps_results, cluster + ".fasta")
                        self.group_by_cluster_to_fasta_file[group_name][cluster] = fasta_path
                        self.group_by_cluster_to_fasta_hash[group_name][cluster] = hash_fasta(fasta_path)
        return self.check_loaded_data(cluster_count, input_scope, error_list)
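
The parser above implies a fixed .GenePS layout: a group-name line, a group-size line, then one block per cluster consisting of a "#name:" line, a score-cutoff line and a length-range line. The key names before the colons are not visible in this excerpt (only the values after ":" are read), so the following is a hypothetical file the code would accept:

group_name: eef_family
group_size: 12
#name: cluster_01
score: 87.5,92.1
range: 300,450
#name: cluster_02
score: 55.0,60.3
range: 120,200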
Example #6
    iterations = int(args['--iterations'])
    repeat_limit = int(args['--repeat_limit'])
    out_dir = args['--output_path']
    out_file = open(out_dir + ".txt", "w")
    # iterations  = how many times the data set is enlarged
    # number_prot = number of proteins added to the data set per iteration
    # iterations * number_prot = total number of protein sequences
    # per iteration, record the average score for each model until
    # iterations * number_prot sequences have been reached, yielding one
    # "average list" per model

    # each repeat step reruns the computation above, giving one average
    # list per repeat step; those lists are then averaged, and the mean,
    # min and max of the averages are plotted for both models

    cluster_hash = hash_fasta(cluster_file)
    x_axis = [number_prot * x for x in range(1, iterations + 1)]

    repeat_step = 1
    seq_header_list = []
    bulk_av_list = []
    iter_av_list = []
    while repeat_step <= repeat_limit:
        av_single_hmm, av_multiple_hmm = compare_models(
            iterations, number_prot)
        print("{} round of {}".format(str(repeat_step), str(repeat_limit)))
        print(av_single_hmm)
        print(av_multiple_hmm)
        bulk_av_list.append(av_single_hmm)
        iter_av_list.append(av_multiple_hmm)
        seq_header_list = []
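
The excerpt ends before the "average the average lists" step the comments describe. A plausible completion, assuming every repeat step yields an average list with one value per iteration (hypothetical helper, not part of the original script):

import numpy as np

def summarize_repeats(av_lists):
    # rows = repeat steps, columns = iterations
    arr = np.array(av_lists)
    return arr.mean(axis=0), arr.min(axis=0), arr.max(axis=0)

# mean/min/max per iteration for each of the two models
bulk_mean, bulk_min, bulk_max = summarize_repeats(bulk_av_list)
iter_mean, iter_min, iter_max = summarize_repeats(iter_av_list)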