Example #1
def config_taxonomy_own_seq(taxonomy_str):
    # Function parses the ID of a user's reference sequence and forms a taxonomy string
    #   if there is a proper taxonomy string in the fasta ID line.
    # A "proper" taxonomy string looks like this:
    #   '[ANYTHING BEFORE] <Domain>;<Phylum>;<Class>;<Order>;<Family>;<Genus>;<species> [ANYTHING AFTER]'
    # Spaces are not allowed. Ranks can be omitted like this
    #   (order and species are missing):
    #   '[ANYTHING BEFORE] <Domain>;<Phylum>;<Class>;;<Family>;<Genus>; [ANYTHING AFTER]'
    # If there is no taxonomy string in the sequence ID, we merely save this ID to the taxonomy file.
    #
    # :param taxonomy_str: taxonomy string to parse;
    # :type taxonomy_str: str;
    #
    # Returns the taxonomy string.

    # Check if `taxonomy_str` matches `proposed_fmt`
    proper_tax_match = re.search(proposed_fmt, taxonomy_str)

    # If there is a match and its taxonomic names are not empty,
    #   take the matched taxonomy string:
    if proper_tax_match is not None and proper_tax_match.group(
            0) != ";" * (len(ranks) - 1):
        taxonomy = proper_tax_match.group(0)
    # Otherwise we will merely use this sequence ID
    else:
        taxonomy = remove_bad_chars(taxonomy_str)
    # end if

    return taxonomy
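
A minimal usage sketch for the function above. The module-level names
`proposed_fmt`, `ranks`, and `remove_bad_chars` are not shown in this excerpt;
the definitions below are illustrative assumptions, not the originals.

import re

# Seven ranks, matching the format described in the comments above.
ranks = ("domain", "phylum", "class", "order", "family", "genus", "species")

# Hypothetical pattern: seven semicolon-separated fields without spaces;
# any field may be empty.
proposed_fmt = r"(?:[^;\s]*;){6}[^;\s]*"

def remove_bad_chars(s):
    # Illustrative stand-in: replace characters that would break TSV output.
    return s.replace(' ', '_').replace('\t', '_')

print(config_taxonomy_own_seq(
    "Bacteria;Proteobacteria;Gammaproteobacteria;;Erwiniaceae;Erwinia;"))
# -> Bacteria;Proteobacteria;Gammaproteobacteria;;Erwiniaceae;Erwinia;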
Example #2
def parse_align_results_xml(xml_text, qual_dict, acc_dict, taxonomy_path):
    # Function parses BLAST XML response and returns tsv lines containing gathered information:
    #   1. Query name.
    #   2. Hit name formatted by 'format_taxonomy_name()' function.
    #   3. Hit accession.
    #   4. Length of query sequence.
    #   5. Length of alignment.
    #   6. Percent of identity.
    #   7. Percent of gaps.
    #   8. E-value.
    #   9. Average quality of a read (if source file is FASTQ).
    #   10. Read accuracy (%) (if source file is FASTQ).
    #
    # :param xml_text: XML text with results of alignment;
    # :type xml_text: str;
    # :param qual_dict: dict, which maps sequence IDs to their quality;
    # :type qual_dict: dict<str: float>;
    # :param acc_dict: dictionary containing accession data of hits;
    # :type acc_dict: dict<str: list<str, int>>;
    # :param taxonomy_path: path to DBM file with taxonomy;
    # :type taxonomy_path: str;
    #
    # Returns list<str>.

    result_tsv_lines = list()

    # /=== Parse BLAST XML response ===/

    root = ElementTree.fromstring(xml_text)  # get tree instance

    # Iterate over "Iteration" and "Iteration_hits" nodes
    for iter_elem, iter_hit in zip(root.iter("Iteration"),
                                   root.iter("Iteration_hits")):
        # "Iteration" node contains query name information
        query_name = sys.intern(iter_elem.find("Iteration_query-def").text)
        query_len = iter_elem.find("Iteration_query-len").text

        avg_quality = qual_dict[query_name]
        if avg_quality != '-':
            miscall_prop = round(10**(avg_quality / -10), 3)
            accuracy = round(100 * (1 - miscall_prop),
                             2)  # expected percent of correctly called bases
            qual_info_to_print = "  Average quality of this read is {}, i.e. accuracy is {}%;\n".format(
                avg_quality, accuracy)
        else:
            # If a FASTA file is being processed, print dashes in the quality columns
            avg_quality = "-"
            accuracy = "-"
            qual_info_to_print = ""
        # end if
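        # A worked example of the formula above (illustrative numbers):
        #   avg_quality = 20  ->  miscall_prop = 10 ** (20 / -10) = 0.01,
        #   accuracy = 100 * (1 - 0.01) = 99.0 (% of correctly called bases).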

        # Check if there are any hits
        chck_h = iter_hit.find("Hit")

        if chck_h is None:
            # If there is no hit for the current sequence
            print(
                "\n{} -- No significant similarity found;\n    Query length - {};"
                .format(query_name, query_len))
            result_tsv_lines.append('\t'.join(
                (query_name, "No significant similarity found", "-", query_len,
                 "-", "-", "-", "-", str(avg_quality), str(accuracy))))
        else:
            # If there are any hits, the "Iteration_hits" node contains at least one "Hit" child.
            # Get the top bitscore and iterate over the hits that share it (i.e. the highest bitscore):
            top_bitscore = next(
                chck_h.find("Hit_hsps").iter("Hsp")).find("Hsp_bit-score").text

            annotations = list()
            hit_accs = list()

            for hit in iter_hit:

                # Find the first HSP
                hsp = next(hit.find("Hit_hsps").iter("Hsp"))

                if hsp.find("Hsp_bit-score").text != top_bitscore:
                    break
                # end if

                # Get full hit name (e.g. "Erwinia amylovora strain S59/5, complete genome")
                hit_def = remove_bad_chars(hit.find("Hit_def").text)
                annotations.append(hit_def)

                curr_acc = sys.intern(hit.find("Hit_accession").text)
                hit_accs.append(curr_acc)  # get hit accession

                # Get taxonomy
                find_taxonomy(curr_acc, hit_def, taxonomy_path)

                # Update accession dictionary
                try:
                    acc_dict[curr_acc][1] += 1
                except KeyError:
                    acc_dict[curr_acc] = [hit_def, 1]
                # end try

                align_len = hsp.find("Hsp_align-len").text.strip()
                pident = hsp.find(
                    "Hsp_identity").text  # get number of matched nucleotides
                gaps = hsp.find("Hsp_gaps").text  # get number of gaps

                evalue = hsp.find("Hsp_evalue").text  # get e-value
                pident_ratio = round(float(pident) / int(align_len) * 100, 2)
                gaps_ratio = round(float(gaps) / int(align_len) * 100, 2)
            # end for

            # Separate annotations and accessions with '&&'
            annotations = '&&'.join(annotations)
            hit_accs = '&&'.join(hit_accs)

            print("""\n{} - {}
  Query length - {} nt;
  Identity - {}/{} ({}%); Gaps - {}/{} ({}%);""".format(
                query_name, annotations, query_len, pident, align_len,
                pident_ratio, gaps, align_len, gaps_ratio))

            # Append new tsv line containing recently collected information
            result_tsv_lines.append('\t'.join(
                (query_name, annotations, hit_accs, query_len, align_len,
                 pident, gaps, evalue, str(avg_quality), str(accuracy))))

        # end if
        printn(qual_info_to_print)
    # end for

    return result_tsv_lines
Example #3
def parse_align_results_xml(xml_text, qual_dict):
    # Function parses BLAST XML response and returns tsv lines containing gathered information:
    #     1. Query name.
    #     2. Hit name formatted by 'format_taxonomy_name()' function.
    #     3. Hit accession.
    #     4. Length of query sequence.
    #     5. Length of alignment.
    #     6. Percent of identity.
    #     7. Percent of gaps.
    #     8. E-value.
    #     9. Average Phred33 quality of a read (if source file is FASTQ).
    #     10. Read accuracy (%) (if source file is FASTQ).
    #
    # :param xml_text: XML text with results of alignment;
    # :type xml_text: str;
    # :param qual_dict: dict, which maps sequence IDs to their quality;
    # :type qual_dict: dict<str: float>;
    #
    # Returns list<str>.

    result_tsv_lines = list()

    # /=== Parse BLAST XML response ===/

    root = ElementTree.fromstring(xml_text)  # get tree instance

    # Iterate over "Iteration" and "Iteration_hits" nodes
    for iter_elem, iter_hit in zip(root.iter("Iteration"),
                                   root.iter("Iteration_hits")):

        # "Iteration" node contains query name information
        query_name = iter_elem.find("Iteration_query-def").text
        query_len = iter_elem.find("Iteration_query-len").text

        avg_quality = qual_dict[query_name]
        if avg_quality != '-':
            miscall_prop = round(10**(avg_quality / -10), 3)
            accuracy = round(100 * (1 - miscall_prop),
                             2)  # expected percent of correctly called bases
        else:
            # If a FASTA file is being processed, print dashes in the quality columns
            avg_quality = "-"
            accuracy = "-"
        # end if

        # Check if there are any hits
        chck_h = iter_hit.find("Hit")

        if chck_h is None:
            # If there is no hit for current sequence
            result_tsv_lines.append('\t'.join(
                (query_name, "No significant similarity found", "-", query_len,
                 "-", "-", "-", "-", str(avg_quality), str(accuracy))))
        else:
            # If there are any hits, the "Iteration_hits" node contains at least one "Hit" child.
            # Get the top bitscore and iterate over the hits that share it (i.e. the highest bitscore):
            top_bitscore = next(
                chck_h.find("Hit_hsps").iter("Hsp")).find("Hsp_bit-score").text

            annotations = list()
            hit_accs = list()

            for hit in iter_hit:

                # Find the first HSP (we need only the first one)
                hsp = next(hit.find("Hit_hsps").iter("Hsp"))

                if hsp.find("Hsp_bit-score").text != top_bitscore:
                    break
                # end if

                curr_acc = sys.intern(
                    hit.find("Hit_accession").text)  # get hit accession
                hit_accs.append(curr_acc)

                # Get full hit name (e.g. "Erwinia amylovora strain S59/5, complete genome")
                hit_def = remove_bad_chars(hit.find("Hit_def").text)
                annotations.append(hit_def)

                align_len = hsp.find("Hsp_align-len").text.strip()
                pident = hsp.find(
                    "Hsp_identity").text  # get number of matched nucleotides
                gaps = hsp.find("Hsp_gaps").text  # get number of gaps

                evalue = hsp.find("Hsp_evalue").text  # get e-value
            # end for

            # Separate annotations and accessions with '&&'
            annotations = '&&'.join(annotations)
            hit_accs = '&&'.join(hit_accs)

            # Append new tsv line containing recently collected information
            result_tsv_lines.append('\t'.join(
                (query_name, annotations, hit_accs, query_len, align_len,
                 pident, gaps, evalue, str(avg_quality), str(accuracy))))
        # end if
    # end for

    return result_tsv_lines
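
A minimal sketch of how this function might be driven. The XML below is a
hypothetical, stripped-down imitation of a BLAST response with a single query
and no hits; real BLAST XML contains many more elements.

import sys
from xml.etree import ElementTree

xml_text = """<BlastOutput><BlastOutput_iterations>
  <Iteration>
    <Iteration_query-def>read_1</Iteration_query-def>
    <Iteration_query-len>1450</Iteration_query-len>
    <Iteration_hits></Iteration_hits>
  </Iteration>
</BlastOutput_iterations></BlastOutput>"""

qual_dict = {"read_1": 21.5}  # average Phred33 quality of the read

for line in parse_align_results_xml(xml_text, qual_dict):
    print(line)
# -> 'read_1\tNo significant similarity found\t-\t1450\t-\t-\t-\t-\t21.5\t99.3'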
Example #4
def build_local_db(tax_annot_res_dir, acc_fpath, your_own_fasta_lst,
                   accs_to_download, use_index):
    # Function creates a database with utilities from the 'blast+' toolkit
    #     according to acc_dict and your_own_fasta_lst.
    #
    # :param tax_annot_res_dir: path to the current result directory
    #   (each processed file has its own result directory);
    # :type tax_annot_res_dir: str;
    # :param acc_fpath: path to file "hits_to_download.tsv";
    # :type acc_fpath: str;
    # :param your_own_fasta_lst: list of user's fasta files to be included in database;
    # :type your_own_fasta_lst: list<str>;
    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param use_index: whether to use index;
    # :type use_index: str;
    #
    # Returns path to the created database.

    # Path to directory in which database will be placed
    db_dir = os.path.join(tax_annot_res_dir, "local_database")
    # Path to DBM taxonomy file
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")

    try:
        os.makedirs(db_dir)
    except OSError:
        # If this directory already exists

        while True:
            if len(os.listdir(db_dir)) == 0:
                # If db directory is empty -- break and build a database
                break
            else:
                print()
                printlog_info("Database directory is not empty:")
                printlog_info("  `{}`".format(os.path.abspath(db_dir)))
                printlog_info("Here is it's content:")
                for i, fname in enumerate(os.listdir(os.path.abspath(db_dir))):
                    printlog_info(" {}. `{}`".format(i + 1, fname))
                # end for
                reply = input(
                    """\nPress ENTER to start classification using existing database.
Enter 'r' to remove all files in this directory and create the database from the beginning:>>"""
                )

                if reply == "":
                    # Do not build a database, just return path to it.
                    printlog_info("You have chosen to use extant database.")

                    # Return path to DB located in this directory
                    dbpath = next(iter(os.listdir(db_dir)))
                    dbpath = dbpath.partition(".fasta")[0] + dbpath.partition(
                        ".fasta")[1]  # remove all after '.fasta'

                    return os.path.join(db_dir, dbpath)

                elif reply == 'r':

                    printlog_info("You have chosen to rebuild the database.")
                    # Rename old classification files; current data will be written to new ones:
                    old_classif_dirs = filter(
                        lambda d: os.path.exists(
                            os.path.join(d, "classification.tsv")),
                        glob(os.path.join(tax_annot_res_dir, "*")))
                    old_classif_files = tuple(
                        map(lambda f: os.path.join(f, "classification.tsv"),
                            old_classif_dirs))

                    if len(old_classif_files) > 0:
                        print()
                        printlog_info("Renaming old classification files:")
                        for classif_file in old_classif_files:
                            rename_file_verbosely(classif_file)
                        # end for
                    # end if

                    # Empty database directory
                    for file in glob("{}{}*".format(db_dir, os.sep)):
                        os.unlink(file)
                    # end for

                    # Break from the loop in order to build a database
                    break
                else:
                    print("Invalid reply: `{}`\n".format(reply))
                    continue
                # end if
            # end if
        # end while
    # end try

    # It is a dictionary of accessions and record names.
    # Accessions are keys, record names are values.
    acc_dict = configure_acc_dict(acc_fpath, your_own_fasta_lst,
                                  accs_to_download)

    if len(accs_to_download) != 0:
        verify_cl_accessions(accs_to_download, acc_dict)
    # end if

    # Retrieve already existing taxonomy data from taxonomy file
    tax_exist_accs = taxonomy.get_tax_keys(taxonomy_path)

    # If the accession file does not exist and execution has reached this point,
    #   everything is OK: we are building a database from the user's files only.
    if len(acc_dict) != 0:
        print()

        print("""Following sequences (and all replicons related to them)
  will be downloaded from Genbank for further taxonomic classification
  on your local machine:\n""")
        printlog_info(
            "Following sequences (and all replicons related to them) \
will be downloaded from Genbank for further taxonomic classification \
on your local machine:")
        for i, acc in enumerate(acc_dict.keys()):
            printlog_info(" {}. {} - `{}`".format(i + 1, acc, acc_dict[acc]))
        # end for

        search_for_related_replicons(acc_dict)

        printlog_info_time("Completing taxonomy file...")
        for i, acc in enumerate(acc_dict.keys()):
            if acc not in tax_exist_accs:
                taxonomy.find_taxonomy(acc, acc_dict[acc][1], taxonomy_path)
            # end if
            # Accessions can be of different length
            printn("\r{} - {}: {}/{}".format(getwt(), acc, i +
                                             1, len(acc_dict)) + " " * 10 +
                   "\b" * 10)
        # end for
        print()
        printlog_info_time("Taxonomy file is consistent.")
    # end if

    local_fasta = os.path.join(
        db_dir, "local_seq_set.fasta")  # path to downloaded FASTA file

    add_lambda_phage(local_fasta,
                     taxonomy_path)  # add lambda phage control sequence

    retrieve_fastas_by_acc(
        acc_dict, db_dir, local_fasta)  # download main fasta data from GenBank

    # Add 'your own' fasta files to database
    if len(your_own_fasta_lst) != 0:

        # This variable counts sequences from local files.
        # It is necessary to avoid duplicated accessions.
        own_seq_counter = 0

        # Check if these files are assemblies made by SPAdes or a5
        spades_patt = r">NODE_[0-9]+"  # this pattern will match sequence IDs generated by SPAdes
        a5_patt = r">scaffold_[0-9]+"  # this pattern will match sequence IDs generated by a5
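        # For illustration, headers these patterns are meant to match
        #   (typical assembler output):
        #   '>NODE_1_length_497335_cov_14.7'  (SPAdes)
        #   '>scaffold_1'                     (a5)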
        assemblies = list(
        )  # this list will contain paths to assembly files (SPAdes or a5)

        for own_fasta_path in reversed(your_own_fasta_lst):

            how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
            fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]

            with how_to_open(own_fasta_path) as fasta_file:
                first_seq_id = fmt_func(fasta_file.readline(
                ))  # get the first line in file (the first seq ID)
            # end with

            # if we've got SPAdes assembly
            if re.search(spades_patt, first_seq_id) is not None:
                assemblies.append(own_fasta_path)
                # Remove this file from the list -- it will be processed in a specific way
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if

            # if we've got a5 assembly
            if re.search(a5_patt, first_seq_id) is not None:
                assemblies.append(own_fasta_path)
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if
        # end for

        # Include assembly files in the multi-fasta file

        # Find common prefix of all assembly paths and remove it from assembly names
        if len(assemblies) > 1:
            assemblies_formatted = tuple(
                map(lambda f: os.path.abspath(f).replace(os.sep, '-'),
                    assemblies))
            common_prefix = find_common_prefix(assemblies_formatted)
            assemblies_formatted = tuple(
                map(lambda f: f.replace(common_prefix, ''),
                    assemblies_formatted))
        elif len(assemblies) > 0:
            common_prefix = ''
            assemblies_formatted = tuple(map(os.path.basename, assemblies))
        # end if

        # Add assembled sequences to database
        with open(local_fasta, 'a') as fasta_db:
            for i, assm_path in enumerate(assemblies):
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(assm_path)))
                assm_name_fmt = assemblies_formatted[i]

                how_to_open = OPEN_FUNCS[is_gzipped(assm_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(assm_path)]
                with how_to_open(assm_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # See the comments about "OWN_SEQ..." below.
                        # Paths will be written into seq IDs in the following way:
                        #   some-happy-path.fastq--
                        # so that they can be reliably retrieved with a regex later.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            own_def = "{}--".format(
                                assm_name_fmt.replace(common_prefix,
                                                      '')) + line[1:]
                            own_def = remove_bad_chars(own_def)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, own_def)
                            line = ">" + "{} {}".format(own_acc, own_def)
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with

        with open(local_fasta, 'a') as fasta_db:
            for own_fasta_path in your_own_fasta_lst:
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(own_fasta_path)))

                how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]
                with how_to_open(own_fasta_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # 'makeblastdb' considers the first (space-separated) word to be the sequence ID
                        #   and throws an error if there are duplicated IDs.
                        # To prevent such duplication, we create our own sequence IDs:
                        #   'OWN_SEQ_<NUMBER>', written at the beginning of the FASTA record name.
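                        # e.g. a record '>seq1 16S ribosomal RNA' might become
                        #   '>OWN_SEQ_3 seq1 16S ribosomal RNA' (illustrative
                        #   numbering; remove_bad_chars may further alter the name).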
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, line[1:])
                            line = ">" + own_acc + ' ' + remove_bad_chars(
                                line[1:])
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with
    # end if

    # blastn may report 'lcl|ACCESSION...' entries with a '.1'
    #   (or '.2', etc.) suffix.
    # The taxonomy file stores accessions without this suffix.
    # Therefore we prune the accessions in advance.
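    # e.g. a header '>lcl|CP000000.1 Erwinia amylovora, complete genome'
    #   becomes '>lcl|CP000000 <cleaned name>' (accession is hypothetical).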
    print()
    printn("{} - Formatting accessions...".format(getwt()))
    log_info("Formatting accessions...")
    corrected_path = os.path.join(db_dir, "corrected_seqs.fasta")
    with open(local_fasta, 'r') as source_file, open(corrected_path,
                                                     'w') as dest_file:
        for line in source_file:
            if line.startswith('>'):
                line = line.strip()
                acc, seq_name = (line.partition(' ')[0],
                                 line.partition(' ')[2])
                acc = acc.partition('.')[0]
                seq_name = remove_bad_chars(seq_name)
                seq_name = re.sub(r'[^\x00-\x7F]+', '_',
                                  seq_name)  # replace non-ASCII chars
                line = ' '.join((acc, seq_name)) + '\n'
            # end if
            dest_file.write(line)
        # end for
    # end with
    os.unlink(local_fasta)
    os.rename(corrected_path, local_fasta)
    sys.stdout.write("\r{} - Formatting accessions... ok".format(getwt()))
    log_info("Formatting accessions done.")

    # Configure command line
    make_db_cmd = "makeblastdb -in {} -parse_seqids -dbtype nucl".format(
        local_fasta)
    exit_code = os.system(make_db_cmd)  # make a blast-format database
    if exit_code != 0:
        printlog_error_time("Error occured while making the database")
        platf_depend_exit(exit_code)
    # end if

    print("\033[1A{} - Database is successfully created: `{}`\n".format(
        getwt(), local_fasta))
    log_info("Database is successfully created: `{}`".format(local_fasta))

    if use_index == "true":
        printlog_info_time("Database index creating started")
        # Configure command line
        make_index_cmd = "makembindex -input {} -iformat blastdb -verbosity verbose".format(
            local_fasta)
        exit_code = os.system(
            make_index_cmd)  # create an index for the database
        if exit_code != 0:
            printlog_info_time("Error occured while creating database index")
            platf_depend_exit(exit_code)
        # end if

        printlog_info_time("Database index has been successfully created")
    # end if

    # Gzip downloaded FASTA file
    printlog_info_time("Gzipping FASTA file: `{}`".format(local_fasta))

    if gzip_util_found:
        os.system("{} -v {}".format(gzip_util, local_fasta))
    else:
        # form .fasta.gz file 'by hand'
        with open(local_fasta,
                  'rb') as fasta_file, open_as_gzip(local_fasta + ".gz",
                                                    "wb") as fagz_file:
            shutil_copyfileobj(fasta_file, fagz_file)
        # end with
        os.unlink(local_fasta)  # remove source FASTA file, not the database
    # end if

    return local_fasta
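
A hypothetical invocation of the function above (paths and file names are
illustrative; the helper functions it relies on come from the surrounding
package and are not shown in this excerpt):

db_fasta = build_local_db(
    tax_annot_res_dir="results_dir",
    acc_fpath="results_dir/hits_to_download.tsv",
    your_own_fasta_lst=["my_genomes.fasta"],
    accs_to_download=[],
    use_index="true",
)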