Example #1
            def download_waiter(stop_wait):
                """
                Function waits until the 'tmp_fasta' file is downloaded.
                It prints the size of the downloaded data to the console while downloading.
                This function just waits -- it won't bring you the menu :).
                """
                # Wait until downloading starts
                while not os.path.exists(tmp_fasta):
                    if not stop_wait.is_set():
                        return
                    # end if
                    sleep(1)
                # end while

                MB_size = 1024**2  # we will divide by it to get megabytes

                while stop_wait.is_set():
                    # Get size of downloaded data
                    fsize = round(os.path.getsize(tmp_fasta) / MB_size,
                                  1)  # get megabytes
                    printn("\r{} - {} MB downloaded ".format(getwt(), fsize))
                    sleep(1)  # instant updates are not necessary
                # end while

                # Print total size of downloaded file (it can be deleted by this time)
                try:
                    fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1)
                except OSError:
                    # We can ignore this exception -- we do delete this file if downloading crashes
                    # And this function just waits :)
                    pass
                # end try
                printlog_info("\r{} - {} MB downloaded ".format(
                    getwt(), fsize))
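A minimal usage sketch (not taken from the source): `download_waiter` is assumed to run in a separate thread while the main thread downloads `tmp_fasta`; the event passed as `stop_wait` is set while downloading should continue and cleared to stop the waiter. The download call below is hypothetical.

import threading

stop_wait = threading.Event()
stop_wait.set()  # the waiter keeps reporting progress while the event is set

waiter = threading.Thread(target=download_waiter, args=(stop_wait,))
waiter.start()

try:
    download_reference_fasta(tmp_fasta)  # hypothetical download function
finally:
    stop_wait.clear()  # tell the waiter to finish
    waiter.join()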
Example #2
def verify_taxids(taxid_list):
    # Function verifies TaxIDs passed to prober with the `-g` option.
    # Function requests the NCBI Taxonomy Browser and parses the organism's name from the HTML response.
    # Moreover, this function configures the `organisms` list, which will be included in BLAST submissions.
    #
    # :param taxid_list: list of TaxIDs. TaxIDs are strings, but they are verified to be integers
    #     during CL argument parsing;
    # :type taxid_list: list<str>;
    #
    # Returns a list of strings of the following format: "<tax_name> (taxid:<TaxID>)"

    organisms = list()
    if len(taxid_list) > 0:

        printlog_info("Verifying TaxIDs:")
        for taxid in taxid_list:
            printn("   {} - ".format(taxid))
            try:
                tax_resp = lingering_https_get_request(
                    "www.ncbi.nlm.nih.gov",
                    "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}".format(
                        taxid), "taxonomy")
                tax_name = re.search(r"Taxonomy browser \((.+?)\)",
                                     tax_resp).group(1)
            except AttributeError:
                printlog_error("\aError: TaxID not found")
                printlog_error(
                    "Please, check your TaxID: https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi"
                )
                platf_depend_exit(1)
            except OSError as oserr:
                printlog_error("Something is wrong with connection:")
                printlog_error(str(oserr))
                platf_depend_exit(-2)
            else:
                print(tax_name)
                log_info("{} - {}".format(taxid, tax_name))
                organisms.append("{} (taxid:{})".format(tax_name, taxid))
            # end try
        # end for
        print('-' * 30 + '\n')

    # end if
    return organisms
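A hedged usage sketch: the TaxIDs below are illustrative. Each returned element has the form "<tax_name> (taxid:<TaxID>)" and is later included in BLAST submissions.

taxid_list = ["561", "1280"]  # hypothetical TaxIDs passed via `-g`
organisms = verify_taxids(taxid_list)
# organisms is now a list like ["<tax_name> (taxid:561)", "<tax_name> (taxid:1280)"]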
Example #3
def launch_single_thread_binning(fpath_list, binning_func, tax_annot_res_dir,
                                 sens, min_qual, min_qlen, min_pident,
                                 min_coverage, no_trash):
    # Function launches single-thread binning, performed by the function 'binning_func'.
    #
    # :param fpath_list: list of paths to files to process;
    # :type fpath_list: list<str>;
    # :param binning_func: function that performs binning;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;

    res_stats = list()
    num_files_total = len(fpath_list)

    # Sort files in single thread:
    for i, fq_fa_path in enumerate(fpath_list):
        res_stats.append(
            binning_func(fq_fa_path, tax_annot_res_dir, sens, min_qual,
                         min_qlen, min_pident, min_coverage, no_trash))
        sys.stdout.write('\r')
        printlog_info_time("File #{}/{} `{}` is binned."\
            .format(i+1, num_files_total, os.path.basename(fq_fa_path)))
        printn(" Working...")
    # end for

    return res_stats
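A hedged usage sketch: `bin_fastqa_file` refers to the single-thread binning function shown in a later example; the file names, output directory and filter thresholds below are illustrative.

res_stats = launch_single_thread_binning(
    fpath_list=["reads_1.fastq", "reads_2.fastq"],  # hypothetical input files
    binning_func=bin_fastqa_file,
    tax_annot_res_dir="barapost_result",            # hypothetical annotation dir
    sens="genus",                                   # illustrative sensitivity value
    min_qual=10.0,
    min_qlen=None,       # length filter disabled
    min_pident=None,     # identity filter disabled
    min_coverage=None,   # coverage filter disabled
    no_trash=False,
)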
Example #4
def wait_for_align(rid, rtoe, pack_to_send, filename):
    # Function waits until the BLAST server completes the request.
    #
    # :param rid: Request ID to wait for;
    # :type rid: str;
    # :param rtoe: time in seconds estimated by BLAST server needed to accomplish the request;
    # :type rtoe: int;
    # :param pack_to_send: single-element container holding the current packet number to send (accessed as pack_to_send[0]);
    # :type pack_to_send: list<int>;
    # :param filename: basename of current FASTA file;
    # :type filename: str
    #
    # Returns a tuple: (XML response text or None, BlastError).

    print()
    print("Requesting for current query status. Request ID: {}".format(rid))
    print(" `{}`; Submission #{}".format(filename, pack_to_send[0]))
    log_info("Requesting for current query status.")
    log_info("Request ID: {}; `{}`; Submission #{}".format(
        rid,
        filename,
        pack_to_send[0],
    ))
    # RTOE can be zero at the very beginning of resumption
    if rtoe > 0:

        printlog_info_time(
            "BLAST server estimates that alignment will be accomplished in {} seconds"
            .format(rtoe))
        printlog_info_time(
            "Waiting for {}+3 (+3 extra) seconds...".format(rtoe))
        # Server might be wrong -- we will give it 3 extra seconds
        sleep(rtoe + 3)
        printlog_info_time(
            "{} seconds have passed. Checking if alignment is accomplished...".
            format(rtoe + 3))
    # end if

    server = "blast.ncbi.nlm.nih.gov"
    wait_url = "/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=" + rid

    whtspc_len = 6 + len("(requesting)")

    while True:
        resp_content = lingering_https_get_request(server, wait_url,
                                                   "BLAST response")

        # if server asks to wait
        if "Status=WAITING" in resp_content:
            printn("\r{} - The request is being processed. Waiting{}{}".format(
                getwt(), ' ' * whtspc_len, "\033[%dD" % whtspc_len))
            # print a dot every 10 seconds while waiting
            for i in range(1, 7):
                sleep(10)
                printn(
                    "\r{} - The request is being processed. Waiting{}".format(
                        getwt(), '.' * i))
            # end for
            printn("(requesting)")
            continue
        elif "Status=FAILED" in resp_content:
            # if job failed
            print()
            printlog_info_time("Job failed\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(2)
        elif "Status=UNKNOWN" in resp_content:
            # if job expired
            print()
            printlog_info_time("Job expired\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(1)
        # if results are ready
        elif "Status=READY" in resp_content:
            print()
            printlog_info("Result for query `{}` #{} is ready!".format(
                filename, pack_to_send[0]))
            # if there are hits
            if "ThereAreHits=yes" in resp_content:
                for i in range(15, 0, -5):
                    print('-' * i)
                # end for
                print("-\nRetrieving results...")

                # Retrieve human-readable text and put it into result directory
                retrieve_text_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=Text&DESCRIPTIONS=1&ALIGNMENTS=1&RID=" + rid
                txt_align_res = lingering_https_get_request(
                    server, retrieve_text_url,
                    "text version of BLAST response")

                # Count already existing plain text files in outdir:
                is_txt_response = lambda f: not re.search(
                    r"prober_blast_response_[0-9]+\.txt", f) is None
                outdir_path = os.path.dirname(logging.getLoggerClass(
                ).root.handlers[0].baseFilename)  # tricky trick
                response_num = len(
                    tuple(filter(is_txt_response, os.listdir(outdir_path))))

                # Current txt response file will have number `response_num+1`
                txt_hpath = os.path.join(
                    outdir_path,
                    "prober_blast_response_{}.txt".format(response_num + 1))
                # Write text result for a human to read
                with open(txt_hpath, 'w') as txt_file:
                    txt_file.write(txt_align_res)
                # end with
            elif "ThereAreHits=no" in resp_content:
                # if there are no hits
                printlog_info_time("There are no hits. It happens.\n")
            else:
                # the job has probably failed if execution reaches here
                print()
                printlog_info_time("Job failed\a\n")
                printlog_info("Resending this packet.")
                return None, BlastError(2)
            # end if
            break
        # end if
        # Execution should not reach here
        printlog_error_time(
            "Fatal error (-122). Please contact the developer.\a\n")
        platf_depend_exit(-122)
    # end while

    # Retrieve XML result
    retrieve_xml_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=XML&ALIGNMENTS=1&RID=" + rid
    xml_text = lingering_https_get_request(server, retrieve_xml_url,
                                           "XML BLAST response")

    if "Bad Gateway" in xml_text:
        print()
        printlog_info_time("Bad Gateway. Data from last packet has been lost.")
        printlog_info("Resending this packet.")
        return None, BlastError(1)

    elif "Status=FAILED" in xml_text:
        print()
        printlog_info_time("BLAST error: request failed")
        printlog_info("Resending this packet.")
        return None, BlastError(2)

    elif "to start it again" in xml_text:
        print()
        printlog_info_time("BLAST error")
        printlog_info("Resending this packet.")
        return None, BlastError(2)

    elif "[blastsrv4.REAL]" in xml_text:
        blastsrv4_match = re.search(r"(\[blastsrv4\.REAL\].*\))", xml_text)
        blastsrv4_str = "" if blastsrv4_match is None else ": {}".format(
            blastsrv4_match.group(1))
        printlog_info_time("BLAST server error{}".format(blastsrv4_str))
        # Error code 2 indicates that we need to split the packet and resubmit it
        return None, BlastError(2)
    # end if

    return xml_text, BlastError(0)
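A hedged sketch of how a caller might handle the returned pair: the function returns None together with a non-zero BlastError whenever the packet has to be resent, so checking the XML text against None is sufficient here. The resubmission logic itself is not shown in the source.

xml_text, error = wait_for_align(rid, rtoe, pack_to_send, filename)

if xml_text is None:
    # The packet was lost or the job failed/expired: resend it
    # (and split it, if the server reported a [blastsrv4.REAL] error).
    pass
else:
    # The XML response is ready to be parsed into TSV lines.
    pass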
Example #5
def bin_fastqa_file(fq_fa_lst, tax_annot_res_dir, sens, n_thr, min_qual,
                    min_qlen, min_pident, min_coverage, no_trash):
    # Function for parallel binning of FASTQ and FASTA files.
    # Actually bins multiple files.
    #
    # :param fq_fa_lst: list of paths to FASTQ (or FASTA) files meant to be processed;
    # :type fq_fa_lst: list<str>;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param n_thr: number of threads to launch;
    # :type n_thr: int;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    for fq_fa_path in fq_fa_lst:

        new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
        tsv_res_fpath = get_res_tsv_fpath(new_dpath)
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy",
                                     "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_res_fpath, sens,
                                                taxonomy_path)

        # Configure record generator and write function
        if is_fastq(fq_fa_path):
            seq_records_generator = fastq_records
            write_fun = write_fastq_record
        else:
            seq_records_generator = fasta_records
            write_fun = write_fasta_record
        # end if

        # Make filter for quality and length
        QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
        # Configure path to trash file
        if not no_trash:
            QL_trash_fpath = get_QL_trash_fpath(
                fq_fa_path,
                outdir_path,
                min_qual,
                min_qlen,
            )
        else:
            QL_trash_fpath = None
        # end if

        # Make filter for identity and coverage
        align_filter = get_align_filter(min_pident, min_coverage)
        # Configure path to this trash file
        if not no_trash:
            align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                      min_pident, min_coverage)
        else:
            align_trash_fpath = None
        # end if

        # Create an iterator that will yield records
        seq_records_iterator = iter(seq_records_generator(fq_fa_path))
        # Dict for storing batches of sequences meant to be written to output files:
        to_write = dict()
        stop = False  # for outer while-loop

        while not stop:

            # Extract batch of records of 'n_thr' size and find their destination paths:
            for _ in range(n_thr):

                try:
                    fastqa_rec = next(seq_records_iterator)
                except StopIteration:
                    stop = True  # for outer while-loop
                    break
                # end try

                read_name = sys.intern(fmt_read_id(
                    fastqa_rec["seq_id"])[1:])  # get ID of the sequence

                try:
                    hit_names, *vals_to_filter = resfile_lines[
                        read_name]  # find hit corresponding to this sequence
                except KeyError:
                    printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                        .format(read_name))
                    printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
                    printlog_error(
                        "Make sure that this read has been already processed by \
`barapost-prober.py` and `barapost-local.py`.")
                    platf_depend_exit(1)
                # end try

                # If read is found in TSV file:
                if not QL_filter(vals_to_filter):
                    # Place this sequence to QL trash file
                    to_write[read_name] = (fastqa_rec, QL_trash_fpath)
                    QL_seqs_fail += 1
                elif not align_filter(vals_to_filter):
                    # Place this sequence into the align trash file
                    to_write[read_name] = (fastqa_rec, align_trash_fpath)
                    align_seqs_fail += 1
                else:
                    for hit_name in hit_names.split("&&"):
                        # Get name of result FASTQ file to write this read in
                        binned_file_path = os.path.join(
                            outdir_path, "{}.fast{}".format(
                                hit_name,
                                'q' if is_fastq(fq_fa_path) else 'a'))
                        to_write[read_name] = (fastqa_rec, binned_file_path)
                    # end for
                    seqs_pass += 1
                # end if
            # end for

            # Write batch of records to output files:
            with write_lock:
                for record, fpath in to_write.values():
                    write_fun(fpath, record)
                # end for
            # end with
            to_write.clear()
        # end while

        with write_lock:
            # Write the rest of 'uneven' data to output files:
            if len(to_write) != 0:
                for record, fpath in to_write.values():
                    write_fun(fpath, record)
                # end for
            # end if
            sys.stdout.write('\r')
            printlog_info_time("File `{}` is binned.".format(
                os.path.basename(fq_fa_path)))
            printn(" Working...")
        # end with
    # end for

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
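A hedged sketch of how this parallel variant might be driven: each worker processes its own slice of the input files and all workers share the `write_lock` referenced above. The pool setup and the lock initialization below are assumptions, not taken from the source.

import threading
from concurrent.futures import ThreadPoolExecutor

write_lock = threading.Lock()  # assumed to be the lock used inside bin_fastqa_file

file_chunks = [["reads_1.fastq"], ["reads_2.fastq"]]  # illustrative split of input files
n_thr = 2

with ThreadPoolExecutor(max_workers=len(file_chunks)) as pool:
    futures = [
        pool.submit(bin_fastqa_file, chunk, "barapost_result", "genus", n_thr,
                    10.0, None, None, None, False)
        for chunk in file_chunks
    ]
    stats = [f.result() for f in futures]  # list of (seqs_pass, QL_seqs_fail, align_seqs_fail)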
Example #6
def parse_align_results_xml(xml_text, qual_dict, acc_dict, taxonomy_path):
    # Function parses the BLAST XML response and returns TSV lines containing gathered information:
    #   1. Query name.
    #   2. Hit name formatted by 'format_taxonomy_name()' function.
    #   3. Hit accession.
    #   4. Length of query sequence.
    #   5. Length of alignment.
    #   6. Percent of identity.
    #   7. Percent of gaps.
    #   8. E-value.
    #   9. Average quality of a read (if source file is FASTQ).
    #   10. Read accuracy (%) (if source file is FASTQ).
    #
    # :param xml_text: XML text with results of alignment;
    # :type xml_text: str;
    # :param qual_dict: dict, which maps sequence IDs to their quality;
    # :type qual_dict: dict<str: float>;
    # :param acc_dict: dictionary containing accession data of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;
    # :param taxonomy_path: path to DBM file with taxonomy;
    # :type taxonomy_path: str;
    #
    # Returns list<str>.

    result_tsv_lines = list()

    # /=== Parse BLAST XML response ===/

    root = ElementTree.fromstring(xml_text)  # get tree instance

    # Iterate over "Iteration" and "Iteration_hits" nodes
    for iter_elem, iter_hit in zip(root.iter("Iteration"),
                                   root.iter("Iteration_hits")):
        # "Iteration" node contains query name information
        query_name = sys.intern(iter_elem.find("Iteration_query-def").text)
        query_len = iter_elem.find("Iteration_query-len").text

        avg_quality = qual_dict[query_name]
        if avg_quality != '-':
            miscall_prop = round(10**(avg_quality / -10), 3)
            accuracy = round(100 * (1 - miscall_prop),
                             2)  # expected percent of correctly called bases
            qual_info_to_print = "  Average quality of this read is {}, i.e. accuracy is {}%;\n".format(
                avg_quality, accuracy)
        else:
            # If a FASTA file is being processed, print dashes in the quality columns
            avg_quality = "-"
            accuracy = "-"  # expected percent of correctly called bases
            qual_info_to_print = ""
        # end if

        # Check if there are any hits
        chck_h = iter_hit.find("Hit")

        if chck_h is None:
            # If there is no hit for current sequence
            print(
                "\n{} -- No significant similarity found;\n    Query length - {};"
                .format(query_name, query_len))
            result_tsv_lines.append('\t'.join(
                (query_name, "No significant similarity found", "-", query_len,
                 "-", "-", "-", "-", str(avg_quality), str(accuracy))))
        else:
            # If there are any hits, the "Iteration_hits" node contains at least one "Hit" child
            # Get the top bit score and iterate over hits that share it (i.e. the highest bit score):
            top_bitscore = next(
                chck_h.find("Hit_hsps").iter("Hsp")).find("Hsp_bit-score").text

            annotations = list()
            hit_accs = list()

            for hit in iter_hit:

                # Find the first HSP
                hsp = next(hit.find("Hit_hsps").iter("Hsp"))

                if hsp.find("Hsp_bit-score").text != top_bitscore:
                    break
                # end if

                # Get full hit name (e.g. "Erwinia amylovora strain S59/5, complete genome")
                hit_def = remove_bad_chars(hit.find("Hit_def").text)
                annotations.append(hit_def)

                curr_acc = sys.intern(hit.find("Hit_accession").text)
                hit_accs.append(curr_acc)  # get hit accession

                # Get taxonomy
                find_taxonomy(curr_acc, hit_def, taxonomy_path)

                # Update accession dictionary
                try:
                    acc_dict[curr_acc][1] += 1
                except KeyError:
                    acc_dict[curr_acc] = [hit_def, 1]
                # end try

                align_len = hsp.find("Hsp_align-len").text.strip()
                pident = hsp.find(
                    "Hsp_identity").text  # get number of matched nucleotides
                gaps = hsp.find("Hsp_gaps").text  # get number of gaps

                evalue = hsp.find("Hsp_evalue").text  # get e-value
                pident_ratio = round(float(pident) / int(align_len) * 100, 2)
                gaps_ratio = round(float(gaps) / int(align_len) * 100, 2)
            # end for

            # Divide annotations and accessions with '&&'
            annotations = '&&'.join(annotations)
            hit_accs = '&&'.join(hit_accs)

            print("""\n{} - {}
  Query length - {} nt;
  Identity - {}/{} ({}%); Gaps - {}/{} ({}%);""".format(
                query_name, annotations, query_len, pident, align_len,
                pident_ratio, gaps, align_len, gaps_ratio))

            # Append new tsv line containing recently collected information
            result_tsv_lines.append('\t'.join(
                (query_name, annotations, hit_accs, query_len, align_len,
                 pident, gaps, evalue, str(avg_quality), str(accuracy))))

        # end if
        printn(qual_info_to_print)
    # end for

    return result_tsv_lines
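The quality and accuracy columns are derived from the average Phred quality exactly as in the function above. A worked example of the conversion (the quality value is illustrative):

avg_quality = 20
miscall_prop = round(10 ** (avg_quality / -10), 3)  # expected miscall probability: 0.01
accuracy = round(100 * (1 - miscall_prop), 2)       # expected accuracy: 99.0 %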
Example #7
def _reformat_legacy_file(legacy_tax_path):

    import shelve

    # Check if this file is corrupted
    try:
        with shelve.open(legacy_tax_path, 'r') as tax_file:
            pass
        # end with
    except OSError as err:
        printlog_error("Legacy taxonomy file appears to be corrupted.")
        printlog_error("This error might be fatal.")
        str_err = str(err)
        if "dbm.gnu" in str_err and "module is not" in str_err:
            printlog_error("Installing `python3-gdbm` might solve this problem.")
        else:
            printlog_error("The program can't recover taxonomy from the broken file.")
            printlog_error("Seems, you have to annotate your sequences again.")
            printlog_error("Sorry for that :(")
        # end if
        platf_depend_exit(1)
    # end try

    new_tax_path = "{}.tsv".format(legacy_tax_path)

    taxonomy.init_tax_file(new_tax_path)

    printn("Reformatting: `{}` ->".format(legacy_tax_path))
    log_info("Reformatting: `{}` ->".format(legacy_tax_path))

    with shelve.open(legacy_tax_path, 'r') as old_tax_file, open(new_tax_path, 'w') as new_tax_file:
        for acc, taxonomy_from_file in old_tax_file.items():
            if isinstance(taxonomy_from_file, tuple):
                tax_str = taxonomy.config_taxonomy_str(taxonomy_from_file)
                new_tax_file.write("{}\n".format('\t'.join( (acc, tax_str) )))
            elif isinstance(taxonomy_from_file, str):
                new_tax_file.write("{}\n".format('\t'.join( (acc, taxonomy_from_file) )))
            else:
                # Execution must not reach here
                printlog_error_time("Fatal error 8755.")
                printlog_error("Please, contact the developer.")
                platf_depend_exit(8755)
            # end if
        # end for
    # end with

    printlog_info(" `<same_dir>/{}`".format(os.path.basename(new_tax_path)))

    try:
        renamed_legacy_file = "{}_deprecated".format(legacy_tax_path)
        os.rename(legacy_tax_path, renamed_legacy_file)
    except OSError as err:
        printlog_error_time("Cannot rename legacy taxonomy file `{}`:".format(legacy_tax_path))
        printlog_error(str(err))
        printlog_error("But it's not a problem -- we will proceed with our work.")
    else:
        printlog_info("Renamed: `{}` -> `<same_dir>/{}`".format(legacy_tax_path,
            os.path.basename(renamed_legacy_file)))
    # end try

    printlog_info("Legacy taxonomy file is reformatted to TSV format.")
Example #8
def process_paral(fq_fa_list, packet_size, tax_annot_res_dir, blast_algorithm,
                  use_index, db_path, nfiles):
    # Function performs 'many_files'-parallel mode of barapost-local.py.

    # :param fq_fa_list: list of paths to FASTA and FASTQ files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param packet_size: number of sequences processed by blast in a single launching;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logical value indicating whether to use the index;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;
    # :param nfiles: total number of files;
    # :type nfiles: int;

    queries_tmp_dir = os.path.join(tax_annot_res_dir, "queries-tmp")

    # Iterate over source FASTQ and FASTA files
    for fq_fa_path in fq_fa_list:

        # Create the result directory with the name of the FASTQ or FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extention)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$",
                                 infile_hname).group(1)

        # Look around and check if there are results of previous runs of this script
        # If 'look_around' returns None -- there is no data from a previous run
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None:  # If there is no data from previous run
            num_done_seqs = 0  # number of successfully processed sequences
            tsv_res_path = os.path.join(
                new_dpath, "classification.tsv")  # form result tsv file path
        else:  # if there is data from previous run
            num_done_seqs = previous_data[
                "n_done_reads"]  # get number of successfully processed sequences
            tsv_res_path = previous_data[
                "tsv_respath"]  # result tsv file sholud be the same as during previous run
        # end if

        how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(
                1
                for line in how_to_open(fq_fa_path)) // 4  # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(
                    tuple(
                        filter(
                            lambda l: True if l.startswith('>') else False,
                            map(fmt_func,
                                how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                with print_lock:
                    print()
                    printlog_warning("Warning: current file is broken: {}."\
                        .format(str(err)))
                    printlog_warning("File: `{}`".format(
                        os.path.abspath(fq_fa_path)))
                    printlog_warning("This file will not be processed.")
                    continue
                # end with
            # end try
        # end if

        if num_seqs == num_done_seqs:
            with counter_lock:
                file_counter.value += 1
                i = file_counter.value  # save to local var and release lock
            # end with
            with print_lock:
                sys.stdout.write('\r')
                printlog_info_time("File #{}/{} (`{}`) has been already completely processed.".\
                    format(i, nfiles, fq_fa_path))
                printlog_info("Omitting it.")
                printn("Working...")
            # end with
            continue
        # end if

        for packet in packet_generator(fq_fa_path, packet_size, num_done_seqs):

            # Blast the packet
            align_xml_text = launch_blastn(packet["fasta"], blast_algorithm,
                                           use_index, queries_tmp_dir, db_path)

            # Configure result TSV lines
            result_tsv_lines = parse_align_results_xml(align_xml_text,
                                                       packet["qual"])

            # Write the result to tsv
            write_classification(result_tsv_lines, tsv_res_path)
        # end for

        with counter_lock:
            file_counter.value += 1
            i = file_counter.value  # save to local var and release lock
        # end with
        with print_lock:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) is processed.".\
                format(i, nfiles, os.path.basename(fq_fa_path)))
            printn("Working...")
        # end with
    # end for

    query_fpath = os.path.join(queries_tmp_dir,
                               "query{}_tmp.fasta".format(os.getpid()))
    remove_tmp_files(query_fpath)
Example #9
def map_f5reads_2_taxann(f5_fpaths, tsv_taxann_lst, tax_annot_res_dir):
    # Function performs mapping of all reads stored in input FAST5 files
    #     to existing TSV files containing taxonomic annotation info.
    #
    # It creates a DBM index file.
    #
    # :param f5_fpaths: list of paths to FAST5 files meant to be processed;
    # :type f5_fpaths: list<str>;
    # :param tsv_taxann_lst: list of path to TSV files that contain taxonomic annotation;
    # :type tsv_taxann_lst: list<str>;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;

    index_dirpath = os.path.join(
        tax_annot_res_dir,
        index_name)  # name of directory that will contain indices

    for f5_path in f5_fpaths:
        # File validation:
        #   RuntimeError will be raised if FAST5 file is broken.
        try:
            # File existence checking is performed while parsing CL arguments.
            # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file.
            if not h5py.is_hdf5(f5_path):
                raise RuntimeError(
                    "file is not of HDF5 (i.e. not FAST5) format")
            # end if

            f5_file = h5py.File(f5_path, 'r')

            for _ in f5_file:
                break
            # end for
        except RuntimeError as runterr:
            with print_lock:
                printlog_error_time("Error: FAST5 file is broken")
                printlog_error("Reading the file `{}` failed.".format(
                    os.path.basename(f5_path)))
                printlog_error("Reason: {}".format(str(runterr)))
                printlog_error("Omitting this file...")
                print()
            # end with
            return
        # end try

        readids_to_seek = list(fast5_readids(f5_file))
        idx_dict = dict()  # dictionary for index

        # This value is saved to compare against 'len(readids_to_seek)'
        #    after all TSV files have been scanned, in order to
        #    determine whether some reads lack taxonomic annotation.
        len_before = len(readids_to_seek)

        # Iterate over TSV-taxann file
        for tsv_taxann_fpath in tsv_taxann_lst:

            with open(tsv_taxann_fpath, 'r') as taxann_file:

                # Get all read IDs in current TSV
                readids_in_tsv = list(
                    map(lambda l: l.split('\t')[0], taxann_file.readlines()))

                # Iterate over all remaining reads in the current FAST5
                #    ('reversed' is necessary because we remove items from list in this loop)
                for readid in reversed(readids_to_seek):
                    fmt_id = fmt_read_id(readid)[1:]
                    if fmt_id in readids_in_tsv:
                        # If the read is found in this TSV -- add it to the dict (and to the index later)
                        try:
                            idx_dict[tsv_taxann_fpath].append(
                                "read_" + fmt_id)  # append to existing list
                        except KeyError:
                            idx_dict[tsv_taxann_fpath] = [
                                "read_" + fmt_id
                            ]  # create a new list
                        finally:
                            readids_to_seek.remove(readid)
                        # end try
                    # end if
                # end for
            # end with
            if len(readids_to_seek) == 0:
                break
            # end if
        # end for

        # Save info about reads for which classification was not found
        #   in any of the classification files
        if len(readids_to_seek) != 0:
            not_found_key = 'CLASSIF_NOT_FOUND'
            idx_dict[not_found_key] = list()
            for readid in readids_to_seek:
                fmt_id = fmt_read_id(readid)[1:]
                idx_dict[not_found_key].append("read_" + fmt_id)
            # end for
        # end if

        # If, after checking all TSV files, nothing has changed -- taxonomic annotation is missing
        #     for some reads! We will write their IDs to the 'missing_reads_lst.txt' file.
        if len(readids_to_seek) == len_before:
            with print_lock:
                printlog_error_time(
                    "Error: some reads from FAST5 file not found")
                printlog_error("This FAST5 file: `{}`".format(f5_path))
                printlog_error(
                    "Some reads have not undergone taxonomic annotation.")
                missing_log = "missing_reads_lst.txt"
                printlog_error(
                    "List of missing reads are in following file: `{}`".format(
                        missing_log))
                with open(missing_log, 'w') as missing_logfile:
                    missing_logfile.write(
                        "Missing reads from file `{}`:\n\n".format(f5_path))
                    for readid in readids_to_seek:
                        missing_logfile.write(fmt_read_id(readid) + '\n')
                    # end for
                try:
                    for path in glob(os.path.join(index_dirpath, '*')):
                        os.unlink(path)
                    # end for
                    os.rmdir(index_dirpath)
                except OSError as oserr:
                    printlog_error_time(
                        "Error occured while removing index directory: {}".
                        format(oserr))
                finally:
                    platf_depend_exit(3)
                # end try
            # end with
        # end if

        with write_lock:
            try:
                # Open the index file for reading and writing, creating it if necessary ('c' flag)
                with open_shelve(os.path.join(index_dirpath, index_name),
                                 'c') as index_f5_2_tsv:
                    # Update index
                    index_f5_2_tsv[f5_path] = idx_dict
                # end with
            except OSError as oserr:
                printlog_error_time(
                    "Error: cannot create index file `{}`".format(
                        os.path.join(index_dirpath, index_name)))
                printlog_error(str(oserr))
                platf_depend_exit(1)
            # end try
        # end with

        sys.stdout.write('\r')
        printlog_info_time("File `{}` is processed.".format(
            os.path.basename(f5_path)))
        printn(" Working...")
Example #10
        "\nWarning! Binning FAST5 files in parallel doesn't give any profit.")
    print("Number of threads is switched to 1.")
    n_thr = 1
# end if
if len(fast5_list) == 0 and untwist_fast5:
    print(
        "\nWarning! No FAST5 file has been given to barapost-binning's input.")
    print("Therefore, `-u` (`--untwist-fast5`) flag does not make any sense.")
    print("Ignoring it.\n")
    untwist_fast5 = False
# end if

# Make sure that each file meant to be processed has its directory with the TSV result file
#    generated by prober and barapost.

printn("Primary validation...")
if not untwist_fast5:
    for fpath in fast5_list:
        # Get number of directories in 'tax_annot_res_dir' where results of current FAST5
        #    baraposting are located.
        possible_fast5_resdirs_num = len(
            glob("{}{}*{}*".format(tax_annot_res_dir, os.sep,
                                   get_checkstr(fpath))))

        if possible_fast5_resdirs_num == 1:
            continue  # OK
        elif possible_fast5_resdirs_num == 0:  # there is no such directory
            print()
            printlog_error_time(
                "Error: classification for following FAST5 file is missing:")
            printlog_error("  `{}`".format(fpath))
Example #11
def process(fq_fa_list, n_thr, packet_size, tax_annot_res_dir,
            blast_algorithm, use_index, db_path):
    # Function performs the "few_files"-parallel mode.
    #
    # :param fq_fa_list: list of paths to files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param n_thr: number of threads to launch;
    # :type n_thr: int;
    # :param packet_size: number of sequences processed by blast in a single launching;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logical value indicating whether to use the index;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;

    nfiles = len(fq_fa_list)

    for i, fq_fa_path in enumerate(fq_fa_list):
        # Create the result directory with the name of the FASTQ or FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extention)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$", infile_hname).group(1)

        # Look around and check if there are results of previous runs of this script
        # If 'look_around' returns None -- there is no data from a previous run
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None: # If there is no data from previous run
            num_done_seqs = 0 # number of successfully processed sequences
            tsv_res_path = os.path.join(new_dpath, "classification.tsv") # form result tsv file path
        else: # if there is data from previous run
            num_done_seqs = previous_data["n_done_reads"] # get number of successfully processed sequences
            tsv_res_path = previous_data["tsv_respath"] # result tsv file sholud be the same as during previous run
        # end if

        how_to_open = OPEN_FUNCS[ is_gzipped(fq_fa_path) ]
        fmt_func = FORMATTING_FUNCS[ is_gzipped(fq_fa_path) ]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(1 for line in how_to_open(fq_fa_path)) // 4 # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(tuple(filter(lambda l: True if l.startswith('>') else False,
                    map(fmt_func, how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                print()
                printlog_warning("Warning: current file is broken: {}."\
                    .format(str(err)))
                printlog_warning("File: `{}`".format(os.path.abspath(fq_fa_path)))
                printlog_warning("This file will not be processed.")
                continue
            # end try
        # end if

        packet_size = min(packet_size, num_seqs // n_thr)

        if num_seqs == num_done_seqs:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) has been already completely processed."\
                .format(i+1, nfiles, fq_fa_path))
            printlog_info("Omitting it.")
            printn("Working...")
            return
        # end if

        # Get number of sequences to pass to each thread
        file_part_size = num_seqs // n_thr
        if num_seqs % n_thr != 0:
            file_part_size += 1
        # end if

        pool = mp.Pool(n_thr, initializer=init_proc_single_file_in_paral,
            initargs=(mp.Lock(), mp.Lock(),))

        pool.starmap(process_part_of_file, [(file_part,
            tsv_res_path,
            packet_size,
            tax_annot_res_dir,
            blast_algorithm,
            use_index,
            db_path) for file_part in packet_generator(fq_fa_path,
                file_part_size,
                num_done_seqs)])

        # Reaping zombies
        pool.close()
        pool.join()

        sys.stdout.write('\r')
        printlog_info_time("File #{}/{} (`{}`) is processed.".\
            format(i+1, nfiles, os.path.basename(fq_fa_path)))
        printn("Working...")    # end for
def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                   min_pident, min_coverage, no_trash):
    # Function bins FAST5 file with untwisting.
    #
    # :param f5_path: path to FAST5 file meant to be processed;
    # :type f5_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    srt_file_dict = dict()

    index_dirpath = os.path.join(
        tax_annot_res_dir,
        index_name)  # name of directory that will contain indices

    # Make filter for quality and length
    QL_filter = get_QL_filter(f5_path, min_qual, min_qlen)
    # Configure path to trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(
            f5_path,
            outdir_path,
            min_qual,
            min_qlen,
        )
    else:
        QL_trash_fpath = None
    # end if

    # Make filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure path to this trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path,
                                                  min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    # File validation:
    #   RuntimeError will be raised if FAST5 file is broken.
    try:
        # File existence checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if

        from_f5 = h5py.File(f5_path, 'r')

        for _ in from_f5:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(
            os.path.basename(f5_path)))
        printlog_error("Reason: {}".format(str(runterr)))
        printlog_error("Omitting this file...")
        print()
        # Return zeroes -- inc_val won't be incremented and this file will be omitted
        return (0, 0, 0)
    # end try

    # Single-FAST5 and multi-FAST5 files should be processed in different ways:
    # the "Raw" group is always in the root of a single-FAST5 file and never in the root of a multi-FAST5 file
    if "Raw" in from_f5.keys():
        f5_cpy_func = copy_single_f5
    else:
        f5_cpy_func = copy_read_f5_2_f5
    # end if

    readids_to_seek = list(from_f5.keys())  # list of not-binned-yet read IDs

    # Fill the list 'readids_to_seek'
    for read_name in fast5_readids(from_f5):
        # Get rid of "read_"
        readids_to_seek.append(sys.intern(read_name))
    # end for

    # Walk through the index
    index_f5_2_tsv = open_shelve(os.path.join(index_dirpath, index_name), 'r')

    if not f5_path in index_f5_2_tsv.keys():
        printlog_error_time(
            "Source FAST5 file `{}` not found in index".format(f5_path))
        printlog_error("Try to rebuild index")
        platf_depend_exit(1)
    # end if

    for tsv_path in index_f5_2_tsv[f5_path].keys():

        read_names = index_f5_2_tsv[f5_path][tsv_path]
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy",
                                     "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_path, sens, taxonomy_path)

        for read_name in read_names:
            try:
                hit_names, *vals_to_filter = resfile_lines[sys.intern(
                    fmt_read_id(read_name)[1:])]
            except KeyError:
                printlog_error_time("Error: missing taxonomic annotation info for read `{}`"\
                    .format(fmt_read_id(read_name)[1:]))
                printlog_error(
                    "It is stored in `{}` FAST5 file".format(f5_path))
                printlog_error(
                    "Try to make new index file (press ENTER on corresponding prompt)."
                )
                printlog_error(
                    "Or, if does not work for you, make sure that taxonomic annotation info \
for this read is present in one of TSV files generated by `barapost-prober.py` and `barapost-local.py`."
                )
                index_f5_2_tsv.close()
                platf_depend_exit(1)
            # end try

            if not QL_filter(vals_to_filter):
                # Get name of result FASTQ file to write this read in
                if QL_trash_fpath not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     QL_trash_fpath)
                # end if
                f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath])
                QL_seqs_fail += 1
            elif not align_filter(vals_to_filter):
                # Get name of result FASTQ file to write this read in
                if align_trash_fpath not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     align_trash_fpath)
                # end if
                f5_cpy_func(from_f5, read_name,
                            srt_file_dict[align_trash_fpath])
                align_seqs_fail += 1
            else:
                for hit_name in hit_names.split(
                        "&&"
                ):  # there can be multiple hits for single query sequence
                    # Get name of result FASTQ file to write this read in
                    binned_file_path = os.path.join(
                        outdir_path, "{}.fast5".format(hit_name))
                    if binned_file_path not in srt_file_dict.keys():
                        srt_file_dict = update_file_dict(
                            srt_file_dict, binned_file_path)
                    # end if
                    f5_cpy_func(from_f5, read_name,
                                srt_file_dict[binned_file_path])
                # end for
                seqs_pass += 1
            # end if
        # end for

    from_f5.close()
    index_f5_2_tsv.close()

    # Close all binned files
    for file_obj in filter(lambda x: not x is None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(
        os.path.basename(f5_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
Example #13
def bin_fastqa_file(fq_fa_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                    min_pident, min_coverage, no_trash):
    # Function for single-thread binning FASTQ and FASTA files.
    #
    # :param fq_fa_path: path to FASTQ (or FASTA) file meant to be processed;
    # :type fq_fa_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    srt_file_dict = dict(
    )  # dict containing file objects of existing output files

    new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
    tsv_res_fpath = get_res_tsv_fpath(new_dpath)
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
    resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

    # Configure generator, write function and path to trash file
    if is_fastq(fq_fa_path):
        seq_records_generator = fastq_records
        write_fun = write_fastq_record
    else:
        seq_records_generator = fasta_records
        write_fun = write_fasta_record
    # end if

    # Make filter for quality and length
    QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
    # Configure path to trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(
            fq_fa_path,
            outdir_path,
            min_qual,
            min_qlen,
        )
    else:
        QL_trash_fpath = None
    # end if

    # Make filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure path to this trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                  min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    for fastq_rec in seq_records_generator(fq_fa_path):

        read_name = sys.intern(fmt_read_id(
            fastq_rec["seq_id"])[1:])  # get ID of the sequence

        try:
            hit_names, *vals_to_filter = resfile_lines[
                read_name]  # find hit corresponding to this sequence
        except KeyError:
            printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                .format(read_name))
            printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
            printlog_error("Make sure that this read has been already \
                processed by `barapost-prober.py` and `barapost-local.py`.")
            platf_depend_exit(1)
        # end try

        # Apply filters
        if not QL_filter(vals_to_filter):
            QL_seqs_fail += 1
            # Place this sequence to QL trash file
            if QL_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath)
            # end if
            write_fun(srt_file_dict[QL_trash_fpath],
                      fastq_rec)  # write current read to binned file

        elif not align_filter(vals_to_filter):
            align_seqs_fail += 1
            # Place this sequence to align_trash file
            if align_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict,
                                                 align_trash_fpath)
            # end if
            write_fun(srt_file_dict[align_trash_fpath],
                      fastq_rec)  # write current read to binned file

        else:
            for hit_name in hit_names.split(
                    "&&"
            ):  # there can be multiple hits for single query sequence
                # Get name of result FASTQ file to write this read in
                binned_file_path = os.path.join(
                    outdir_path,
                    "{}.fast{}".format(hit_name,
                                       'q' if is_fastq(fq_fa_path) else 'a'))
                if binned_file_path not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     binned_file_path)
                # end if
                write_fun(srt_file_dict[binned_file_path],
                          fastq_rec)  # write current read to binned file
            # end for
            seqs_pass += 1
        # end if
    # end for

    # Close all binned files
    for file_obj in filter(lambda x: not x is None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(
        os.path.basename(fq_fa_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
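A hedged sketch of a direct single-file call and the returned counters (paths and thresholds are illustrative). Binned reads end up in "<outdir>/<hit_name>.fastq" (or ".fasta"), while filtered reads go to the QL and align trash files when trash output is enabled.

seqs_pass, QL_seqs_fail, align_seqs_fail = bin_fastqa_file(
    "reads_1.fastq",     # hypothetical input file
    "barapost_result",   # hypothetical taxonomic annotation dir
    "genus",             # illustrative sensitivity value
    10.0,                # min_qual
    None,                # min_qlen (filter disabled)
    None,                # min_pident (filter disabled)
    None,                # min_coverage (filter disabled)
    no_trash=False,
)
print("passed: {}, QL-filtered: {}, align-filtered: {}".format(
    seqs_pass, QL_seqs_fail, align_seqs_fail))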
Example #14
# The main goal of multiprocessing is to isolate processes from one another.
#
# Two situations are possible:
#   1. Number of threads <= number of files meant to be processed ('many_files'-parallel mode):
#      Files will be distributed equally among processes.
#      Processes interact with one another only while printing things to the console
#      for user's entertainment.
#   2. Number of threads > number of files meant to be processed ('few_files'-parallel mode):
#      Files will be processed one by one. They will be divided into equal blocks,
#      and these blocks will be distributed among processes.
#      Processes interact with one another while writing to result file and
#      while printing things to the console.

print()
printlog_info_time("Starting classification.")
printn("  Working...")

if n_thr <= len(fq_fa_list):
    if n_thr != 1:

        # Proceed 'many_files'-parallel processing

        from src.barapost_local_modules.parallel_mult_files import process

        process(fq_fa_list, n_thr, packet_size, tax_annot_res_dir,
                blast_algorithm, use_index, db_path)

    else:

        # Proceed single-thread processing
Example #15
def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen,
    min_pident, min_coverage, no_trash):
    # Function bins FAST5 file without untwisting.
    #
    # :param f5_path: path to FAST5 file meant to be processed;
    # :type f5_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if the user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0 # counter for sequences that pass filters
    QL_seqs_fail = 0 # counter for too short or too low-quality sequences
    align_seqs_fail = 0 # counter for sequences that align to their best hit with too low identity or coverage

    srt_file_dict = dict()

    new_dpath = glob("{}{}*{}*".format(tax_annot_res_dir, os.sep, get_checkstr(f5_path)))[0]
    tsv_res_fpath = get_res_tsv_fpath(new_dpath)
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
    resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

    # Make filter for quality and length
    QL_filter = get_QL_filter(f5_path, min_qual, min_qlen)
    # Configure path to trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(f5_path, outdir_path, min_qual, min_qlen,)
    else:
        QL_trash_fpath = None
    # end if

    # Make filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure path to this trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path, min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    # File validation:
    #   RuntimeError will be raised if FAST5 file is broken.
    try:
        # File existence checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if the file at f5_path is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if

        from_f5 = h5py.File(f5_path, 'r')

        for _ in from_f5:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(os.path.basename(f5_path)))
        printlog_error("Reason: {}".format( str(runterr) ))
        printlog_error("Omitting this file...")
        print()
        # Return zeroes -- inc_val won't be incremented and this file will be omitted
        return (0, 0, 0)
    # end try

    # Single-FAST5 and multi-FAST5 files should be processed in different ways.
    # The "Raw" group is always in the root of a single-FAST5 file and never in the root of a multi-FAST5 file.
    if "Raw" in from_f5.keys():
        f5_cpy_func = copy_single_f5
    else:
        f5_cpy_func = copy_read_f5_2_f5
    # end if

    for read_name in fast5_readids(from_f5):

        try:
            hit_names, *vals_to_filter = resfile_lines[sys.intern(fmt_read_id(read_name))[1:]] # omit 'read_' in the beginning of FAST5 group's name
        except KeyError:
            printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                .format(fmt_read_id(read_name)))
            printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
            printlog_error("Try running barapost-binning with `-u` (`--untwist-fast5`) flag.\n")
            platf_depend_exit(1)
        # end try
        # If read is found in TSV file:
        if not QL_filter(vals_to_filter):
            QL_seqs_fail += 1
            # Place this sequence to QL trash file
            if QL_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath)
            # end if
            f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath])
        elif not align_filter(vals_to_filter):
            align_seqs_fail += 1
            # Place this sequence to align_trash file
            if align_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, align_trash_fpath)
            # end if
            f5_cpy_func(from_f5, read_name, srt_file_dict[align_trash_fpath])
        else:
            for hit_name in hit_names.split("&&"): # there can be multiple hits for single query sequence
                # Get name of the result FAST5 file to write this read to
                binned_file_path = os.path.join(outdir_path, "{}.fast5".format(hit_name))
                if binned_file_path not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict, binned_file_path)
                # end if
                f5_cpy_func(from_f5, read_name, srt_file_dict[binned_file_path])
            # end for
            seqs_pass += 1
        # end if
    # end for

    from_f5.close()

    # Close all binned files
    for file_obj in filter(lambda x: x is not None, srt_file_dict.values()):
        file_obj.close()
    # end for


    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(os.path.basename(f5_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
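
# Illustrative call (argument values are assumptions, not taken from the original script):
#   passed, ql_failed, align_failed = bin_fast5_file(
#       "fast5/batch_0.fast5", "barapost_result_dir", sens="genus",
#       min_qual=10, min_qlen=None, min_pident=None,
#       min_coverage=None, no_trash=False)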
Example #16
def build_local_db(tax_annot_res_dir, acc_fpath, your_own_fasta_lst,
                   accs_to_download, use_index):
    # Function creates a database with utilities from 'blast+' toolkit
    #     according to acc_dict and your_own_fasta_lst.
    #
    # :param tax_annot_res_dir: path to current result directory
    #   (each processed file has its own result directory);
    # :type tax_annot_res_dir: str;
    # :param acc_fpath: path to file "hits_to_download.tsv";
    # :type acc_fpath: str;
    # :param your_own_fasta_lst: list of user's fasta files to be included in database;
    # :type your_own_fasta_lst: list<str>;
    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param use_index: whether to use index;
    # :type use_index: str;

    # Returns path to created database.

    # Path to directory in which database will be placed
    db_dir = os.path.join(tax_annot_res_dir, "local_database")
    # Path to DBM taxonomy file
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")

    try:
        os.makedirs(db_dir)
    except OSError:
        # If this directory already exists

        while True:
            if len(os.listdir(db_dir)) == 0:
                # If db directory is empty -- break and build a database
                break
            else:
                print()
                printlog_info("Database directory is not empty:")
                printlog_info("  `{}`".format(os.path.abspath(db_dir)))
                printlog_info("Here is it's content:")
                for i, fname in enumerate(os.listdir(os.path.abspath(db_dir))):
                    printlog_info(" {}. `{}`".format(i + 1, fname))
                # end for
                reply = input(
                    """\nPress ENTER to start classification using existing database.
Enter 'r' to remove all files in this directory and create the database from the beginning:>>"""
                )

                if reply == "":
                    # Do not build a database, just return path to it.
                    printlog_info("You have chosen to use extant database.")

                    # Return path to DB located in this directory
                    dbpath = next(iter(os.listdir(db_dir)))
                    dbpath = dbpath.partition(".fasta")[0] + dbpath.partition(
                        ".fasta")[1]  # remove all after '.fasta'

                    return os.path.join(db_dir, dbpath)

                elif reply == 'r':

                    printlog_info("You have chosen to rebuild the database.")
                    # Rename old classification files so that new results are written to fresh ones:
                    old_classif_dirs = filter(
                        lambda d: os.path.exists(
                            os.path.join(d, "classification.tsv")),
                        glob(os.path.join(tax_annot_res_dir, "*")))
                    old_classif_files = tuple(
                        map(lambda f: os.path.join(f, "classification.tsv"),
                            old_classif_dirs))

                    if len(old_classif_files) > 0:
                        print()
                        printlog_info("Renaming old classification files:")
                        for classif_file in old_classif_files:
                            rename_file_verbosely(classif_file)
                        # end for
                    # end if

                    # Empty database directory
                    for file in glob("{}{}*".format(db_dir, os.sep)):
                        os.unlink(file)
                    # end for

                    # Break from the loop in order to build a database
                    break
                else:
                    print("Invalid reply: `{}`\n".format(reply))
                    continue
                # end if
            # end if
        # end while
    # end try

    # It is a dictionary of accessions and record names.
    # Accessions are keys, record names are values.
    acc_dict = configure_acc_dict(acc_fpath, your_own_fasta_lst,
                                  accs_to_download)

    if len(accs_to_download) != 0:
        verify_cl_accessions(accs_to_download, acc_dict)
    # end if

    # Retrieve already existing taxonomy data from taxonomy file
    tax_exist_accs = taxonomy.get_tax_keys(taxonomy_path)

    # If the accession file does not exist and execution has reached this point, everything is OK:
    #    we are building a database from the user's files only.
    if len(acc_dict) != 0:
        print()

        print("""Following sequences (and all replicons related to them)
  will be downloaded from Genbank for further taxonomic classification
  on your local machine:\n""")
        printlog_info(
            "Following sequences (and all replicons related to them) \
will be downloaded from Genbank for further taxonomic classification \
on your local machine:")
        for i, acc in enumerate(acc_dict.keys()):
            printlog_info(" {}. {} - `{}`".format(i + 1, acc, acc_dict[acc]))
        # end for

        search_for_related_replicons(acc_dict)

        printlog_info_time("Completing taxonomy file...")
        for i, acc in enumerate(acc_dict.keys()):
            if not acc in tax_exist_accs:
                taxonomy.find_taxonomy(acc, acc_dict[acc][1], taxonomy_path)
            # end if
            # Accessions can be of different length
            printn("\r{} - {}: {}/{}".format(getwt(), acc, i +
                                             1, len(acc_dict)) + " " * 10 +
                   "\b" * 10)
        # end for
        print()
        printlog_info_time("Taxonomy file is consistent.")
    # end if

    local_fasta = os.path.join(
        db_dir, "local_seq_set.fasta")  # path to downloaded FASTA file

    add_lambda_phage(local_fasta,
                     taxonomy_path)  # add lambda phage control sequence

    retrieve_fastas_by_acc(
        acc_dict, db_dir, local_fasta)  # download main fasta data from GenBank

    # Add 'your own' fasta files to database
    if not len(your_own_fasta_lst) == 0:

        # This variable counts sequences from local files.
        # It is necessary to avoid duplicated accessions.
        own_seq_counter = 0

        # Check if these files are assembly made by SPAdes or a5
        spades_patt = r">NODE_[0-9]+"  # this pattern will match sequence IDs generated y SPAdes
        a5_patt = r">scaffold_[0-9]+"  # this pattern will match sequence IDs generated y a5
        assemblies = list(
        )  # this list will contain paths to assembly files (SPAdes or a5)
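        # For reference (header formats may vary between assembler versions):
        #   a SPAdes header typically looks like '>NODE_1_length_361652_cov_28.5',
        #   an a5 header like '>scaffold_1'.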

        for own_fasta_path in reversed(your_own_fasta_lst):

            how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
            fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]

            with how_to_open(own_fasta_path) as fasta_file:
                first_seq_id = fmt_func(fasta_file.readline(
                ))  # get the first line in file (the first seq ID)
            # end with

            # if we've got SPAdes assembly
            if not re.search(spades_patt, first_seq_id) is None:
                assemblies.append(own_fasta_path)
                # Remove this file from the list -- it will be processed in a specific way
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if

            # if we've got a5 assembly
            if not re.search(a5_patt, first_seq_id) is None:
                assemblies.append(own_fasta_path)
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if
        # end for

        # Include assembly files in the multi-FASTA file

        # Find common prefix of all assembly paths and remove it from assembly names
        if len(assemblies) > 1:
            assemblies_formatted = tuple(
                map(lambda f: os.path.abspath(f).replace(os.sep, '-'),
                    assemblies))
            common_prefix = find_common_prefix(assemblies_formatted)
            assemblies_formatted = tuple(
                map(lambda f: f.replace(common_prefix, ''),
                    assemblies_formatted))
        elif len(assemblies) > 0:
            common_prefix = ''
            assemblies_formatted = tuple(map(os.path.basename, assemblies))
        # end if

        # Add assembled sequences to database
        with open(local_fasta, 'a') as fasta_db:
            for i, assm_path in enumerate(assemblies):
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(assm_path)))
                assm_name_fmt = assemblies_formatted[i]

                how_to_open = OPEN_FUNCS[is_gzipped(assm_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(assm_path)]
                with how_to_open(assm_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # Comments about the "OWN_SEQ..." IDs can be found below.
                        # Paths will be written into seq IDs in the following way:
                        #   some-happy-path.fastq--
                        # so that they can be retrieved reliably with a regex later.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            own_def = "{}--".format(
                                assm_name_fmt.replace(common_prefix,
                                                      '')) + line[1:]
                            own_def = remove_bad_chars(own_def)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, own_def)
                            line = ">" + "{} {}".format(own_acc, own_def)
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with

        with open(local_fasta, 'a') as fasta_db:
            for own_fasta_path in your_own_fasta_lst:
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(own_fasta_path)))

                how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]
                with how_to_open(own_fasta_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # 'makeblastdb' considers the first word (space-separated) of a header to be the sequence ID
                        #   and throws an error if there are duplicated IDs.
                        # To prevent such duplication, we create our own sequence IDs:
                        #   'OWN_SEQ_<NUMBER>', written at the beginning of the FASTA record name.
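                        # For instance (header text is hypothetical):
                        #   '>contig_1 sample A' becomes '>OWN_SEQ_7 contig_1 sample A'.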
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, line[1:])
                            line = ">" + own_acc + ' ' + remove_bad_chars(
                                line[1:])
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with
    # end if

    # 'lcl|ACCESSION...' entries can be given with a '.1'
    #   (or '.2', etc.) version suffix by blastn.
    # There are no such suffixes in the taxonomy file.
    # Therefore we prune accession versions in advance.
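    # For example (accession is hypothetical): a header starting with '>CP017100.1'
    #   will start with '>CP017100' after pruning.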
    print()
    printn("{} - Formatting accessions...".format(getwt()))
    log_info("Formatting accessions...")
    corrected_path = os.path.join(db_dir, "corrected_seqs.fasta")
    with open(local_fasta, 'r') as source_file, open(corrected_path,
                                                     'w') as dest_file:
        for line in source_file:
            if line.startswith('>'):
                line = line.strip()
                acc, seq_name = (line.partition(' ')[0],
                                 line.partition(' ')[2])
                acc = acc.partition('.')[0]
                seq_name = remove_bad_chars(seq_name)
                seq_name = re.sub(r'[^\x00-\x7F]+', '_',
                                  seq_name)  # remove non-ascii chars
                line = ' '.join((acc, seq_name)) + '\n'
            # end if
            dest_file.write(line)
        # end for
    # end with
    os.unlink(local_fasta)
    os.rename(corrected_path, local_fasta)
    sys.stdout.write("\r{} - Formatting accessions... ok".format(getwt()))
    log_info("Formatting accessions done.")

    # Configure command line
    make_db_cmd = "makeblastdb -in {} -parse_seqids -dbtype nucl".format(
        local_fasta)
    exit_code = os.system(make_db_cmd)  # make a blast-format database
    if exit_code != 0:
        printlog_error_time("Error occured while making the database")
        platf_depend_exit(exit_code)
    # end if

    print("\033[1A{} - Database is successfully created: `{}`\n".format(
        getwt(), local_fasta))
    log_info("Database is successfully created: `{}`".format(local_fasta))

    if use_index == "true":
        printlog_info_time("Database index creating started")
        # Configure command line
        make_index_cmd = "makembindex -input {} -iformat blastdb -verbosity verbose".format(
            local_fasta)
        exit_code = os.system(
            make_index_cmd)  # create an index for the database
        if exit_code != 0:
            printlog_info_time("Error occured while creating database index")
            platf_depend_exit(exit_code)
        # end if

        printlog_info_time("Database index has been successfully created")
    # end if

    # Gzip downloaded FASTA file
    printlog_info_time("Gzipping FASTA file: `{}`".format(local_fasta))

    if gzip_util_found:
        os.system("{} -v {}".format(gzip_util, local_fasta))
    else:
        # form .fasta.gz file 'by hand'
        with open(local_fasta,
                  'rb') as fasta_file, open_as_gzip(local_fasta + ".gz",
                                                    "wb") as fagz_file:
            shutil_copyfileobj(fasta_file, fagz_file)
        # end with
        os.unlink(local_fasta)  # remove source FASTA file, not the database
    # end if

    return local_fasta
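
# Illustrative call (argument values are assumptions, not taken from the original script):
#   db_path = build_local_db("barapost_result_dir",
#                            "barapost_result_dir/hits_to_download.tsv",
#                            your_own_fasta_lst=["my_genomes.fasta"],
#                            accs_to_download=[], use_index="true")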