Example #1
0
def get_res_tsv_fpath(new_dpath):
    # Function returns current TSV file. Binning will be performed according to this file.
    # :param new_dpath: current result directory;
    # :type new_dpath: str;

    is_similar_to_tsv_res = lambda f: f == "classification.tsv"

    if not os.path.exists(new_dpath):
        printlog_error_time(
            "Error: directory `{}` does not exist!".format(new_dpath))
        printlog_error("Please make sure you have performed taxonomic \
annotation of the following file: `{}` \
with `barapost-prober.py` and/or `barapost-local.py`".format(
            os.path.basename(new_dpath)))
        printlog_error(
            "Also this error might occur if you forget to specify result directory \
generated by `barapost-prober.py` with `-r` option.")
        platf_depend_exit(0)
    # end if

    # Recent file will be the first in sorted list
    tsv_res_fpath = list(
        filter(is_similar_to_tsv_res, sorted(os.listdir(new_dpath))))[0]

    return os.path.join(new_dpath, tsv_res_fpath)
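# Hedged usage sketch (not part of the toolkit): point the function at a
# result directory that already contains `classification.tsv`. The scratch
# paths below exist only for this demonstration; `os` is assumed to be
# imported by the surrounding module.
import os
import tempfile

demo_dpath = tempfile.mkdtemp()
open(os.path.join(demo_dpath, "classification.tsv"), 'w').close()
print(get_res_tsv_fpath(demo_dpath))  # .../classification.tsv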
Example #2
0
def create_result_directory(fq_fa_path, outdir_path):
    # Function creates a result directory named according
    #     to how source FASTQ or FASTA file is named.
    #
    # :param fq_fa_path: path to source FASTQ or FASTA file;
    # :type fq_fa_path: str;
    # :param outdir_path: path to directory in which result_directory will be created;
    # :type outdir_path: str;
    #
    # Returns 'str' path to the recently created result directory.

    # dpath means "directory path"
    new_dpath = os.path.join(
        outdir_path, os.path.basename(fq_fa_path))  # get rid of absolute path
    new_dpath = re.search(r"(.*)\.(m)?f(ast)?(a|q)(\.gz)?$",
                          new_dpath).group(1)  # get rid of extension
    if not os.path.exists(new_dpath):
        try:
            os.makedirs(new_dpath)
        except OSError as oserr:
            printlog_error_time(
                "Error: can't create result directory: `{}`".format(new_dpath))
            printlog_error(str(oserr))
            platf_depend_exit(1)
        # end try
    # end if
    return new_dpath
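# A standalone check of the extension-stripping regex above (a sketch, not
# part of the toolkit). It shows which file names the pattern accepts:
import re

for name in ("reads.fastq", "reads.fasta.gz", "reads.mfa", "reads.fq.gz", "reads.txt"):
    match = re.search(r"(.*)\.(m)?f(ast)?(a|q)(\.gz)?$", name)
    print(name, "->", match.group(1) if match else "no match")
# end for
# Every name except "reads.txt" is reduced to "reads"; note that a
# non-matching name would make the function above raise AttributeError.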
Example #3
0
def provide_open_funcs(fpaths):
    # Function, which returns opening function(s) for input file(s).
    #
    # :param fpaths: collection of paths to input files;
    # :type fpaths: list<str>;

    open_funcs = list()

    try:
        for fpath in fpaths:
            # Check if input file is gzipped
            if _is_gzipped(fpath):
                open_funcs.append(
                    functools.partial(gzip.open, mode='rt', encoding='utf-8'))
            # Check if input file is bzipped2
            elif _is_bzipped(fpath):
                open_funcs.append(
                    functools.partial(bz2.open, mode='rt', encoding='utf-8'))
            # Check if input file is plain text file
            elif _is_plain_text(fpath):
                open_funcs.append(
                    functools.partial(open, mode='r', encoding='utf-8'))
            else:
                # Raise a super terrifying exception
                raise _InvalidFileError('Error: cannot read file `{}`: \
it is neither plain text file, nor gzipped, nor bzipped2.'.format(fpath))
            # end if
        # end for
    except _InvalidFileError as err:
        printlog_error(str(err))
        platf_depend_exit(1)
    # end try

    return open_funcs
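# The `_is_gzipped`, `_is_bzipped` and `_is_plain_text` helpers are assumed
# by the function above but not shown in this example. A minimal sketch of
# how they might be implemented via magic-byte sniffing (an assumption --
# the real barapost helpers may differ):
def _is_gzipped_sketch(fpath):
    # Gzip files start with bytes 0x1f 0x8b
    with open(fpath, 'rb') as f:
        return f.read(2) == b'\x1f\x8b'
# end def

def _is_bzipped_sketch(fpath):
    # Bzip2 files start with the ASCII signature "BZh"
    with open(fpath, 'rb') as f:
        return f.read(3) == b'BZh'
# end def

def _is_plain_text_sketch(fpath):
    # Heuristic: the first kilobyte decodes as UTF-8
    with open(fpath, 'rb') as f:
        chunk = f.read(1024)
    # end with
    try:
        chunk.decode('utf-8')
        return True
    except UnicodeDecodeError:
        return False
    # end try
# end def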
Example #4
0
def rename_file_verbosely(file):
    # Function verbosely renames file (as well as directory) given to it.
    # :param file: path to file (directory) meant to be renamed;
    # :type file: str;

    if not os.path.exists(file):
        return None
    # end if

    # Path to "file's" parent directory
    pardir = os.path.abspath(os.path.dirname(file))

    # Function can rename directories
    if os.path.isdir(file):
        is_analog = lambda f: not re.search(r"{}.*(_old_[0-9]+)?$"\
            .format(os.path.basename(file)), f) is None
        word = "directory"
        name_itself = file
        ext = ""
    else:
        is_analog = lambda f: re.search(r"(.*)\..*$", os.path.basename(file)
                                        ).group(1) in f
        word = "file"
        name_itself = re.search(r"(.*)\..*$", file).group(1)
        ext = re.search(r".*(\..*)$", file).group(1)
    # end if

    # Count files in 'pardir' that have analogous names as 'file' has:
    num_analog_files = len(list(filter(is_analog, os.listdir(pardir))))

    if re.search(r"_old_[0-9]+", file) is None:
        # Append "_old_<number>"
        new_name = name_itself + "_old_" + str(num_analog_files) + ext
    else:
        # Merely substitute new number
        new_name = file.replace(
            re.search(r"_old_([0-9]+)", file).group(1),
            str(num_analog_files + 1))
    # end if

    try:
        print()
        printlog_info(" - Renaming old {}:".format(word))
        printlog_info("  `{}` --> `{}`".format(file, new_name))
        os.rename(file, new_name)
    except OSError as err:
        printlog_error_time("Error: {} `{}` cannot be renamed:".format(
            word, str(file)))
        printlog_error(str(err))
        platf_depend_exit(1)
    # end try

    return new_name
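# A tiny illustration (sketch only) of the "_old_<number>" naming scheme used
# above, detached from the filesystem:
import re

name = "classification.tsv"
name_itself = re.search(r"(.*)\..*$", name).group(1)  # "classification"
ext = re.search(r".*(\..*)$", name).group(1)  # ".tsv"
print(name_itself + "_old_" + "1" + ext)  # classification_old_1.tsv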
Example #5
0
def remove_tmp_files(*paths):
    # Function removes files passed to it.
    # :param paths: an array-like collection of paths to files;
    # :type paths: list<str>;

    for path in paths:
        if os.path.exists(path):
            try:
                os.unlink(path)
            except OSError as oserr:
                printlog_error_time("Error: cannot remove file `{}`").format(
                    path)
                printlog_error(str(oserr))
                platf_depend_exit(1)
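# Hedged usage sketch: create two scratch files and remove them. Assumes the
# logging helpers above are importable; `os` is imported here for the demo.
import os
import tempfile

tmp_dir = tempfile.mkdtemp()
tmp_a = os.path.join(tmp_dir, "a.tmp")
tmp_b = os.path.join(tmp_dir, "b.tmp")
for p in (tmp_a, tmp_b):
    open(p, 'w').close()
# end for
remove_tmp_files(tmp_a, tmp_b)
assert not os.path.exists(tmp_a) and not os.path.exists(tmp_b)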
Example #6
0
def update_file_dict(srt_file_dict, new_fpath):
    try:
        if new_fpath is not None:
            srt_file_dict[sys.intern(new_fpath)] = open(new_fpath, 'a')
        else:
            srt_file_dict[new_fpath] = None  # handle no_trash
        # end if
    except OSError as oserr:
        printlog_error_time("Error occured while opening one of result files")
        printlog_error("Errorneous file: `{}`".format(new_fpath))
        printlog_error(str(oserr))
        platf_depend_exit(1)
    # end try
    return srt_file_dict
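# Sketch of the lazy-open pattern this helper supports: output files are
# opened on first use and cached in the dict. The path below is hypothetical;
# `sys` is imported here because the helper relies on `sys.intern`.
import sys

srt_file_dict = dict()
out_path = "bin_A.fastq"  # hypothetical output bin
if out_path not in srt_file_dict.keys():
    srt_file_dict = update_file_dict(srt_file_dict, out_path)
# end if
srt_file_dict[out_path].write("@read1\nACGT\n+\n!!!!\n")
srt_file_dict[out_path].close()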
Example #7
0
def fastq_generator(fq_fpaths):
    # Function yields fastq records.
    # It does not create new FastqRecord object each time.
    # Instead it just updates extant object.
    # :param fq_fpaths: list of paths to input fastq files;
    # :type fq_fpaths: list<str>, tuple<str>;
    # Yields list of FastqRecord-s, list<FastqRecord>.

    # Get open functions for both files
    open_funcs = src.compression.provide_open_funcs(fq_fpaths)

    # Open input files and create FastqRecord objects for forward and reverse reads.
    fq_files = list()
    fq_records = list()
    for fpath, open_func in zip(fq_fpaths, open_funcs):
        fq_files.append(open_func(fpath))
        fq_records.append(FastqRecord(None, None, None, None))
    # end for

    eof = False

    while not eof:

        for fq_record, fq_file in zip(fq_records, fq_files):
            # Update FastqRecord
            fq_record.update_record(fq_file.readline().strip(),
                                    fq_file.readline().strip(),
                                    fq_file.readline().strip(),
                                    fq_file.readline().strip())
        # end for

        if fq_records[0].read_name == '':
            eof = True  # end of file
        else:
            # Validate fastq record(s)
            for fq_record in fq_records:
                error_response = fq_record.validate_fastq()
                if not error_response is None:
                    printlog_error('Fastq error: {}'.format(error_response))
                    platf_depend_exit(1)
                # end if
            # end for
            yield fq_records
        # end if
    # end while

    # Close input files.
    for fq_file in fq_files:
        fq_file.close()
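# `FastqRecord` is assumed by the generator above but is not defined in this
# example. A minimal sketch of the interface the generator relies on
# (update-in-place plus validation); the real class may hold more state:
class FastqRecordSketch:

    def __init__(self, read_name, seq, cmnt, qual):
        self.update_record(read_name, seq, cmnt, qual)
    # end def

    def update_record(self, read_name, seq, cmnt, qual):
        # Overwrite fields instead of allocating a new object
        self.read_name = read_name
        self.seq = seq
        self.cmnt = cmnt
        self.qual = qual
    # end def

    def validate_fastq(self):
        # Return None if the record looks like valid FASTQ, else an error string
        if not self.read_name:
            return None  # EOF sentinel; the caller stops before validating
        # end if
        if not self.read_name.startswith('@'):
            return 'header does not start with `@`: `{}`'.format(self.read_name)
        # end if
        if len(self.seq) != len(self.qual):
            return 'sequence and quality lengths differ for `{}`'.format(self.read_name)
        # end if
        return None
    # end def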
Example #8
0
def look_around(new_dpath, fq_fa_path):
    # Function looks around in order to check if there are results from previous runs of this script.
    #
    # Returns None if there is no result from previous run.
    # If there are results from previous run, returns a dict of the following structure:
    # {
    #     "tsv_respath": path_to_tsv_file_from_previous_run (str),
    #     "n_done_reads": number_of_successfull_requests_from_currenrt_FASTA_file (int),
    # }
    #
    # :param new_dpath: path to current (corresponding to fq_fa_path file) result directory;
    # :type new_dpath: str;
    # :param fq_fa_path: path to current (corresponding to fq_fa_path file) FASTA file;
    # :type fq_fa_path: str;

    # "hname" means human readable name (i.e. without file path and extention)
    fasta_hname = os.path.basename(fq_fa_path)  # get rid of absolute path
    fasta_hname = re.search(r"(.*)\.(m)?f(ast)?a", fasta_hname).group(
        1)  # get rid of '.fasta' extension

    # Form path to result file
    tsv_res_fpath = os.path.join(new_dpath, "classification.tsv")

    num_done_reads = 0  # variable to keep number of successfully processed sequences

    if os.path.exists(tsv_res_fpath):

        with open(tsv_res_fpath, 'r') as res_file:
            # There can be invalid information in result file
            try:
                lines = res_file.readlines()
                num_done_reads = len(lines) - 1  # the first line is a header
            except OSError as err:
                printlog_error_time("Data in classification file `{}` is broken. Reason:"\
                    .format(tsv_res_fpath))
                printlog_error(str(err))
                printlog_error("Starting from the beginning.")
                rename_file_verbosely(tsv_res_fpath)
                return None
            # end try
        # end with
    else:
        return None
    # end if

    return {
        "tsv_respath": tsv_res_fpath,
        "n_done_reads": num_done_reads,
    }
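# Hedged usage sketch: resume bookkeeping for one FASTA file (paths are
# hypothetical; `os` and `re` are assumed to be imported by the module).
import os, re

previous = look_around("outdir/sample_1", "sample_1.fasta")
if previous is None:
    num_done_reads = 0  # nothing to resume
else:
    num_done_reads = previous["n_done_reads"]
# end if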
Example #9
0
def recover_taxonomy(acc, hit_def, taxonomy_path):
    # Function recovers missing taxonomy by given accession.
    #
    # :param acc: accession of taxonomy entry to recover;
    # :type acc: str;
    # :param hit_def: name of this sequence;
    # :type hit_def: str;
    # :param taxonomy_path: path to TSV file with taxonomy;
    # :type taxonomy_path: str;

    if acc == "LAMBDA":
        # If we are missing lambda phage taxonomy -- just add it
        save_taxonomy_directly(taxonomy_path, acc,
                               "Lambda-phage-nanopore-control")
    elif acc.startswith("OWN_SEQ_"):
        # If sequence is an "own seq" -- check fasta file

        # Get necessary title line from `local_seq_set.fasta`
        # Firstly find fasta file (it may be compressed)
        classif_dir = os.path.dirname(os.path.dirname(taxonomy_path))
        db_dir = os.path.join(classif_dir, "local_database")
        db_files = glob.glob("{}{}*".format(db_dir, os.sep))
        try:
            local_fasta = next(iter(filter(is_fasta, db_files)))
        except StopIteration:
            printlog_error_time(
                "Error: cannot recover taxonomy for following sequence:")
            printlog_error(" `{} - {}`.".format(acc, hit_def))
            printlog_error(
                "You can solve this problem by yourself (it's pretty simple).")
            printlog_error("Just add taxonomy line for {} to file `{}`".format(
                acc, taxonomy_path))
            printlog_error("  and run the program again.")
            platf_depend_exit(1)
        # end try

        # Find our line starting with `acc`
        how_to_open = OPEN_FUNCS[is_gzipped(local_fasta)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(local_fasta)]
        if is_gzipped(local_fasta):
            search_for = b">" + bytes(acc, 'ascii') + b" "
        else:
            search_for = ">{} ".format(acc)
        # end if

        with how_to_open(local_fasta) as fasta_file:
            for line in fasta_file:
                if line.startswith(search_for):
                    seq_name = fmt_func(line).partition(' ')[
                        2]  # get name of the sequence
                    save_taxonomy_directly(taxonomy_path, acc, seq_name)
                    break
                # end if
            # end for
        # end with
    else:
        # Try to find taxonomy in NCBI
        download_taxonomy(acc, hit_def, taxonomy_path)
Example #10
0
def search_for_related_replicons(acc_dict):
    # Function searches for replicons related to those in 'hits_to_download.tsv'
    #   or specified with '-s' option.
    # :param acc_dict: dictionary containing accession data of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;

    print()
    printlog_info_time("Searching for related replicons...")

    start_accs = tuple(
        acc_dict.keys())  # accessions, which were "discovered" by prober

    for i, acc in enumerate(start_accs):

        printlog_info("{}. {} ({}):".format(i + 1, acc, acc_dict[acc]))

        # Search for related replicons:
        try:
            related_repls = _get_related_replicons(acc, acc_dict)
        except AttributeError:
            printlog_error_time(
                "Parsing error: cannot find replicons related to {}.".format(
                    acc))
            printlog_error("Please, contact the developer")
            platf_depend_exit(1)
        else:
            related_repls = _deduplicate_replicons(related_repls, acc)
        # end try
        for rel_acc, rel_def in related_repls:
            acc_dict[rel_acc] = rel_def
        # end for
    # end for

    print()
    if len(start_accs) != len(acc_dict):  # there are some new replicons
        printlog_info_time("{} related replicons have been found.".\
            format(len(acc_dict) - len(start_accs)))
    else:
        printlog_info_time("No related replicons found.")
    # end if
    print()


# end def search_for_related_replicons
Example #11
0
    def whether_to_build_index(index_dirpath):
        # Function checks if there are any files in index directory.
        # If there are any, it asks a user whether to create a new index or to use old one.

        # :param index_dirpath: path to index directory;
        # :type index_dirpath: str;

        use_old_index = False

        if len(os.listdir(index_dirpath)) != 0:
            printlog_info(
                "Index file created by `-u` option already exists (left from previous run)."
            )

            error = True

            while error:
                reply = input("""  Press ENTER to make new index file
  or enter 'u' to use old index file:>>""")
                if reply == "":
                    try:
                        for path in glob(os.path.join(index_dirpath, '*')):
                            os.unlink(path)
                        # end for
                    except OSError as oserr:
                        printlog_error_time(
                            "Error: cannot remove old index files!")
                        printlog_error(str(oserr))
                        platf_depend_exit(1)
                    # end try
                    error = False
                elif reply == 'u':
                    use_old_index = True
                    error = False
                else:
                    print("Invalid reply!\n")
                # end if
            # end while
            printlog_info("You have chosen to {} index file.".format(
                "use old" if use_old_index else "make new"))
            print()
        # end if
        return use_old_index
Example #12
0
def launch_blastn(packet, blast_algorithm, use_index, queries_tmp_dir,
                  db_path):
    """
    Function launches 'blastn' utility from "BLAST+" toolkit and returns it's response.

    :param pacekt: FASTA data meant to be processend by 'blastn';
    :type packet: str;
    :param blast_algorithm: blastn algorithm to use;
    :type blast_algorithm: str;
    :param use_index: logic value inddicating whether to use index;
    :type use_index: bool:
    :param queries_tmp_dir: path to directory with query files;
    :type queries_tmp_dir: str:
    :param db_path: path to database;
    :type db_path: str:
    """

    # PID of current process won't change, so we can use it to mark query files.
    # Packets are too large to pass to 'subprocess.Popen' as stdin,
    #    therefore we need to use these query files.
    query_path = os.path.join(queries_tmp_dir,
                              "query{}_tmp.fasta".format(os.getpid()))

    with open(query_path, 'w') as query_file:
        query_file.write(packet)
    # end with

    # Configure command line
    blast_cmd = "blastn -query {} -db {} -outfmt 5 -task {} -max_target_seqs 10 -max_hsps 1 -use_index {}"\
        .format(query_path, db_path, blast_algorithm, use_index)

    pipe = sp.Popen(blast_cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
    stdout_stderr = pipe.communicate()

    if pipe.returncode != 0:
        printlog_error_time(
            "Error occured while aligning a sequence against local database")
        printlog_error(stdout_stderr[1].decode("utf-8"))
        platf_depend_exit(pipe.returncode)
    # end if

    return stdout_stderr[0].decode("utf-8")
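# A standalone peek at the command line the function assembles (sketch; the
# query file, database path and algorithm below are hypothetical):
blast_cmd = "blastn -query {} -db {} -outfmt 5 -task {} -max_target_seqs 10 -max_hsps 1 -use_index {}"\
    .format("query123_tmp.fasta", "local_database/local_seq_set", "megablast", "true")
print(blast_cmd)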
Example #13
0
def copy_single_f5(from_f5, read_name, to_f5):
    # Function copies a read with ID 'read_name'
    #     from 'from_f5' singleFAST5 file to 'to_f5' multiFAST5 one.
    #
    # :param from_f5: FAST5 file object to copy a read from;
    # :type from_f5: h5py.File;
    # :param read_name: ID of a read to copy;
    # :type read_name: str;
    # :param to_f5: destination FAST5 file;
    # :type to_f5: h5py.File;

    # Handle no_trash
    if to_f5 is None:
        return
    # end if

    try:
        read_group = read_name
        to_f5.create_group(read_group) # create group in destination multi_FAST5 file

        # Copy "UniqueGlobalKey" to root of recently created group
        for ugk_subgr in from_f5["UniqueGlobalKey"]:
            from_f5.copy("UniqueGlobalKey/"+ugk_subgr, to_f5[read_group])
        # end for

        # Get data array in single-FAST5 file
        read_number_group = "Raw/Reads/"+next(iter(from_f5["Raw"]["Reads"]))
        # Its name in multi-FAST5 file
        read_number = re.search(r"(Read_[0-9]+)", read_number_group).group(1)

        # Copy group to multi-FAST5 file
        from_f5.copy(from_f5[read_number_group], to_f5[read_group])
        # Move data array to "Raw" group, as it is in multi-FAST5 files
        to_f5.move("{}/{}".format(read_group, read_number), "{}/Raw".format(read_group))

        # Copy everything else to recently created group
        for group in from_f5:
            if group != "Raw" and group != "UniqueGlobalKey":
                from_f5.copy(group, to_f5["/{}".format(read_group)])
            # end if
        # end for
    except ValueError as err:
        printlog_error_time("Error: `{}`".format( str(err) ))
        printlog_error("Reason is probably the following:")
        printlog_error("  read that is copying to the result file is already in this file.")
        printlog_error("ID of the read: `{}`".format(read_name))
        printlog_error("File: `{}`".format(to_f5.filename))
        return
Example #14
0
def verify_taxids(taxid_list):
    # Function verifies TaxIDs passed to prober with `-g` option.
    # Function requests NCBI Taxonomy Browser and parses organism's name from HTML response.
    # What is more, this function configures `organisms` list - it will be included into BLAST submissions.
    #
    # :param taxid_list: list of TaxIDs. TaxIDs are strings, but they are verified to be integers
    #     during CL argument parsing;
    # :type taxid_list: list<str>;
    #
    # Returns list of strings of the following format: "<tax_name> (taxid:<TaxID>)"

    organisms = list()
    if len(taxid_list) > 0:

        printlog_info("Verifying TaxIDs:")
        for taxid in taxid_list:
            printn("   {} - ".format(taxid))
            try:
                tax_resp = lingering_https_get_request(
                    "www.ncbi.nlm.nih.gov",
                    "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}".format(
                        taxid), "taxonomy")
                tax_name = re.search(r"Taxonomy browser \((.+?)\)",
                                     tax_resp).group(1)
            except AttributeError:
                printlog_error("\aError: TaxID not found")
                printlog_error(
                    "Please, check your TaxID: https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi"
                )
                platf_depend_exit(1)
            except OSError as oserr:
                printlog_error("Something is wrong with connection:")
                printlog_error(str(oserr))
                platf_depend_exit(-2)
            else:
                print(tax_name)
                log_info("{} - {}".format(taxid, tax_name))
                organisms.append("{} (taxid:{})".format(tax_name, taxid))
            # end try
        # end for
        print('-' * 30 + '\n')

    # end if
    return organisms
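# Standalone check of the title-parsing regex used above, on a snippet shaped
# like an NCBI Taxonomy page title (the HTML here is fabricated for the demo):
import re

tax_resp = "<title>Taxonomy browser (Escherichia coli)</title>"
tax_name = re.search(r"Taxonomy browser \((.+?)\)", tax_resp).group(1)
print(tax_name)  # Escherichia coli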
Example #15
0
def copy_read_f5_2_f5(from_f5, read_name, to_f5):
    # Function copies a read with ID 'read_name'
    #     from 'from_f5' multiFAST5 file to to_f5 multiFAST5 one.
    #
    # :param from_f5: FAST5 file object to copy a read from;
    # :type from_f5: h5py.File;
    # :param read_name: ID of a read to copy;
    # :type read_name: str;
    # :param to_f5: destination FAST5 file;
    # :type to_f5: h5py.File;

    if to_f5 is not None: # handle no_trash
        try:
            from_f5.copy(read_name, to_f5)
        except ValueError as err:
            printlog_error_time("Error: `{}`".format( str(err) ))
            printlog_error("Reason is probably the following:")
            printlog_error("  read that is copying to the result file is already in this file.")
            printlog_error("ID of the read: `{}`".format(read_name))
            printlog_error("File: `{}`".format(to_f5.filename))
            return
Example #16
0
def get_primers_seqs(primers_fpath):
    # Function for obtaining primer sequence(s).
    # If primers_fpath is None, it returns default primers:
    #   Illumina 16S V3-V4 primers.
    # Otherwise it parses primers from provided fasta file.

    # Use Illumina V3-V4 primers by default
    if primers_fpath is None:
        primers = ('CCTACGGGNGGCWGCAG', 'GACTACHVGGGTATCTAATCC')
    else:
        primers = list()

        # Get lines
        try:
            with open(primers_fpath, 'r') as primers_file:
                lines = primers_file.readlines()
            # end with
        except OSError as oserror:
            printlog_error('Error while reading file of primers: {}'\
                .format(oserror))
            platf_depend_exit(1)
        # end try

        # Remove blank lines (readlines() keeps newline characters)
        lines = list(filter(lambda x: x.strip() != '', lines))

        # There must be 1 or 2 primers in primers file.
        if len(lines) not in (2, 4):
            printlog_error('Error: invalid format of primers file. \
It should be a single-primer (2 lines in total) or "double" (4 lines in total) fasta file. \
But there are {} lines in your file.'.format(len(lines)))
            platf_depend_exit(1)
        # end if

        bases = 'AGCTUTYSWKMBDHVN'

        # Validate sequence(s).
        for i in range(1, len(lines), 2):
            seq = lines[i].strip().upper()
            # fullmatch: every character must be a permitted base
            if re.fullmatch(r'[{}]+'.format(bases), seq) is None:
                printlog_error('Error: invalid character in primer sequence.\
Here is invalid primer sequence: `{}`. Permitted characters: `{}`'\
                    .format(seq, bases))
                platf_depend_exit(1)
            # end if
            primers.append(seq)
        # end for
    # end if

    return primers
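# A quick standalone check of the IUPAC validation above (sketch):
import re

bases = 'AGCTUTYSWKMBDHVN'
for seq in ('CCTACGGGNGGCWGCAG', 'CCTACGGGXGGCWGCAG'):
    ok = re.fullmatch(r'[{}]+'.format(bases), seq) is not None
    print(seq, '->', 'valid' if ok else 'invalid character present')
# end for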
Example #17
0
def lingering_https_get_request(server, url, request_for=None, acc=None):
    # Function performs a "lingering" HTTPS request.
    # It means that the function tries to get the response
    #     again and again if the request fails.
    #
    # :param server: server address;
    # :type server: str;
    # :param url: the rest of url;
    # :type url: str;
    # :param request_for: some comment for error message;
    # :type request_for: str;
    # :param acc: GenBank accession;
    # :type acc: str;
    #
    # Returns obtained response coded in UTF-8 ('str').

    error = True

    # We can get a spurious 404 or similar due to unstable NCBI servers.
    # Let's give it 3 attempts (with 15 sec spans in between),
    #   and if all of them are unsuccessful -- terminate execution.
    attempt_i = 0
    max_attempts = 3

    while error:
        try:
            conn = http.client.HTTPSConnection(server,
                                               timeout=30)  # create connection
            conn.request("GET", url)  # ask for if there areresults
            response = conn.getresponse()  # get the resonse

            if response.code != 200:
                if attempt_i < max_attempts and "ncbi.nlm.nih.gov" in server:
                    printlog_error("Error {}: {}.".format(
                        response.code, response.reason))
                    printlog_error(
                        "It may be due to instable work of NCBI servers.")
                    printlog_error("{} attempts to connect left, waiting 15 sec..."\
                        .format(max_attempts - attempt_i))
                    attempt_i += 1
                else:
                    printlog_error("Cannot find {} for {}.".format(
                        request_for, acc))
                    printlog_error("Request failed with status code {}: {}"\
                        .format(response.code, response.reason))
                    platf_depend_exit(1)
                # end if
            # end if

            resp_content = str(response.read(), "utf-8")  # get response text
        except (OSError,\
                http.client.RemoteDisconnected,\
                socket.gaierror,\
                http.client.CannotSendRequest) as err:
            comment_str = ""
            if request_for is not None:
                comment_str += " requesting for {}".format(request_for)
                if acc is not None:
                    comment_str += " (accession: `{}`)".format(acc)
                # end if
                comment_str += '.'
            # end if
            print()
            printlog_info("Can't connect to `{}`{}".format(
                server + url, comment_str))
            printlog_info(str(err))
            printlog_info(
                """the program will sleep for 30 seconds and try to connect again."""
            )
            sleep(30)
        else:
            error = False  # if no exception occurred, get out of the loop
        finally:
            conn.close()
        # end try
    # end while
    return resp_content
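# The retry pattern above, distilled into a generic sketch (an assumption:
# any callable that raises OSError on a transient failure can be wrapped):
import time

def lingering_call(func, max_attempts=3, delay=15):
    # Re-invoke `func` until it succeeds or attempts are exhausted.
    for attempt in range(1, max_attempts + 1):
        try:
            return func()
        except OSError as err:
            if attempt == max_attempts:
                raise
            # end if
            print("Attempt {} failed ({}); retrying in {} sec...".format(
                attempt, err, delay))
            time.sleep(delay)
        # end try
    # end for
# end def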
Example #18
0
def _reformat_legacy_file(legacy_tax_path):

    import shelve

    # Check if this file is corrupted
    try:
        with shelve.open(legacy_tax_path, 'r') as tax_file:
            pass
        # end with
    except OSError as err:
        printlog_error("Legacy taxonomy file appears to be corrupted.")
        printlog_error("This error might be fatal.")
        str_err = str(err)
        if "dbm.gnu" in str_err and "module is not" in str_err:
            printlog_error("Installing `python3-gdbm` might solve this problem.")
        else:
            printlog_error("The program can't recover taxonomy from the broken file.")
            printlog_error("Seems, you have to annotate your sequences again.")
            printlog_error("Sorry for that :(")
        # end if
        platf_depend_exit(1)
    # end try

    new_tax_path = "{}.tsv".format(legacy_tax_path)

    taxonomy.init_tax_file(new_tax_path)

    printn("Reformatting: `{}` ->".format(legacy_tax_path))
    log_info("Reformatting: `{}` ->".format(legacy_tax_path))

    with shelve.open(legacy_tax_path, 'r') as old_tax_file, open(new_tax_path, 'w') as new_tax_file:
        for acc, taxonomy_from_file in old_tax_file.items():
            if isinstance(taxonomy_from_file, tuple):
                tax_str = taxonomy.config_taxonomy_str(taxonomy_from_file)
                new_tax_file.write("{}\n".format('\t'.join( (acc, tax_str) )))
            elif isinstance(taxonomy_from_file, str):
                new_tax_file.write("{}\n".format('\t'.join( (acc, taxonomy_from_file) )))
            else:
                # Execution must not reach here
                printlog_error_time("Fatal error 8755.")
                printlog_error("Please, contact the developer.")
                platf_depend_exit(8755)
            # end if
        # end for
    # end with

    printlog_info(" `<same_dir>/{}`".format(os.path.basename(new_tax_path)))

    try:
        renamed_legacy_file = "{}_deprecated".format(legacy_tax_path)
        os.rename(legacy_tax_path, renamed_legacy_file)
    except OSError as err:
        printlog_error_time("Cannot rename legacy taxonomy file `{}`:".format(legacy_tax_path))
        printlog_error(str(err))
        printlog_error("But it's not a problem -- we will proceed with our work.")
    else:
        printlog_info("Renamed: `{}` -> `<same_dir>/{}`".format(legacy_tax_path,
            os.path.basename(renamed_legacy_file)))
    # end try

    printlog_info("Legacy taxonomy file is reformatted to TSV format.")
Example #19
0
def look_around(outdir_path, new_dpath, infile_path, blast_algorithm, acc_dict,
                probing_batch_size):
    # Function looks around in order to check if there are results from previous run(s) of this script
    #   in order to resume the previous run.
    #
    # Returns None if there is no result from previous run.
    # If there are results from previous run, returns a dict of the following structure:
    # {
    #     "RID": saved_RID <str>,
    #     "packet_size_save": saved packet size <int>,
    #     "packet_size_mode": saved packet mode <int>,
    #     "tsv_respath": path_to_tsv_file_from_previous_run <str>,
    #     "n_done_reads": number_of_successfull_requests_from_currenrt_FASTA_file <int>,
    #     "tmp_fpath": path_to_pemporary_file <str>,
    #     "decr_pb": valuse decreasing size of probing batch (see below, where this variable is defined) <int>
    # }
    #
    # :param outdir_path: path to output directory;
    # :type outdir_path: str;
    # :param new_dpath: path to current (corresponding to fq_fa_path file) result directory;
    # :type new_dpath: str;
    # :param infile_path: path to current (corresponding to fq_fa_path file) FASTA file;
    # :type infile_path: str;
    # :param blast_algorithm: BLASTn algorithm to use.
    #     This parameter is necessary because it is included in name of result files;
    # :param acc_dict: dictionary of accession info of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;
    # :param probing_batch_size: amount of sequences meant to be processed in a single run;
    # :type probing_batch_size: str;
    # :type blast_algorithm: str;

    # "hname" means human readable name (i.e. without file path and extention)
    fasta_hname = os.path.basename(infile_path)  # get rid of absolute path
    fasta_hname = re.search(r"(.*)\.(m)?f(ast)?a", fasta_hname).group(
        1)  # get rid of `.fasta` extension

    # Form path to temporary file
    tmp_fpath = "{}_{}_temp.txt".format(os.path.join(new_dpath, fasta_hname),
                                        blast_algorithm)
    # Form path to result file
    tsv_res_fpath = os.path.join(new_dpath, "classification.tsv")
    # Form path to file with hits to download
    acc_fpath = os.path.join(outdir_path, "hits_to_download.tsv")

    num_done_seqs = 0  # variable to keep number of successfully processed sequences

    resume = None
    # Check if there are results from previous run.
    if os.path.exists(tsv_res_fpath) or os.path.exists(tmp_fpath):
        print()
        printlog_info(
            "A result file from previous run is found in the directory:")
        printlog_info("   `{}`".format(new_dpath))
        # Politely offer to continue from the last successfully sent packet.
        resume = ask_for_resumption()
    # end if

    if not resume:
        rename_file_verbosely(tsv_res_fpath)
        rename_file_verbosely(tmp_fpath)
        rename_file_verbosely(acc_fpath)
    else:
        printlog_info("Let's try to resume...")

        # Collect information from result file
        if os.path.exists(tsv_res_fpath):
            # There can be invalid information in this file
            try:
                with open(tsv_res_fpath, 'r') as res_file:
                    lines = res_file.readlines()
                    num_done_seqs = len(lines) - 1  # the first line is a header
                    last_line = lines[-1]
                    last_seq_id = last_line.split('\t')[0]
                # end with
                # There must be 10 columns in each row:
                if any(map(lambda l: l.count('\t') != 9, lines)):
                    raise ValueError(
                        "There must be 10 colums separated by tabs in file `classification.tsv`"
                    )
                # end if

            except Exception as err:
                printlog_error_time(
                    "\nData in classification file `{}` not found or broken. Reason:"
                    .format(tsv_res_fpath))
                printlog_error(' ' + str(err))

                # If the reason is known -- print erroneous lines
                if isinstance(err, ValueError):
                    printlog_error("Here are numbers of improper lines:")
                    for i, line in enumerate(lines):
                        if line.count('\t') != 9:
                            printlog_error(str(i + 1) + ": `{}`".format(line))
                        # end if
                    # end for
                # end if

                # Ask a user if he/she wants to start from the beginning or to quit
                error = True
                while error:
                    reply = input("""Press ENTER to start from the beginning
  or enter `q` to quit:>> """)
                    if reply == "":
                        error = False
                        printlog_info(
                            "You have chosen to start from the beginning.\n")
                        rename_file_verbosely(tsv_res_fpath)
                        rename_file_verbosely(tmp_fpath)
                        rename_file_verbosely(acc_fpath)
                        return None
                    elif reply == 'q':
                        platf_depend_exit(0)
                    else:
                        print("! - Invalid reply: `{}`\n".format(reply))
                    # end if
                # end while
            else:
                printlog_info("Last classified sequence: " + last_seq_id)
                printlog_info(
                    "{} sequences have been already processed".format(
                        num_done_seqs))
            # end try
        # end if

        # Collect information from accession file
        if os.path.exists(acc_fpath):

            # There can be invalid information in this file
            try:
                with open(acc_fpath, 'r') as acc_file:
                    lines = acc_file.readlines()[
                        9:]  # omit description and head of the table
                    local_files_filtered = list(
                        filter(lambda x: not os.path.exists(x),
                               lines))  # omit file paths
                    for line in local_files_filtered:
                        vals = line.split('\t')
                        acc = sys.intern(vals[0].strip())
                        if len(vals) == 1:
                            acc_dict[acc] = [
                                "No definition of the sequence provided", 1
                            ]
                        elif len(vals) == 2:
                            acc_dict[acc] = [vals[1].strip(), 1]
                        else:
                            acc_dict[acc] = [
                                vals[1].strip(),
                                int(vals[2].strip())
                            ]
                        # end if
                    # end for
                # end with

            except Exception as err:
                printlog_error_time(
                    "Data in accession file `{}` not found or broken. Reason:".
                    format(acc_fpath))
                printlog_error(' ' + str(err))
                printlog_error("Invalid line: `{}`".format(line))

                # Ask a user if he/she wants to start from the beginning or to quit
                error = True
                while error:
                    reply = input("""Press ENTER to start from the beginning
  or enter `q` to quit:>> """)
                    if reply == "":
                        error = False
                        printlog_info(
                            "You have chosen to start from the beginning.\n")
                        rename_file_verbosely(tsv_res_fpath)
                        rename_file_verbosely(tmp_fpath)
                        rename_file_verbosely(acc_fpath)
                        return None
                    elif reply == 'q':
                        platf_depend_exit(0)
                    else:
                        print("! - Invalid reply: `{}`\n".format(reply))
                    # end if
                # end while
            else:
                print()
                printlog_info(
                    "Here are Genbank records encountered during previous run(s):"
                )
                for acc, other_info in sorted(acc_dict.items(),
                                              key=lambda x: -x[1][1]):
                    s_letter = "s" if other_info[1] > 1 else ""
                    printlog_info(" {} hit{} - {}, `{}`".format(
                        other_info[1], s_letter, acc, other_info[0]))
                # end for
                print('-' * 20)
            # end try
        # end if

        # Get packet size, number of the last sent packet and RID from temp file.
        # There can be invalid information in the tmp file, or the tmp file may not exist
        try:

            with open(tmp_fpath, 'r') as tmp_file:
                temp_lines = tmp_file.readlines()
            # end with

            RID_save = re.search(r"Request_ID: (.+)",
                                 temp_lines[0]).group(1).strip()
            packet_size_save = int(
                re.search(r"Packet_size: ([0-9]*)",
                          temp_lines[1]).group(1).strip())
            packet_mode_save = int(
                re.search(r"Packet_mode: ([0-9]{1})",
                          temp_lines[2]).group(1).strip())

        except (AttributeError, OSError):

            # There is no need to disturb a user, merely proceed.
            return {
                "RID": None,
                "packet_size_save": None,
                "packet_mode_save": None,
                "tsv_respath": tsv_res_fpath,
                "n_done_reads": num_done_seqs,
                "tmp_fpath": tmp_fpath,
                "decr_pb": 0
            }
        else:
            # Let's assume that a user won't modify his/her probing_batch size between erroneous runs:
            #   subtract num_done_reads if probing_batch_size > num_done_reads.
            decr_pb = num_done_seqs if num_done_seqs < probing_batch_size else 0
            # Return data from previous run
            return {
                "RID": RID_save,
                "packet_size_save": packet_size_save,
                "packet_mode_save": packet_mode_save,
                "tsv_respath": tsv_res_fpath,
                "n_done_reads": num_done_seqs,
                "tmp_fpath": tmp_fpath,
                "decr_pb": decr_pb
            }
        # end try
    # end if

    return None
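# Standalone check of the temp-file parsing above, on fabricated contents in
# the format the code expects:
import re

temp_lines = ["Request_ID: ABC123XYZ\n", "Packet_size: 100\n", "Packet_mode: 1\n"]
RID_save = re.search(r"Request_ID: (.+)", temp_lines[0]).group(1).strip()
packet_size_save = int(re.search(r"Packet_size: ([0-9]*)", temp_lines[1]).group(1))
packet_mode_save = int(re.search(r"Packet_mode: ([0-9]{1})", temp_lines[2]).group(1))
print(RID_save, packet_size_save, packet_mode_save)  # ABC123XYZ 100 1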
Example #20
0
def download_taxonomy(hit_acc, hit_def, taxonomy_path):
    # Function retrieves taxonomy of a hit from NCBI.
    # Moreover, it saves this taxonomy in file `taxonomy.tsv`:
    #     <accession>\t<taxonomy_str>
    #
    # :param hit_acc: hit accession;
    # :type hit_acc: str;
    # :param hit_def: definition of reference record;
    # :type hit_def: str;
    # :param taxonomy_path: path to TSV file with taxonomy;
    # :type taxonomy_path: str;

    # Get TaxID of the organism from GenBank summary:
    gb_summary = lingering_https_get_request("www.ncbi.nlm.nih.gov",
                                             "/nuccore/{}".format(hit_acc),
                                             "GenBank summary", hit_acc)

    try:
        taxid = re.search(r"ORGANISM=([0-9]+)", gb_summary).group(1)
    except AttributeError:
        printlog_error_time(
            "Error: taxonomy parsing error 115-{}".format(hit_acc))
        printlog_error("Please, contact the developer.")
        platf_depend_exit(115)
    # end try

    # Get taxonomy page of the organism
    taxonomy_url = "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}&lvl=3&lin=f&keep=1&srchmode=1&unlock".format(
        taxid)
    taxonomy_text = lingering_https_get_request("www.ncbi.nlm.nih.gov",
                                                taxonomy_url, "taxonomy",
                                                hit_acc)

    # This pattern will match taxonomic names along with their ranks
    tax_rank_pattern = r"TITLE=\"([a-z ]+)\"\>([A-Z].+?)\</a\>"

    # Get all taxonomic names of the organism
    taxonomy = re.findall(tax_rank_pattern, taxonomy_text)

    # We will convert ranks to lowercase just in case.
    # Firstly convert tuples to lists in order to change them:
    taxonomy = list(map(lambda x: list(x), taxonomy))

    # Lowercase each rank name:
    for i in range(len(taxonomy)):
        taxonomy[i][0] = taxonomy[i][0].lower()
    # end for

    # We will leave only following taxonomic ranks: domain, phylum, class, order, family, genus.
    # Species name requires special handling, it will be added later.
    ranks_to_select = ranks[:-1]

    # Remove redundant ranks:
    taxonomy = filter(lambda x: x[0].lower() in ranks_to_select, taxonomy)

    # Convert back to tuples:
    taxonomy = list(map(lambda x: tuple(x), taxonomy))

    # E.g., this record has no appropriate ranks: CP034535
    # Merely save its definition and return
    if len(taxonomy) == 0:
        # Save taxonomy
        _tax_accs.append(hit_acc)
        with open(taxonomy_path, 'a') as tax_file:
            tax_file.write("{}\n".format('\t'.join((hit_acc, hit_def))))
        # end with
        return
    # end if

    # Check if species name is specified like other ranks:
    check_direct_species_patt = r"TITLE=\"(species)\"\>([A-Za-z0-9 \.]+)\</a\>"
    match_direct_species = re.search(check_direct_species_patt, taxonomy_text)

    if not match_direct_species is None:
        # If species name is specified like other ranks, merely add it to list:
        taxonomy.append((match_direct_species.group(1),
                         match_direct_species.group(2).partition(" ")[2]))
    else:
        # Otherwise we need to parse species name from title
        title = re.search(r"\<title\>Taxonomy browser \((.+)\)\</title\>",
                          taxonomy_text).group(1)

        # Get words
        title = title.split(' ')

        # We will take all this words as species name.
        # Viruses also often have unpredictable names.
        #   Example: MN908947
        try:
            if title[1] in second_words_not_species or taxonomy[0][1].lower(
            ) == "viruses":
                taxonomy.append(("species", '_'.join(title[1:])))
            else:
                taxonomy.append(("species", title[1]))
            # end if
        except IndexError:
            # Handle absence of species name, e.g., this: AC150248.3
            # Well, nothing to append in this case!
            pass
        # end try
    # end if

    # Fill in missing ranks with empty strings
    for i in range(len(ranks)):
        if len(taxonomy) < i + 1:  # for this (missing in the end): AC150248
            taxonomy.append((ranks[i], ""))
        elif taxonomy[i][0] != ranks[
                i]:  # for this (missing in the middle): MN908947
            taxonomy.insert(i, (ranks[i], ""))
        # end if
    # end for

    # It will be a bit faster
    taxonomy = tuple(taxonomy)

    # Save taxonomy
    _tax_accs.append(hit_acc)
    with open(taxonomy_path, 'a') as tax_file:
        tax_file.write("{}\n".format('\t'.join(
            (hit_acc, config_taxonomy_str(taxonomy)))))
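# Standalone check of `tax_rank_pattern` on a fabricated fragment shaped like
# the NCBI taxonomy page markup the code expects:
import re

sample = 'TITLE="genus">Escherichia</a> ... TITLE="species">Escherichia coli</a>'
print(re.findall(r"TITLE=\"([a-z ]+)\"\>([A-Z].+?)\</a\>", sample))
# [('genus', 'Escherichia'), ('species', 'Escherichia coli')]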
Example #21
0
def map_f5reads_2_taxann(f5_path, tsv_taxann_lst, tax_annot_res_dir):
    # Function performs mapping of all reads stored in input FAST5 files
    #     to existing TSV files containing taxonomic annotation info.
    #
    # It creates a DBM index file.
    #
    # :param f5_path: path to current FAST5 file;
    # :type f5_path: str;
    # :param tsv_taxann_lst: list of paths to TSV files that contain taxonomic annotation;
    # :type tsv_taxann_lst: list<str>;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;

    index_dirpath = os.path.join(
        tax_annot_res_dir,
        index_name)  # name of directory that will contain indices

    # File validation:
    #   RuntimeError will be raised if FAST5 file is broken.
    try:
        # File existence checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if

        f5_file = h5py.File(f5_path, 'r')

        for _ in f5_file:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(
            os.path.basename(f5_path)))
        printlog_error("Reason: {}".format(str(runterr)))
        printlog_error("Omitting this file...")
        print()
        return
    # end try

    readids_to_seek = list(fast5_readids(f5_file))
    idx_dict = dict()  # dictionary for index

    # This saving is needed to compare with 'len(readids_to_seek)'
    #    after all TSVs have been looked through, in order to
    #    determine if some reads miss taxonomic annotation.
    len_before = len(readids_to_seek)

    # Iterate over TSV taxonomic-annotation files
    for tsv_taxann_fpath in tsv_taxann_lst:

        with open(tsv_taxann_fpath, 'r') as taxann_file:

            # Get all read IDs in current TSV
            readids_in_tsv = list(
                map(lambda l: l.split('\t')[0], taxann_file.readlines()))

            # Iterate over all other reads in current FAST5
            #    ('reversed' is necessary because we remove items from list in this loop)
            for readid in reversed(readids_to_seek):
                fmt_id = fmt_read_id(readid)[1:]
                if fmt_id in readids_in_tsv:
                    # If found -- write data to dict (and to index later)
                    try:
                        idx_dict[tsv_taxann_fpath].append(
                            "read_" + fmt_id)  # append to existing list
                    except KeyError:
                        idx_dict[tsv_taxann_fpath] = ["read_" + fmt_id
                                                      ]  # create a new list
                    finally:
                        readids_to_seek.remove(readid)
                    # end try
                # end if
            # end for
        # end with
        if len(readids_to_seek) == 0:
            break
        # end if
    # end for

    # If, after all TSVs are checked, nothing has changed -- we miss taxonomic annotation
    #     for some reads! We will write their IDs to 'missing_reads_lst.txt' file.
    if len(readids_to_seek) == len_before:
        printlog_error_time("reads from FAST5 file not found")
        printlog_error("FAST5 file: `{}`".format(f5_path))
        printlog_error("Some reads have not undergone taxonomic annotation.")
        missing_log = "missing_reads_lst.txt"
        printlog_error("List of missing reads are in following file:")
        printlog_error("{}".format(missing_log))
        with open(missing_log, 'w') as missing_logfile:
            missing_logfile.write(
                "Missing reads from file '{}':\n\n".format(f5_path))
            for readid in readids_to_seek:
                missing_logfile.write(fmt_read_id(readid) + '\n')
            # end for
        try:
            for path in glob(os.path.join(index_dirpath, '*')):
                os.unlink(path)
            # end for
            os.rmdir(index_dirpath)
        except OSError as oserr:
            printlog_error(
                "Error occured while removing index directory: {}".format(
                    oserr))
        finally:
            platf_depend_exit(3)
        # end try
    # end if

    try:
        # Open index files appending to existing data ('c' parameter)
        with open_shelve(os.path.join(index_dirpath, index_name),
                         'c') as index_f5_2_tsv:
            # Update index
            index_f5_2_tsv[f5_path] = idx_dict
        # end with
    except OSError as oserr:
        printlog_error_time("Error: cannot create index file `{}`"\
            .format(os.path.join(index_dirpath, index_name)))
        printlog_error(str(oserr))
        platf_depend_exit(1)
Example #22
0
def configure_acc_dict(acc_fpath, your_own_fasta_lst, accs_to_download):
    # Function configures accession dictionary according to accession file generated by 'barapost-prober.py':
    #    keys are accessions, values are tuples of the following format:
    #     (<sequence_name_aka_definition>).
    #
    # :param acc_fpath: path to accession file generated by 'barapost-prober.py';
    # :type acc_fpath: str;
    # :param your_own_fasta_lst: list of paths to user's fasta files;
    # :type your_own_fasta_lst: list<str>;
    #
    # Returns accession dictionary described above.

    acc_dict = dict()

    # if database will be created only from 'your own' FASTA files -- return empty dict
    if not acc_fpath is None:

        with open(acc_fpath, 'r') as acc_file:
            lines = acc_file.readlines()

            for line_idx, line in enumerate(lines):
                line = line.strip()
                # Ignore empty lines, commented lines and head of the table:
                if line != "" and not line.startswith(
                        '#') and not line.startswith("ACCESSION"):

                    line_splt = line.split('\t')
                    acc = sys.intern(line_splt[0].partition('.')[0])

                    if not re.match(GB_ACC_PATTERN, acc) is None:
                        # If we encounter GenBank accession number
                        try:
                            if len(line_splt) == 1:  # just accession
                                name = "No definition of the sequence provided"
                            else:
                                name = line_splt[1]
                            # end if
                            acc_dict[acc] = name
                        except IndexError as err:
                            printlog_error_time(
                                "Error: invalid data in file `{}`!".format(
                                    acc_fpath))
                            printlog_error(
                                "Here is that invalid line:\n  `{}`".format(
                                    line))
                            printlog_error(str(err))
                            platf_depend_exit(1)
                        # end try
                    else:
                        # If it's not a GenBank accession number,
                        #   it is probably a path to a reference file.
                        if os.path.exists(line):
                            your_own_fasta_lst.append(line)
                        else:
                            printlog_error_time(
                                "Error in file `{}`.".format(acc_fpath))
                            printlog_error("Line #{} looks like path to reference file, but this file does not exist."\
                                .format(line_idx+1))
                            printlog_error(
                                "Here is this invalid line:\n  `{}`".format(
                                    line))
                            platf_depend_exit(1)
                        # end if
                    # end if
                # end if
            # end for
        # end with
    # end if

    if len(your_own_fasta_lst) == 0 and len(acc_dict) == 0 and len(
            accs_to_download) == 0:
        printlog_error_time(
            "Error: no accession information found in file `{}`".format(
                acc_fpath))
        platf_depend_exit(1)
    # end if

    return acc_dict
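# `GB_ACC_PATTERN` is referenced above but not defined in this example.
# A plausible sketch (an assumption -- the real pattern may differ):
# GenBank nucleotide accessions are 1-2 letters + 5-6 digits, or longer
# letter prefixes + more digits for WGS records.
import re

GB_ACC_PATTERN_SKETCH = r"[A-Z]{1,2}[0-9]{5,6}|[A-Z]{4,6}[0-9]{8,10}"
for acc in ("CP034535", "MN908947", "not_an_accession"):
    print(acc, "->", re.fullmatch(GB_ACC_PATTERN_SKETCH, acc) is not None)
# end for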
Example #23
0
def bin_fastqa_file(fq_fa_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                    min_pident, min_coverage, no_trash):
    # Function for single-thread binning FASTQ and FASTA files.
    #
    # :param fq_fa_path: path to FASTQ (or FASTA) file meant to be processed;
    # :type fq_fa_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    srt_file_dict = dict()  # dict containing file objects of existing output files

    new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
    tsv_res_fpath = get_res_tsv_fpath(new_dpath)
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
    resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

    # Configure generator, write function and path to trash file
    if is_fastq(fq_fa_path):
        seq_records_generator = fastq_records
        write_fun = write_fastq_record
    else:
        seq_records_generator = fasta_records
        write_fun = write_fasta_record
    # end if

    # Make filter for quality and length
    QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
    # Configure path to trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(
            fq_fa_path,
            outdir_path,
            min_qual,
            min_qlen,
        )
    else:
        QL_trash_fpath = None
    # end if

    # Make filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure path to this trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                  min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    for fastq_rec in seq_records_generator(fq_fa_path):

        read_name = sys.intern(fmt_read_id(
            fastq_rec["seq_id"])[1:])  # get ID of the sequence

        try:
            hit_names, *vals_to_filter = resfile_lines[
                read_name]  # find hit corresponding to this sequence
        except KeyError:
            printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                .format(read_name))
            printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
            printlog_error("Make sure that this read has been already \
                processed by `barapost-prober.py` and `barapost-local.py`.")
            platf_depend_exit(1)
        # end try

        # Apply filters
        if not QL_filter(vals_to_filter):
            QL_seqs_fail += 1
            # Place this sequence to QL trash file
            if QL_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath)
            # end if
            write_fun(srt_file_dict[QL_trash_fpath],
                      fastq_rec)  # write current read to binned file

        elif not align_filter(vals_to_filter):
            align_seqs_fail += 1
            # Place this sequence to align_trash file
            if align_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict,
                                                 align_trash_fpath)
            # end if
            write_fun(srt_file_dict[align_trash_fpath],
                      fastq_rec)  # write current read to binned file

        else:
            for hit_name in hit_names.split(
                    "&&"
            ):  # there can be multiple hits for single query sequence
                # Get name of result FASTQ file to write this read in
                binned_file_path = os.path.join(
                    outdir_path,
                    "{}.fast{}".format(hit_name,
                                       'q' if is_fastq(fq_fa_path) else 'a'))
                if binned_file_path not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     binned_file_path)
                # end if
                write_fun(srt_file_dict[binned_file_path],
                          fastq_rec)  # write current read to binned file
            # end for
            seqs_pass += 1
        # end if
    # end for

    # Close all binned files
    for file_obj in filter(lambda x: x is not None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(
        os.path.basename(fq_fa_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
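A minimal sketch of a call to this single-thread binner. The paths and thresholds are purely illustrative, and the function name `bin_fastqa_file` is assumed to match the parallel variant shown in Example #26 below:

# Hypothetical invocation; values are illustrative, not defaults.
seqs_pass, ql_fail, align_fail = bin_fastqa_file(
    "reads/sample.fastq.gz",  # fq_fa_path: file to bin
    "barapost_result",        # tax_annot_res_dir: prober/local output directory
    "genus",                  # sens: binning sensitivity
    10.0,                     # min_qual: quality threshold
    100,                      # min_qlen: length threshold (None disables it)
    90.0,                     # min_pident: identity threshold (None disables it)
    80.0,                     # min_coverage: coverage threshold (None disables it)
    False)                    # no_trash: False means trash files are written
print("pass: {}, QL fail: {}, align fail: {}".format(
    seqs_pass, ql_fail, align_fail))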
Example #24
printn("Primary validation...")
if not untwist_fast5:
    for fpath in fast5_list:
        # Count directories in 'tax_annot_res_dir' that may hold
        #    annotation results for the current FAST5 file.
        possible_fast5_resdirs_num = len(
            glob("{}{}*{}*".format(tax_annot_res_dir, os.sep,
                                   get_checkstr(fpath))))

        if possible_fast5_resdirs_num == 1:
            continue  # OK
        elif possible_fast5_resdirs_num == 0:  # there is no such directory
            print()
            printlog_error_time(
                "Error: classification for following FAST5 file is missing:")
            printlog_error("  `{}`".format(fpath))
            printlog_error(
                "Try running barapost-binning with `-u` (`--untwist-fast5`) flag."
            )
            print()
            platf_depend_exit(5)
        else:  # there are multiple directories where prober-barapost results can be located
            printlog_error_time(
                "Error: multiple result directories match FAST5 file meant to be binned"
            )
            printlog_error("File: `{}`".format(os.path.basename(fpath)))
            printlog_error("Directories:")
            for d in glob("{}{}*{}*".format(tax_annot_res_dir, os.sep,
                                            get_checkstr(fpath))):
                printlog_error(d)
            # end for
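A standalone sketch of the counting probe above; `get_checkstr` is barapost's helper that yields a substring identifying the FAST5 file's result directory, and the directory layout assumed here is hypothetical:

import os
from glob import glob

def count_candidate_resdirs(tax_annot_res_dir, checkstr):
    # Count directories whose names contain `checkstr`
    #   (a stand-in for get_checkstr(fpath)).
    pattern = "{}{}*{}*".format(tax_annot_res_dir, os.sep, checkstr)
    return len(glob(pattern))

# 1 -> OK; 0 -> classification missing; >1 -> ambiguous, rerun with `-u`.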
Example #25
def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                   min_pident, min_coverage, no_trash):
    # Function bins FAST5 file with untwisting.
    #
    # :param f5_path: path to FAST5 file meant to be processed;
    # :type f5_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;

    # Derive the output directory from the location of the log file
    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    srt_file_dict = dict()

    index_dirpath = os.path.join(
        tax_annot_res_dir,
        index_name)  # path to the directory containing indices

    # Make filter for quality and length
    QL_filter = get_QL_filter(f5_path, min_qual, min_qlen)
    # Configure path to trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(
            f5_path,
            outdir_path,
            min_qual,
            min_qlen,
        )
    else:
        QL_trash_fpath = None
    # end if

    # Make filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure path to this trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path,
                                                  min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    # File validation:
    #   RuntimeError will be raised if FAST5 file is broken.
    try:
        # File existence checking is performed while parsing CL arguments.
        # Therefore, this if-statement triggers only if the file at f5_path is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if

        from_f5 = h5py.File(f5_path, 'r')

        for _ in from_f5:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(
            os.path.basename(f5_path)))
        printlog_error("Reason: {}".format(str(runterr)))
        printlog_error("Omitting this file...")
        print()
        # Return zeroes -- inc_val won't be incremented and this file will be omitted
        return (0, 0, 0)
    # end try

    # Single-FAST5 and multi-FAST5 files should be processed in different ways.
    # The "Raw" group is always in the root of a single-FAST5 file and never in the root of a multi-FAST5 file.
    if "Raw" in from_f5.keys():
        f5_cpy_func = copy_single_f5
    else:
        f5_cpy_func = copy_read_f5_2_f5
    # end if

    readids_to_seek = list()  # list of not-binned-yet read IDs

    # Fill the list 'readids_to_seek'
    for read_name in fast5_readids(from_f5):
        readids_to_seek.append(sys.intern(read_name))
    # end for

    # Walk through the index
    index_f5_2_tsv = open_shelve(os.path.join(index_dirpath, index_name), 'r')

    if f5_path not in index_f5_2_tsv.keys():
        printlog_error_time(
            "Source FAST5 file `{}` not found in index".format(f5_path))
        printlog_error("Try rebuilding the index.")
        platf_depend_exit(1)
    # end if

    for tsv_path in index_f5_2_tsv[f5_path].keys():

        read_names = index_f5_2_tsv[f5_path][tsv_path]
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy",
                                     "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_path, sens, taxonomy_path)

        for read_name in read_names:
            try:
                hit_names, *vals_to_filter = resfile_lines[sys.intern(
                    fmt_read_id(read_name)[1:])]
            except KeyError:
                printlog_error_time("Error: missing taxonomic annotation info for read `{}`"\
                    .format(fmt_read_id(read_name)[1:]))
                printlog_error(
                    "It is stored in `{}` FAST5 file".format(f5_path))
                printlog_error(
                    "Try making a new index file (press ENTER at the corresponding prompt)."
                )
                printlog_error(
                    "Or, if that does not work for you, make sure that taxonomic annotation info \
for this read is present in one of the TSV files generated by `barapost-prober.py` and `barapost-local.py`."
                )
                index_f5_2_tsv.close()
                platf_depend_exit(1)
            # end try

            if not QL_filter(vals_to_filter):
                # Place this read into the QL trash file
                if QL_trash_fpath not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     QL_trash_fpath)
                # end if
                f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath])
                QL_seqs_fail += 1
            elif not align_filter(vals_to_filter):
                # Place this read into the align trash file
                if align_trash_fpath not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     align_trash_fpath)
                # end if
                f5_cpy_func(from_f5, read_name,
                            srt_file_dict[align_trash_fpath])
                align_seqs_fail += 1
            else:
                for hit_name in hit_names.split(
                        "&&"
                ):  # there can be multiple hits for single query sequence
                    # Get name of the result FAST5 file to write this read in
                    binned_file_path = os.path.join(
                        outdir_path, "{}.fast5".format(hit_name))
                    if binned_file_path not in srt_file_dict.keys():
                        srt_file_dict = update_file_dict(
                            srt_file_dict, binned_file_path)
                    # end if
                    f5_cpy_func(from_f5, read_name,
                                srt_file_dict[binned_file_path])
                # end for
                seqs_pass += 1
            # end if
        # end for
    # end for

    from_f5.close()
    index_f5_2_tsv.close()

    # Close all binned files
    for file_obj in filter(lambda x: x is not None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(
        os.path.basename(f5_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
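The "Raw"-group probe above is the whole single- vs multi-FAST5 dispatch; here is the same check as a self-contained sketch (the file path is hypothetical):

import h5py

def detect_fast5_flavour(f5_path):
    # Single-FAST5 files keep a "Raw" group at the root;
    #   multi-FAST5 files keep one "read_<id>" group per read instead.
    with h5py.File(f5_path, 'r') as f5:
        return "single" if "Raw" in f5.keys() else "multi"

# detect_fast5_flavour("sample.fast5")  # hypothetical path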
Example #26
def bin_fastqa_file(fq_fa_lst, tax_annot_res_dir, sens, n_thr, min_qual,
                    min_qlen, min_pident, min_coverage, no_trash):
    # Function for parallel binning FASTQ and FASTA files.
    # Actually bins multiple files.
    #
    # :param fq_fa_lst: list of paths to FASTQ (or FASTA) files meant to be processed;
    # :type fq_fa_lst: list<str>;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param n_thr: number of threads launched; used here as the size of the
    #     record batch written under `write_lock`;
    # :type n_thr: int;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;

    # Derive the output directory from the location of the log file
    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    for fq_fa_path in fq_fa_lst:

        new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
        tsv_res_fpath = get_res_tsv_fpath(new_dpath)
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy",
                                     "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_res_fpath, sens,
                                                taxonomy_path)

        # Configure records generator and write function
        if is_fastq(fq_fa_path):
            seq_records_generator = fastq_records
            write_fun = write_fastq_record
        else:
            seq_records_generator = fasta_records
            write_fun = write_fasta_record
        # end if

        # Make filter for quality and length
        QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
        # Configure path to trash file
        if not no_trash:
            QL_trash_fpath = get_QL_trash_fpath(
                fq_fa_path,
                outdir_path,
                min_qual,
                min_qlen,
            )
        else:
            QL_trash_fpath = None
        # end if

        # Make filter for identity and coverage
        align_filter = get_align_filter(min_pident, min_coverage)
        # Configure path to this trash file
        if not no_trash:
            align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                      min_pident, min_coverage)
        else:
            align_trash_fpath = None
        # end if

        # Create an iterator that will yield records
        seq_records_iterator = iter(seq_records_generator(fq_fa_path))
        # Dict for storing batches of sequences meant to be written to output files:
        to_write = dict()
        stop = False  # for outer while-loop

        while not stop:

            # Extract batch of records of 'n_thr' size and find their destination paths:
            for _ in range(n_thr):

                try:
                    fastqa_rec = next(seq_records_iterator)
                except StopIteration:
                    stop = True  # for outer while-loop
                    break
                # end try

                read_name = sys.intern(fmt_read_id(
                    fastqa_rec["seq_id"])[1:])  # get ID of the sequence

                try:
                    hit_names, *vals_to_filter = resfile_lines[
                        read_name]  # find hit corresponding to this sequence
                except KeyError:
                    printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                        .format(read_name))
                    printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
                    printlog_error(
                        "Make sure that this read has been already processed by \
`barapost-prober.py` and `barapost-local.py`.")
                    platf_depend_exit(1)
                # end try

                # If read is found in TSV file:
                if not QL_filter(vals_to_filter):
                    # Place this sequence to QL trash file
                    to_write[read_name] = (fastqa_rec, QL_trash_fpath)
                    QL_seqs_fail += 1
                elif not align_filter(vals_to_filter):
                    # Place this sequence to align trash file
                    to_write[read_name] = (fastqa_rec, align_trash_fpath)
                    align_seqs_fail += 1
                else:
                    for hit_name in hit_names.split("&&"):
                        # Get name of result FASTQ file to write this read in
                        binned_file_path = os.path.join(
                            outdir_path, "{}.fast{}".format(
                                hit_name,
                                'q' if is_fastq(fq_fa_path) else 'a'))
                        # Key by (read, path) so that a read with multiple hits
                        #   is written to every corresponding binned file,
                        #   matching the single-thread variant.
                        to_write[(read_name,
                                  binned_file_path)] = (fastqa_rec,
                                                        binned_file_path)
                    # end for
                    seqs_pass += 1
                # end if
            # end for

            # Write batch of records to output files:
            with write_lock:
                for record, fpath in to_write.values():
                    write_fun(fpath, record)
                # end for
            # end with
            to_write.clear()
        # end while

        with write_lock:
            # Write the rest of 'uneven' data to output files:
            if len(to_write) != 0:
                for record, fpath in to_write.values():
                    write_fun(fpath, record)
                # end for
            # end if
            sys.stdout.write('\r')
            printlog_info_time("File `{}` is binned.".format(
                os.path.basename(fq_fa_path)))
            printn(" Working...")
        # end with
    # end for

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
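This function expects a module-level `write_lock` shared by all workers. A minimal sketch of one way to drive it with a thread pool; the packet split, thresholds, and the lock wiring are hypothetical, and barapost's actual dispatcher may differ:

import threading
from concurrent.futures import ThreadPoolExecutor

write_lock = threading.Lock()  # assumed to be the lock the workers share

def run_binning_pool(packets, tax_annot_res_dir, sens, n_thr):
    # `packets` is a list of file-path lists, one per worker (hypothetical split).
    with ThreadPoolExecutor(max_workers=len(packets)) as pool:
        futures = [
            pool.submit(bin_fastqa_file, packet, tax_annot_res_dir, sens,
                        n_thr, 10.0, 100, 90.0, 80.0, False)
            for packet in packets
        ]
        counts = [f.result() for f in futures]
    # Sum the per-worker (seqs_pass, QL_seqs_fail, align_seqs_fail) tuples.
    return tuple(map(sum, zip(*counts)))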
Example #27
def send_request(request, pack_to_send, packet_size, packet_mode, filename,
                 tmp_fpath):
    # Function sends a request to "blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
    #     and then waits for satisfaction of the request and retrieves response text.
    #
    # :param request: request data (the dict that `configure_request()` function returns);
    # :type request: dict<dict>;
    # :param pack_to_send: ordinal number (like an id) of the packet meant to be sent now;
    # :type pack_to_send: int;
    # :param packet_size: number of sequences in the packet;
    # :type packet_size: int;
    # :param packet_mode: current packet mode (it is saved to the temporary file);
    # :param filename: name of the source file (used while waiting for the alignment);
    # :type filename: str;
    # :param tmp_fpath: path to the temporary file storing the request state;
    # :type tmp_fpath: str;
    #
    # Returns XML text of type 'str' with BLAST response.

    payload = request["payload"]
    headers = request["headers"]

    server = "blast.ncbi.nlm.nih.gov"
    url = "/blast/Blast.cgi"
    error = True

    while error:
        try:
            conn = http.client.HTTPSConnection(server)  # create a connection
            conn.request("POST", url, payload, headers)  # send the request
            response = conn.getresponse()  # get the response
            response_text = str(response.read(), "utf-8")  # get response text
        except OSError as oserr:
            printlog_info_time(
                "`https://blast.ncbi.nlm.nih.gov` is not available.")
            printlog_info(str(oserr))
            printlog_info(
                "barapost will try to connect again in 30 seconds...\n")
            sleep(30)

        # if no exception occurred
        else:
            error = False
        # end try
    # end while

    try:
        rid = re.search(r"RID = (.+)",
                        response_text).group(1)  # get Request ID
        rtoe = int(re.search(r"RTOE = ([0-9]+)", response_text).group(
            1))  # get time to wait provided by the NCBI server
    except AttributeError:
        printlog_error_time("Seems, NCBI has denied your request.")
        printlog_error("Response is in file `request_denial_response.html`")
        with open("request_denial_response.html", 'w') as den_file:
            den_file.write(response_text)
        # end with
        platf_depend_exit(1)
    finally:
        conn.close()
    # end try

    # Save temporary data
    with open(tmp_fpath, 'w') as tmpfile:
        tmpfile.write("Request_ID: {}\n".format(rid))
        tmpfile.write("Packet_size: {}\n".format(packet_size))
        tmpfile.write("Packet_mode: {}".format(packet_mode))
    # end with

    # Wait for results of alignment
    return wait_for_align(rid, rtoe, pack_to_send, filename)
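The RID/RTOE scrape is the only state kept from the submission page; the same parsing against a fabricated response fragment:

import re

fake_response = """<html>
RID = ABC123XYZ01
RTOE = 25
</html>"""  # fabricated fragment mimicking NCBI's submission page

rid = re.search(r"RID = (.+)", fake_response).group(1)
rtoe = int(re.search(r"RTOE = ([0-9]+)", fake_response).group(1))
print(rid, rtoe)  # -> ABC123XYZ01 25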
Example #28
def configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path):
    # Function returns a dictionary where keys are IDs of the sequences meant to be binned,
    #     and values are lists holding the formatted hit name followed by the values
    #     used for filtering (quality, query length, identity, coverage).
    #
    # :param tsv_res_fpath: path to current TSV file. Binning will be performed according to this TSV file;
    # :type tsv_res_fpath: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param taxonomy_path: path to taxonomy file;
    # :type taxonomy_path: str;

    resfile_lines = dict()

    tax_dict = src.taxonomy.get_tax_dict(taxonomy_path)

    with open(tsv_res_fpath, 'r') as brpst_resfile:

        brpst_resfile.readline()  # pass the head of the table
        line = brpst_resfile.readline().strip(
        )  # get the first informative line

        while line != "":
            splt = line.split('\t')
            read_name = sys.intern(splt[0])
            hit_name = splt[1]
            hit_acc = splt[2]

            try:
                quality = float(splt[8])  # we will filter by quality
            except ValueError as verr:
                if splt[8] == '-':
                    # Keep minus as quality if there is no quality information.
                    # Error will not be raised.
                    quality = splt[8]
                else:
                    printlog_error_time("query quality parsing error")
                    printlog_error(str(verr))
                    printlog_error("Please, contact the developer.")
                    platf_depend_exit(1)
                # end if
            # end try

            try:
                query_len = int(splt[3])  # we will filter by length
            except ValueError as verr:
                printlog_error_time("query length parsing error")
                printlog_error(str(verr))
                printlog_error("Please, contact the developer.")
                platf_depend_exit(1)
            # end try

            try:
                pident = float(splt[5])  # we will filter by identity
            except ValueError as verr:
                if splt[5] == '-':
                    # Keep minus as identity if there is no identity information.
                    # Error will not be raised.
                    pident = splt[5]
                else:
                    printlog_error_time(
                        "Alignment percent of identity parsing error")
                    printlog_error(str(verr))
                    printlog_error("Please, contact the developer.")
                    platf_depend_exit(1)
                # end if
            # end try

            try:
                coverage = float(splt[4])  # we will filter by coverage
            except ValueError as verr:
                if splt[4] == '-':
                    # Keep minus as coverage if there is no coverage information.
                    # Error will not be raised.
                    coverage = splt[4]
                else:
                    printlog_error_time("alignment coverage parsing error")
                    printlog_error(str(verr))
                    printlog_error("Please, contact the developer.")
                    platf_depend_exit(1)
                # end if
            # end try

            try:
                resfile_lines[read_name] = [
                    format_taxonomy_name(hit_acc, hit_name, sens, tax_dict),
                    quality, query_len, pident, coverage
                ]
            except NoTaxonomyError:
                printlog_warning(
                    "Can't find taxonomy for reference sequence `{}`".format(
                        hit_acc))
                printlog_warning("Trying to recover taxonomy.")

                # Recover
                src.taxonomy.recover_taxonomy(hit_acc, hit_name, taxonomy_path)
                printlog_info("Taxonomy for {} is recovered.".format(hit_acc))

                # Update tax_dict
                tax_dict = src.taxonomy.get_tax_dict(taxonomy_path)

                # Format again -- with new tax_dict
                resfile_lines[read_name] = [
                    format_taxonomy_name(hit_acc, hit_name, sens, tax_dict),
                    quality, query_len, pident, coverage
                ]
            # end try

            line = brpst_resfile.readline().strip()  # get next line
        # end while
    # end with

    return resfile_lines
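For reference, the shape of the returned mapping for a single (invented) TSV row; the column order follows the indices used above (query length in column 3, coverage in 4, identity in 5, quality in 8):

# Given an invented row:
#   read_1 <tab> Escherichia coli <tab> CP000000.1 <tab> 1500 <tab> 95.0
#          <tab> 97.3 <tab> ... <tab> ... <tab> 14.2
# the dictionary would hold an entry like:
example_entry = {
    "read_1": [
        "Escherichia",  # formatted hit name (depends on `sens`)
        14.2,           # quality ('-' if absent)
        1500,           # query length
        97.3,           # percent of identity ('-' if absent)
        95.0,           # coverage ('-' if absent)
    ],
}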
Example #29
def format_taxonomy_name(hit_acc, hit_def, sens, tax_dict):
    # Function formats the taxonomy name according to the chosen binning sensibility.
    #
    # :param hit_acc: accession(s) of best hit(s);
    # :type hit_acc: str;
    # :param hit_def: annotation of best hit;
    # :type hit_def: str;
    # :param sens: sensibility returned by 'get_classif_sensibility()' function.
    #     Its value can be one of the following strings: "genus", "species";
    # :type sens: str;
    # :param tax_dict: taxonomy dictionary returned by function 'src.taxonomy.get_tax_dict';
    # :type tax_dict: dict;
    #
    # Returns formatted hit name of 'str' type;

    # If there is no hit -- we are sure what to do!
    if hit_def == "No significant similarity found":
        return "unknown"
    # end if

    # list of strings that will become the names of binned files
    best_hit_annots = list()

    for acc, annotation in zip(hit_acc.split('&&'), hit_def.split('&&')):

        # Get taxonomy
        try:
            taxonomy = tax_dict[acc]
        except KeyError:
            raise NoTaxonomyError()
        # end try

        # If it is beautiful tuple-formatted taxonomy -- find rank name for filename
        if isinstance(taxonomy, tuple):

            best_hit_annots.append(find_rank_for_filename(sens, taxonomy))
            if sens[0] == "species":
                genus_sens = ("genus", sens[1] - 1)
                genus_name = find_rank_for_filename(genus_sens, taxonomy)
                species_name = best_hit_annots[len(best_hit_annots) - 1]
                best_hit_annots[len(best_hit_annots) - 1] = "{}_{}".format(
                    genus_name, species_name)
            # end if

        # Otherwise consider sequence ID
        elif isinstance(taxonomy, str):

            # Check if hit is a sequence from SPAdes or a5 assembly:
            spades_match_obj = re.search(SPADES_PATT, annotation)
            a5_match_obj = re.search(A5_PATT, annotation)

            if spades_match_obj is not None:
                if sens[0] != "species":
                    contig_info = spades_match_obj.group(1)
                    taxonomy = taxonomy.replace('--' + contig_info, '')
                # end if
            elif a5_match_obj is not None:
                if sens[0] != "species":
                    contig_info = a5_match_obj.group(1)
                    taxonomy = taxonomy.replace('--' + contig_info, '')
                # end if
            # end if
            # If it is not an assembly, the taxonomy string is used as-is
            best_hit_annots.append(taxonomy)
        else:
            # Execution must not reach here
            printlog_error_time("Fatal error 8754.")
            printlog_error("Please, contact the developer.")
            platf_depend_exit(8754)
        # end if
    # end for

    # Replace symbols not allowed in filenames
    best_hit_annots = map(remove_bad_chars, best_hit_annots)

    # Return deduplicated names
    return "&&".join(set(best_hit_annots))
Example #30
def ngmerge_runner(args):
    # Runner function for NGmerge task.
    #
    # :param args: arguments for NGmerge task;
    # :type args: NGmergeArguments;
    #
    # Returns two collections:
    # 1. A collection of valid ("merged") paths.
    # 2. A collection of trash ("unmerged") paths.

    print()
    printlog_info_time('Running NGmerge...')

    # NGmerge puts result files into working directory --
    #   we will temporarily go to output directory
    old_dir = os.getcwd()
    os.chdir(args.outdir)

    # Configure output file names
    merged_basename, unmerged_prefix = ofn.get_ngmerge_outprefixes(
        args.infpaths[0])

    # Configure command
    ngmerge_cmd = '{} -1 {} -2 {} -o {} -f {} -n {} -v -m {} -p {} -q {}'\
        .format(args.ngmerge, args.infpaths[0], args.infpaths[1],
        merged_basename, unmerged_prefix, args.n_thr,
        args.min_overlap, args.mismatch_frac, args.phred_offset)
    printlog_info('Command: `{}`'.format(ngmerge_cmd))

    # Run NGmerge
    print('NGmerge is doing its job silently...')
    pipe = sp.Popen(ngmerge_cmd, shell=True, stderr=sp.PIPE)
    stderr = pipe.communicate()[1].decode('utf-8')  # run NGmerge

    if pipe.returncode != 0:
        # error
        printlog_error('Error running NGmerge: {}'.format(stderr))
        platf_depend_exit(pipe.returncode)
    # end if

    # Parse merging statistics from NGmerge's stderr
    stderr = stderr.splitlines()[1:]
    reads_pattern = r'Fragments \(pairs of reads\) analyzed: ([0-9]+)'
    merged_pattern = r'Successfully stitched: ([0-9]+)'

    # Collect statistics
    try:
        reads_processed = int(re.search(reads_pattern, stderr[0]).group(1))
        merged_reads = int(re.search(merged_pattern, stderr[1]).group(1))
    except (ValueError, AttributeError) as err:
        printlog_error(
            'Error 78 ({}). Please, contact the developer.'.format(err))
        platf_depend_exit(78)
    # end try

    os.chdir(old_dir)  # return to old dir

    printlog_info_time('NGmerge merged {}/{} ({}%) read pairs.'\
        .format(merged_reads, reads_processed,
            round(merged_reads / reads_processed * 100, 2)))

    # Configure absolute paths to output files.
    merged_fpath = os.path.join(args.outdir, merged_basename)
    unmerged_fpaths = sorted(
        glob.glob(
            os.path.join(args.outdir, '{}*.fastq'.format(unmerged_prefix))))

    # The first returned value must be a collection.
    return [merged_fpath], unmerged_fpaths
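The two regexes above are the whole statistics-parsing step; the same extraction against a fabricated NGmerge stderr dump:

import re

fake_stderr = """Processing files...
Fragments (pairs of reads) analyzed: 12500
Successfully stitched: 11834
"""  # fabricated `-v` output

stat_lines = fake_stderr.splitlines()[1:]
reads_processed = int(re.search(
    r'Fragments \(pairs of reads\) analyzed: ([0-9]+)', stat_lines[0]).group(1))
merged_reads = int(re.search(
    r'Successfully stitched: ([0-9]+)', stat_lines[1]).group(1))
print('{}/{} ({}%) merged'.format(
    merged_reads, reads_processed,
    round(merged_reads / reads_processed * 100, 2)))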