Example no. 1
def look_around(new_dpath, fq_fa_path):
    # The function looks around to check whether there are results from previous runs of this script.
    #
    # Returns None if there is no result from a previous run.
    # If there are results from a previous run, returns a dict of the following structure:
    # {
    #     "tsv_respath": path_to_tsv_file_from_previous_run (str),
    #     "n_done_reads": number_of_successful_requests_from_current_FASTA_file (int),
    # }
    #
    # :param new_dpath: path to the result directory corresponding to the current FASTA file (fq_fa_path);
    # :type new_dpath: str;
    # :param fq_fa_path: path to the current FASTA file;
    # :type fq_fa_path: str;

    # "hname" means human readable name (i.e. without file path and extention)
    fasta_hname = os.path.basename(fq_fa_path)  # get rid of absolute path
    fasta_hname = re.search(r"(.*)\.(m)?f(ast)?a", fasta_hname).group(
        1)  # get rid of '.fasta' extension

    # Form path to result file
    tsv_res_fpath = os.path.join(new_dpath, "classification.tsv")

    num_done_reads = 0  # variable to keep the number of successfully processed sequences

    if os.path.exists(tsv_res_fpath):

        with open(tsv_res_fpath, 'r') as res_file:
            # There can be invalid information in the result file
            try:
                lines = res_file.readlines()
                num_done_reads = len(lines) - 1  # the first line is a head
            except OSError as err:
                printlog_error_time("Data in classification file `{}` is broken. Reason:"\
                    .format(tsv_res_fpath))
                printlog_error(str(err))
                printlog_error("Starting from the beginning.")
                rename_file_verbosely(tsv_res_fpath)
                return None
            # end try
        # end with
    else:
        return None
    # end if

    return {
        "tsv_respath": tsv_res_fpath,
        "n_done_reads": num_done_reads,
    }
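
A minimal usage sketch of the function above. It only illustrates how the returned dict could drive resumption of a run; `classify_reads` is a hypothetical placeholder for the actual processing step and is not part of the example.

import os

def resume_or_start(new_dpath, fq_fa_path):
    # Hypothetical driver: skip sequences that a previous run already classified.
    previous = look_around(new_dpath, fq_fa_path)
    if previous is None:
        n_done = 0
        tsv_res_fpath = os.path.join(new_dpath, "classification.tsv")
    else:
        n_done = previous["n_done_reads"]
        tsv_res_fpath = previous["tsv_respath"]
    # end if
    # `classify_reads` is assumed to append results for the remaining sequences.
    classify_reads(fq_fa_path, tsv_res_fpath, n_skip=n_done)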
Example no. 2
def look_around(outdir_path, new_dpath, infile_path, blast_algorithm, acc_dict,
                probing_batch_size):
    # The function looks around to check whether there are results from previous run(s) of this script,
    #   so that the previous run can be resumed.
    #
    # Returns None if there is no result from a previous run.
    # If there are results from a previous run, returns a dict of the following structure:
    # {
    #     "RID": saved_RID <str>,
    #     "packet_size_save": saved packet size <int>,
    #     "packet_mode_save": saved packet mode <int>,
    #     "tsv_respath": path_to_tsv_file_from_previous_run <str>,
    #     "n_done_reads": number_of_successful_requests_from_current_FASTA_file <int>,
    #     "tmp_fpath": path_to_temporary_file <str>,
    #     "decr_pb": value by which the probing batch size is decreased (see below, where this variable is defined) <int>
    # }
    #
    # :param outdir_path: path to the output directory;
    # :type outdir_path: str;
    # :param new_dpath: path to the result directory corresponding to the current FASTA file (infile_path);
    # :type new_dpath: str;
    # :param infile_path: path to the current FASTA file;
    # :type infile_path: str;
    # :param blast_algorithm: BLASTn algorithm to use.
    #     This parameter is necessary because it is included in the names of result files;
    # :type blast_algorithm: str;
    # :param acc_dict: dictionary of accession info of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;
    # :param probing_batch_size: number of sequences meant to be processed in a single run;
    # :type probing_batch_size: str;

    # "hname" means human readable name (i.e. without file path and extention)
    fasta_hname = os.path.basename(infile_path)  # get rid of absolute path
    fasta_hname = re.search(r"(.*)\.(m)?f(ast)?a", fasta_hname).group(
        1)  # get rid of `.fasta` extension

    # Form path to temporary file
    tmp_fpath = "{}_{}_temp.txt".format(os.path.join(new_dpath, fasta_hname),
                                        blast_algorithm)
    # Form path to result file
    tsv_res_fpath = os.path.join(new_dpath, "classification.tsv")
    # Form path to file with hits to download
    acc_fpath = os.path.join(outdir_path, "hits_to_download.tsv")

    num_done_seqs = 0  # variable to keep number of successfully processed sequences

    resume = None
    # Check if there are results from previous run.
    if os.path.exists(tsv_res_fpath) or os.path.exists(tmp_fpath):
        print()
        printlog_info(
            "A result file from previous run is found in the directory:")
        printlog_info("   `{}`".format(new_dpath))
        # Politely offer to continue from the last successfully sent packet.
        resume = ask_for_resumption()
    # end if

    if not resume:
        rename_file_verbosely(tsv_res_fpath)
        rename_file_verbosely(tmp_fpath)
        rename_file_verbosely(acc_fpath)
    else:
        printlog_info("Let's try to resume...")

        # Collect information from result file
        if os.path.exists(tsv_res_fpath):
            # There can be invalid information in this file
            try:
                with open(tsv_res_fpath, 'r') as res_file:
                    lines = res_file.readlines()
                    num_done_seqs = len(lines) - 1  # the first line is a head
                    last_line = lines[-1]
                    last_seq_id = last_line.split('\t')[0]
                # end with
                # There must be 10 columns in each row:
                if any(map(lambda l: l.count('\t') != 9, lines)):
                    raise ValueError(
                        "There must be 10 colums separated by tabs in file `classification.tsv`"
                    )
                # end if

            except Exception as err:
                printlog_error_time(
                    "\nData in classification file `{}` not found or broken. Reason:"
                    .format(tsv_res_fpath))
                printlog_error(' ' + str(err))

                # If the reason is known -- print erroneous lines
                if isinstance(err, ValueError):
                    printlog_error("Here are numbers of improper lines:")
                    for i, line in enumerate(lines):
                        if line.count('\t') != 9:
                            printlog_error(str(i + 1) + ": `{}`".format(line))
                        # end if
                    # end for
                # end if

                # Ask the user whether they want to start from the beginning or to quit
                error = True
                while error:
                    reply = input("""Press ENTER to start from the beginning
  or enter `q` to quit:>> """)
                    if reply == "":
                        error = False
                        printlog_info(
                            "You have chosen to start from the beginning.\n")
                        rename_file_verbosely(tsv_res_fpath)
                        rename_file_verbosely(tmp_fpath)
                        rename_file_verbosely(acc_fpath)
                        return None
                    elif reply == 'q':
                        platf_depend_exit(0)
                    else:
                        print("! - Invalid reply: `{}`\n".format(reply))
                    # end if
                # end while
            else:
                printlog_info("Last classified sequence: " + last_seq_id)
                printlog_info(
                    "{} sequences have been already processed".format(
                        num_done_seqs))
            # end try
        # end if

        # Collect information from accession file
        if os.path.exists(acc_fpath):

            # There can be invalid information in this file
            line = ""  # keeps the error message below valid if reading fails before the loop
            try:
                with open(acc_fpath, 'r') as acc_file:
                    lines = acc_file.readlines()[
                        9:]  # omit the description and the head of the table
                    local_files_filtered = list(
                        filter(lambda x: not os.path.exists(x),
                               lines))  # omit file paths
                    for line in local_files_filtered:
                        vals = line.split('\t')
                        acc = sys.intern(vals[0].strip())
                        if len(vals) == 1:
                            acc_dict[acc] = [
                                "No definition of the sequence provided", 1
                            ]
                        elif len(vals) == 2:
                            acc_dict[acc] = [vals[1].strip(), 1]
                        else:
                            acc_dict[acc] = [
                                vals[1].strip(),
                                int(vals[2].strip())
                            ]
                        # end if
                    # end for
                # end with

            except Exception as err:
                printlog_error_time(
                    "Data in accession file `{}` not found or broken. Reason:".
                    format(acc_fpath))
                printlog_error(' ' + str(err))
                printlog_error("Invalid line: `{}`".format(line))

                # Ask the user whether they want to start from the beginning or to quit
                error = True
                while error:
                    reply = input("""Press ENTER to start from the beginning
  or enter `q` to quit:>> """)
                    if reply == "":
                        error = False
                        printlog_info(
                            "You have chosen to start from the beginning.\n")
                        rename_file_verbosely(tsv_res_fpath)
                        rename_file_verbosely(tmp_fpath)
                        rename_file_verbosely(acc_fpath)
                        return None
                    elif reply == 'q':
                        platf_depend_exit(0)
                    else:
                        print("! - Invalid reply: `{}`\n".format(reply))
                    # end if
                # end while
            else:
                print()
                printlog_info(
                    "Here are Genbank records encountered during previous run(s):"
                )
                for acc, other_info in sorted(acc_dict.items(),
                                              key=lambda x: -x[1][1]):
                    s_letter = "s" if other_info[1] > 1 else ""
                    printlog_info(" {} hit{} - {}, `{}`".format(
                        other_info[1], s_letter, acc, other_info[0]))
                # end for
                print('-' * 20)
            # end try
        # end if

        # Get the packet size, the number of the last sent packet, and the RID from the temp file.
        # There can be invalid information in the temp file, or the temp file may not exist.
        try:

            with open(tmp_fpath, 'r') as tmp_file:
                temp_lines = tmp_file.readlines()
            # end with

            RID_save = re.search(r"Request_ID: (.+)",
                                 temp_lines[0]).group(1).strip()
            packet_size_save = int(
                re.search(r"Packet_size: ([0-9]*)",
                          temp_lines[1]).group(1).strip())
            packet_mode_save = int(
                re.search(r"Packet_mode: ([0-9]{1})",
                          temp_lines[2]).group(1).strip())

        except (AttributeError, OSError):

            # There is no need to disturb a user, merely proceed.
            return {
                "RID": None,
                "packet_size_save": None,
                "packet_mode_save": None,
                "tsv_respath": tsv_res_fpath,
                "n_done_reads": num_done_seqs,
                "tmp_fpath": tmp_fpath,
                "decr_pb": 0
            }
        else:
            # Let's assume that the user won't modify their probing_batch_size between erroneous runs:
            #   subtract num_done_reads if probing_batch_size > num_done_reads.
            decr_pb = num_done_seqs if num_done_seqs < probing_batch_size else 0
            # Return data from previous run
            return {
                "RID": RID_save,
                "packet_size_save": packet_size_save,
                "packet_mode_save": packet_mode_save,
                "tsv_respath": tsv_res_fpath,
                "n_done_reads": num_done_seqs,
                "tmp_fpath": tmp_fpath,
                "decr_pb": decr_pb
            }
        # end try
    # end if

    return None
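
A minimal sketch of the temporary file layout that the parsing block above expects: three lines carrying the request ID, the packet size, and the packet mode. The writer function name is hypothetical; only the line formats come from the regexes in the example.

def save_request_configuration(tmp_fpath, rid, packet_size, packet_mode):
    # Hypothetical counterpart of the parser above: writes the three lines
    # matched by `Request_ID: ...`, `Packet_size: ...` and `Packet_mode: ...`.
    with open(tmp_fpath, 'w') as tmp_file:
        tmp_file.write("Request_ID: {}\n".format(rid))
        tmp_file.write("Packet_size: {}\n".format(packet_size))
        tmp_file.write("Packet_mode: {}\n".format(packet_mode))
    # end with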
Example no. 3
   or enter `r` to rename old directory and to write current results to a new one
   or enter `a` to append new sequences to existing data:>>""")

        if reply == "":
            invalid_reply = False

            printlog_info("You have chosen to remove old files.")
            remove_tmp_files(
                *filter(is_fastQA5, glob(os.path.join(outdir_path, '*'))))
        elif reply == 'r':
            invalid_reply = False

            printlog_info("You have chosen to rename old directory.")

            from src.filesystem import rename_file_verbosely
            new_name_for_old_dir = rename_file_verbosely(outdir_path)

            # Create new dir
            try:
                os.makedirs(outdir_path)
                # Restore log file from 'new_name_for_old_dir'
                os.rename(
                    os.path.join(new_name_for_old_dir,
                                 os.path.basename(logfile_path)), logfile_path)
            except OSError as err:
                printlog_error_time("Filesystem error: {}".format(err))
                platf_depend_exit(1)
            # end try

        elif reply == 'a':
            invalid_reply = False
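
The fragment above is the middle of a reply-handling block; its `r` branch renames the old output directory and then moves the log file into a freshly created one. Below is a standalone sketch of that step, assuming (as in the fragment) that `rename_file_verbosely` returns the new name of the renamed directory; the wrapper function name is hypothetical.

import os

def rename_and_recreate_outdir(outdir_path, logfile_path):
    # Hypothetical helper distilled from the `r` branch above.
    from src.filesystem import rename_file_verbosely
    new_name_for_old_dir = rename_file_verbosely(outdir_path)
    os.makedirs(outdir_path)
    # Restore the log file from the renamed directory into the new one.
    os.rename(
        os.path.join(new_name_for_old_dir, os.path.basename(logfile_path)),
        logfile_path)
    return new_name_for_old_dir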
Example no. 4
def build_local_db(tax_annot_res_dir, acc_fpath, your_own_fasta_lst,
                   accs_to_download, use_index):
    # The function creates a database with utilities from the BLAST+ toolkit
    #     according to acc_dict and your_own_fasta_lst.
    #
    # :param tax_annot_res_dir: path to the current result directory
    #   (each processed file has its own result directory);
    # :type tax_annot_res_dir: str;
    # :param acc_fpath: path to file "hits_to_download.tsv";
    # :type acc_fpath: str;
    # :param your_own_fasta_lst: list of user's fasta files to be included in database;
    # :type your_own_fasta_lst: list<str>;
    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param use_index: whether to use index;
    # :type use_index: str;

    # Returns path to created database.

    # Path to directory in which database will be placed
    db_dir = os.path.join(tax_annot_res_dir, "local_database")
    # Path to DBM taxonomy file
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")

    try:
        os.makedirs(db_dir)
    except OSError:
        # If this directory already exists

        while True:
            if len(os.listdir(db_dir)) == 0:
                # If db directory is empty -- break and build a database
                break
            else:
                print()
                printlog_info("Database directory is not empty:")
                printlog_info("  `{}`".format(os.path.abspath(db_dir)))
                printlog_info("Here is it's content:")
                for i, fname in enumerate(os.listdir(os.path.abspath(db_dir))):
                    printlog_info(" {}. `{}`".format(i + 1, fname))
                # end for
                reply = input(
                    """\nPress ENTER to start classification using existing database.
Enter 'r' to remove all files in this directory and create the database from the beginning:>>"""
                )

                if reply == "":
                    # Do not build a database, just return path to it.
                    printlog_info("You have chosen to use extant database.")

                    # Return path to DB located in this directory
                    dbpath = next(iter(os.listdir(db_dir)))
                    dbpath = "".join(
                        dbpath.partition(".fasta")[:2])  # remove everything after '.fasta'

                    return os.path.join(db_dir, dbpath)

                elif reply == 'r':

                    printlog_info("You have chosen to rebuild the database.")
                    # Rename old classification files so that current data is written to new ones:
                    old_classif_dirs = filter(
                        lambda d: os.path.exists(
                            os.path.join(d, "classification.tsv")),
                        glob(os.path.join(tax_annot_res_dir, "*")))
                    old_classif_files = tuple(
                        map(lambda f: os.path.join(f, "classification.tsv"),
                            old_classif_dirs))

                    if len(old_classif_files) > 0:
                        print()
                        printlog_info("Renaming old classification files:")
                        for classif_file in old_classif_files:
                            rename_file_verbosely(classif_file)
                        # end for
                    # end if

                    # Empty database directory
                    for file in glob("{}{}*".format(db_dir, os.sep)):
                        os.unlink(file)
                    # end for

                    # Break from the loop in order to build a database
                    break
                else:
                    print("Invalid reply: `{}`\n".format(reply))
                    continue
                # end if
            # end if
        # end while
    # end try

    # It is a dictionary of accessions and record names.
    # Accessions are keys, record names are values.
    acc_dict = configure_acc_dict(acc_fpath, your_own_fasta_lst,
                                  accs_to_download)

    if len(accs_to_download) != 0:
        verify_cl_accessions(accs_to_download, acc_dict)
    # end if

    # Retrieve already existing taxonomy data from taxonomy file
    tax_exist_accs = taxonomy.get_tax_keys(taxonomy_path)

    # If the accession file does not exist and execution has reached this point, everything is OK:
    #    we are building a database from the user's files only.
    if len(acc_dict) != 0:
        print()

        print("""Following sequences (and all replicons related to them)
  will be downloaded from Genbank for further taxonomic classification
  on your local machine:\n""")
        printlog_info(
            "Following sequences (and all replicons related to them) \
will be downloaded from Genbank for further taxonomic classification \
on your local machine:")
        for i, acc in enumerate(acc_dict.keys()):
            printlog_info(" {}. {} - `{}`".format(i + 1, acc, acc_dict[acc]))
        # end for

        search_for_related_replicons(acc_dict)

        printlog_info_time("Completing taxonomy file...")
        for i, acc in enumerate(acc_dict.keys()):
            if acc not in tax_exist_accs:
                taxonomy.find_taxonomy(acc, acc_dict[acc][1], taxonomy_path)
            # end if
            # Accessions can be of different length
            printn("\r{} - {}: {}/{}".format(getwt(), acc, i +
                                             1, len(acc_dict)) + " " * 10 +
                   "\b" * 10)
        # end for
        print()
        printlog_info_time("Taxonomy file is consistent.")
    # end if

    local_fasta = os.path.join(
        db_dir, "local_seq_set.fasta")  # path to downloaded FASTA file

    add_lambda_phage(local_fasta,
                     taxonomy_path)  # add lambda phage control sequence

    retrieve_fastas_by_acc(
        acc_dict, db_dir, local_fasta)  # download main fasta data from GenBank

    # Add 'your own' fasta files to database
    if not len(your_own_fasta_lst) == 0:

        # This variable counts sequences from local files.
        # It is necessary to avoid duplicated accessions.
        own_seq_counter = 0

        # Check if these files are assemblies made by SPAdes or a5
        spades_patt = r">NODE_[0-9]+"  # this pattern will match sequence IDs generated by SPAdes
        a5_patt = r">scaffold_[0-9]+"  # this pattern will match sequence IDs generated by a5
        assemblies = list(
        )  # this list will contain paths to assembly files (SPAdes or a5)

        for own_fasta_path in reversed(your_own_fasta_lst):

            how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
            fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]

            with how_to_open(own_fasta_path) as fasta_file:
                first_seq_id = fmt_func(fasta_file.readline(
                ))  # get the first line in file (the first seq ID)
            # end with

            # if we've got a SPAdes assembly
            if re.search(spades_patt, first_seq_id) is not None:
                assemblies.append(own_fasta_path)
                # Remove this file from the list -- it will be processed in a specific way
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if

            # if we've got an a5 assembly
            if re.search(a5_patt, first_seq_id) is not None:
                assemblies.append(own_fasta_path)
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if
        # end for

        # Include assembly files in the multi-fasta file

        # Find common prefix of all assembly paths and remove it from assembly names
        if len(assemblies) > 1:
            assemblies_formatted = tuple(
                map(lambda f: os.path.abspath(f).replace(os.sep, '-'),
                    assemblies))
            common_prefix = find_common_prefix(assemblies_formatted)
            assemblies_formatted = tuple(
                map(lambda f: f.replace(common_prefix, ''),
                    assemblies_formatted))
        elif len(assemblies) > 0:
            common_prefix = ''
            assemblies_formatted = tuple(map(os.path.basename, assemblies))
        # end if

        # Add assembled sequences to database
        with open(local_fasta, 'a') as fasta_db:
            for i, assm_path in enumerate(assemblies):
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(assm_path)))
                assm_name_fmt = assemblies_formatted[i]

                how_to_open = OPEN_FUNCS[is_gzipped(assm_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(assm_path)]
                with how_to_open(assm_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # See the comments about 'OWN_SEQ_<NUMBER>' below.
                        # Paths are embedded in sequence IDs in the following way:
                        #   some-happy-path.fastq--
                        # so that they can be reliably retrieved with a regex later.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            own_def = "{}--".format(
                                assm_name_fmt.replace(common_prefix,
                                                      '')) + line[1:]
                            own_def = remove_bad_chars(own_def)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, own_def)
                            line = ">" + "{} {}".format(own_acc, own_def)
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with

        with open(local_fasta, 'a') as fasta_db:
            for own_fasta_path in your_own_fasta_lst:
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(own_fasta_path)))

                how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]
                with how_to_open(own_fasta_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # 'makeblastdb' considers the first word (space-separated) to be the sequence ID
                        #   and throws an error if there are duplicated IDs.
                        # To prevent such duplication, we create our own sequence IDs:
                        #   'OWN_SEQ_<NUMBER>', written at the beginning of each FASTA record name.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, line[1:])
                            line = ">" + own_acc + ' ' + remove_bad_chars(
                                line[1:])
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with
    # end if

    # blastn can report 'lcl|ACCESSION...' entries with a '.1'
    #   (or '.2', etc.) version suffix.
    # There is no such suffix in the taxonomy file.
    # Therefore we prune accessions in advance.
    print()
    printn("{} - Formatting accessions...".format(getwt()))
    log_info("Formatting accessions...")
    corrected_path = os.path.join(db_dir, "corrected_seqs.fasta")
    with open(local_fasta, 'r') as source_file, open(corrected_path,
                                                     'w') as dest_file:
        for line in source_file:
            if line.startswith('>'):
                line = line.strip()
                acc, seq_name = (line.partition(' ')[0],
                                 line.partition(' ')[2])
                acc = acc.partition('.')[0]
                seq_name = remove_bad_chars(seq_name)
                seq_name = re.sub(r'[^\x00-\x7F]+', '_',
                                  seq_name)  # remove non-ascii chars
                line = ' '.join((acc, seq_name)) + '\n'
            # end if
            dest_file.write(line)
        # end for
    # end with
    os.unlink(local_fasta)
    os.rename(corrected_path, local_fasta)
    sys.stdout.write("\r{} - Formatting accessions... ok".format(getwt()))
    log_info("Formatting accessions done.")

    # Configure command line
    make_db_cmd = "makeblastdb -in {} -parse_seqids -dbtype nucl".format(
        local_fasta)
    exit_code = os.system(make_db_cmd)  # make a blast-format database
    if exit_code != 0:
        printlog_error_time("Error occured while making the database")
        platf_depend_exit(exit_code)
    # end if

    print("\033[1A{} - Database is successfully created: `{}`\n".format(
        getwt(), local_fasta))
    log_info("Database is successfully created: `{}`".format(local_fasta))

    if use_index == "true":
        printlog_info_time("Database index creating started")
        # Configure command line
        make_index_cmd = "makembindex -input {} -iformat blastdb -verbosity verbose".format(
            local_fasta)
        exit_code = os.system(
            make_index_cmd)  # create an index for the database
        if exit_code != 0:
            printlog_info_time("Error occured while creating database index")
            platf_depend_exit(exit_code)
        # end if

        printlog_info_time("Database index has been successfully created")
    # end if

    # Gzip downloaded FASTA file
    printlog_info_time("Gzipping FASTA file: `{}`".format(local_fasta))

    if gzip_util_found:
        os.system("{} -v {}".format(gzip_util, local_fasta))
    else:
        # form .fasta.gz file 'by hand'
        with open(local_fasta,
                  'rb') as fasta_file, open_as_gzip(local_fasta + ".gz",
                                                    "wb") as fagz_file:
            shutil_copyfileobj(fasta_file, fagz_file)
        # end with
        os.unlink(local_fasta)  # remove source FASTA file, not the database
    # end if

    return local_fasta
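
A distilled, standalone version of the header-rewrite rule used in the "Formatting accessions" step above: the accession's version suffix is pruned and the record name is sanitised. `remove_bad_chars` is the project's own helper and is passed in as a parameter here only to keep the sketch self-contained.

import re

def format_fasta_header(header_line, remove_bad_chars):
    # `header_line` is a FASTA header starting with '>', e.g. '>ACCESSION.1 Some name'.
    header_line = header_line.strip()
    acc, _, seq_name = header_line.partition(' ')
    acc = acc.partition('.')[0]  # prune the '.1'/'.2' version suffix
    seq_name = remove_bad_chars(seq_name)
    seq_name = re.sub(r'[^\x00-\x7F]+', '_', seq_name)  # replace non-ASCII characters
    return ' '.join((acc, seq_name)) + '\n'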