Example #1
"""
#############################################################################
Sylvain @ GIS / Biopolis / Singapore
Sylvain Jun-Zhe RIONDET <*****@*****.**>
Started on 2019-12-11
Reads Binning Project
#############################################################################
Common resources for biology-related functions and classes
"""
import traceback

# todo: check if this logger works
import ete3.ncbi_taxonomy

from plot_me.tools import init_logger

logger = init_logger("bio")


# #############################################################################
# Methods for nucleotide manipulation
nucleotides = "ACGT"


def kmers_dic(n, choice=nucleotides):
    return {a: 0.0 for a in combinaisons(choice, n)}


def combinaisons(combi, n, instances=nucleotides):
    if n == 1:
        return combi
    else:
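
The excerpt stops inside `combinaisons`; below is a minimal, hedged sketch of the recursive k-mer enumeration it implements. The helper name is hypothetical and the exact body in the repository may differ:

def combinaisons_sketch(combi, n, instances=nucleotides):
    # 1-mers are the alphabet itself
    if n == 1:
        return list(combi)
    # extend every (n-1)-mer by each nucleotide -> 4**n strings for "ACGT"
    return [prefix + nt for prefix in combinaisons_sketch(combi, n - 1, instances) for nt in instances]

# built this way, kmers_dic(2) holds 16 keys "AA", "AC", ..., "TT", each mapped to 0.0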
Example #2
File: parse_DB.py  Project: CSB5/PLoT-ME
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
# from Bio.Seq import Seq
from sklearn.cluster import KMeans, MiniBatchKMeans
# from sklearn.decomposition import PCA

from tqdm import tqdm

# Import paths and constants for the whole project
from plot_me import LOGS
from plot_me.tools import ScanFolder, is_valid_directory, init_logger, create_path, scale_df_by_length, \
    time_to_hms, delete_folder_if_exists, bash_process, f_size
from plot_me.bio import kmers_dic, ncbi, seq_count_kmer, combinaisons, nucleotides

logger = init_logger('parse_DB')
CLASSIFIERS = (('kraken2', 'k', '35', 'l', '31', 's', '7'), ("centrifuge", ))
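# Hedged reading of the kraken2 tuple above (inferred, not confirmed in this excerpt):
# 'k' 35 / 'l' 31 / 's' 7 look like kraken2-build's k-mer length, minimizer length and
# minimizer spaces, which are that tool's default values.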


class Genome:
    """ Genome from RefSeq. Methods to split it into plasmid/genome and into segments
        SET K BEFORE ANY INSTANCE IS CREATED, with set_k_kmers()
    """
    categories = [
        "plasmid",
        "chloroplast",
        "scaffold",
        "contig",
        "chromosome",
        "complete genome",
        "whole genome shotgun sequence",
Example #3
"""
First attempt at adding tests to the project, using pytest
python3 -m pytest -v
python3 -m pytest -v --color=yes --log-level=5 | less -r

Testing both plot_me.bio and plot_me.cython_module.cyt_ext
"""
from plot_me import bio
from plot_me.cython_module import cyt_ext
from plot_me.tools import init_logger

import logging
import numpy as np
import pytest

_ = init_logger(__package__)
logger = logging.getLogger(__name__)
cyt_ext.set_verbosity(5)
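
# A hedged sketch of a parametrized test in the spirit of this module; it only assumes that
# bio.combinaisons / bio.nucleotides behave as shown in Example #1 (all 4**n k-mers over "ACGT"),
# and the test name is hypothetical, not taken from the repository.
@pytest.mark.parametrize("n, expected_count", [(1, 4), (2, 16), (3, 64)])
def test_combinaisons_count_sketch(n, expected_count):
    # a 4-letter alphabet yields 4**n k-mers of length n
    assert len(bio.combinaisons(bio.nucleotides, n)) == expected_count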

# ######################    TESTING COMBINATIONS    ######################
combinations = [
    (2, [
        "AA",
        "AC",
        "AG",
        "AT",
        "CA",
        "CC",
        "CG",
        "CT",
        "GA",
Example #4
def bin_classify(list_fastq, path_report, path_database, classifier, full_DB=False, threads=cpu_count(),
                 f_record="~/logs/classify_records.csv", clf_settings="", drop_bin_threshold=DROP_BIN_THRESHOLD,
                 skip_clas=False, force_binning=False, no_cython=False):
    """ Should load a file, do all the processing """
    _ = init_logger(__package__)  # initialize the global logger
    logger.info("\n*********************************************************************************************************")
    logger.info("**** Starting script **** \n ")
    global THREADS
    THREADS = threads
    if not no_cython:
        global cyt_ext, cython_is_there
        cyt_ext, cython_is_there = import_cython_mod()

    # preparing csv record file
    if not osp.isfile(f_record):
        os.makedirs(osp.dirname(f_record), exist_ok=True)
        with open(f_record, 'w', newline='') as csv_file:
            csv_writer = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            headers = ("FILE", "BINS_vs_FULL", "BINNING", "CLASSIFY", "TOTAL", "HASHES_SIZE", "NB_BINS", "HASH_PATH", "HASH_NAME")
            csv_writer.writerow(headers)

    logger.info("let's classify reads!")

    # Find the model
    global K, BIN_NB, DROP_BIN_THRESHOLD
    if full_DB:
        path_model = "full"
        K          = 0
        BIN_NB     = 1
        # clusterer, bin_nb, k, w, omitted = (None, 1, None, None, None)
        path_to_hash = path_database
        if "hash.k2d" in path_to_hash:
            path_to_hash = osp.dirname(path_to_hash)
        if "hash.k2d" not in os.listdir(path_to_hash):
            raise FileNotFoundError(f"hash.k2d not found in folder: {path_to_hash}")
    else:
        path_model = ""
        for file in os.scandir(path_database):
            if file.name.startswith("model.") and file.name.endswith(".pkl"):
                path_model = file.path
                break
        assert osp.isfile(path_model), FileNotFoundError(f"didn't find the ML model in {path_database}... {path_model}")

        # Parse the model name to find parameters:
        basename = path_model.split("/model.")[1]
        clusterer, bin_nb, k, w, omitted, _ = re.split('_b|_k|_s|_o|.pkl', basename)
        K      = int(k)
        BIN_NB = int(bin_nb)
        DROP_BIN_THRESHOLD = drop_bin_threshold if drop_bin_threshold != -1 else 1. / BIN_NB
        path_to_hash = osp.join(path_database, classifier, clf_settings)
        logger.debug(f"path_to_hash: {path_to_hash}")
        logger.debug(f"Found parameters: clusterer={clusterer}, bin number={BIN_NB}, k={K}, w={w}, omitted={omitted}")
        if cython_is_there:
            cyt_ext.set_verbosity(logging.INFO)
            cyt_ext.init_variables(K)

    # Set the folder with hash tables
    param = osp.basename(path_database)
    if param == "": param = osp.basename(path_database[:-1])
    logger.info(f"Assuming parameters are: {param}")

    t = {}  # recording time at each step
    for i, file in enumerate(list_fastq):
        try:
            assert osp.isfile(file), FileNotFoundError(f"file number {i} not found: {file}")
            if file.lower().endswith(".fastq"):
                bin_classify.format = "fastq"
            elif file.lower().endswith(".fasta"):
                bin_classify.format = "fasta"
            else:
                raise NotImplementedError("The file is neither ending with .fasta nor with .fastq")
            # setting time
            base_name = osp.basename(file)
            key = base_name
            t[key] = {}
            t[key]["start"] = perf_counter()

            logger.info(f"Opening fastq file ({i+1}/{len(list_fastq)}) {f_size(file)}, {base_name}")
            # Binning
            if not full_DB:
                ReadToBin.set_fastq_model_and_param(file, path_model, param, force_binning)
                ReadToBin.bin_reads()
                ReadToBin.sort_bins_by_sizes_and_drop_smalls()
                t[key]["binning"] = perf_counter()
                t[key]["reads_nb"] = ReadToBin.NUMBER_BINNED

            if not skip_clas:
                fastq_classifier = MockCommunity(
                    path_original_fastq=file, db_path=path_to_hash, full_DB=full_DB, folder_report=path_report,
                    path_binned_fastq=ReadToBin.outputs, classifier_name=classifier, param=param)

                fastq_classifier.classify()
                t[key]["classify"] = perf_counter()
                t[key]["hashes"] = fastq_classifier.hash_size
            # todo: process reports to have one clean one

        except Exception as e:
            logger.exception(e)
            logger.error(f"script crashed for file: {file}")

    records = []
    for key in t.keys():
        if 'classify' not in t[key].keys():
            break
        if "binning" in t[key]:
            t_binning = time_to_hms(t[key]['start'], t[key]['binning'], short=True)
            t_classify = time_to_hms(t[key]['binning'], t[key]['classify'], short=True)
            t_total = time_to_hms(t[key]['start'], t[key]['classify'], short=True)
            hashes = t[key]["hashes"]
            h_size = sum(hashes.values())

            logger.info(f"timings for file {key} / binning : {t_binning}, for {t[key]['reads_nb']} reads")
            logger.info(f"timings for file {key} / classify: {t_classify}, "
                        f"{len(hashes)} bins, total size of hashes loaded: {f_size(h_size)}")
        else:
            t_binning = time_to_hms(t[key]['start'], t[key]['start'], short=True)
            t_classify = time_to_hms(t[key]['start'], t[key]['classify'], short=True)
            t_total = t_classify
            hashes = t[key]["hashes"]
            h_size = sum(hashes.values())
            logger.info(f"timings for file {key} / classify: {time_to_hms(t[key]['start'], t[key]['classify'])}")

        # to CSV
        # todo: add precision / sensitivity / abundance
        row = (key, "full" if full_DB else "bins", t_binning, t_classify, t_total, f"{h_size / 10 ** 9:.2f}GB", f"{len(hashes)}",
               path_database, osp.basename(path_database))
        records.append(row)

    # Timings and to csv
    with open(f_record, 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerows(records)

    logger.info(f"Script ended, {len(t)} files processed \n")
Example #5
def main(folder_database,
         folder_output,
         n_clusters,
         k,
         window,
         cores=cpu_count(),
         skip_existing="111110",
         early_stop=len(check_step.can_skip) - 1,
         omit_folders=("plant", "vertebrate"),
         path_taxonomy="",
         full_DB=False,
         k2_clean=False,
         ml_model=clustering_segments.models[0],
         classifier_param=CLASSIFIERS[0],
         verbose_lvl=30,
         no_cython=False,
         combine_rc=True):
    """ Pre-processing of RefSeq database to split genomes into windows, then count their k-mers
        Second part, load all the k-mer counts into one single Pandas dataframe
        Third train a clustering algorithm on the k-mer frequencies of these genomes' windows
        folder_database : RefSeq root folder
        folder_output   : empty root folder to store kmer counts
    """
    _ = init_logger(__package__)  # initialize the global logger
    logger.info("\n*********************************************************************************************************")
    logger.info("**** Starting script **** \n ")
    try:
        if not no_cython:
            global cyt_ext, cython_is_there
            cyt_ext, cython_is_there = import_cython_mod()
            if cython_is_there:
                logger.info(f"Cython is available, initializing variables")
                cyt_ext.set_verbosity(verbose_lvl)
                cyt_ext.init_variables(k)
        else:
            logger.info(f"Falling back on pure python")
        # Common folder name keeping parameters
        param_k_s = f"k{k}_s{window}"
        o_omitted = "" if len(omit_folders) == 0 else "o" + "-".join(omit_folders)
        folder_intermediate_files = osp.join(folder_output, param_k_s,
                                             "kmer_counts")
        # Parameters
        main.folder_database = folder_database
        main.omit_folders = omit_folders
        main.k = k
        main.w = window
        main.cores = cores
        # Set all columns type
        cols_types = {
            "taxon": int,
            "category": 'category',
            "start": int,
            "end": int,
            "name": 'category',
            "description": 'category',
            "fna_path": 'category',
        }
        codons = codons_without_rev_comp(
            main.k) if combine_rc else combinations(main.k)
        for key in codons:
            cols_types[key] = float32
        main.cols_types = cols_types

        check_step.timings = [perf_counter()]  # log time spent at each step
        check_step.step_nb = 0  # for the decorator to know which step has been reached
        check_step.early_stop = early_stop
        check_step.can_skip = skip_existing  # Set the skip variable for the decorator of each step
        # Check classifier/kraken2's parameters
        param, s_param = classifier_param_checker(classifier_param)
        # Check that taxonomy wasn't forgotten
        if '0' in check_step.can_skip[5:] and check_step.early_stop >= 5:
            assert osp.isdir(path_taxonomy), NotADirectoryError

        if full_DB:
            # Run kraken2 on the full RefSeq, without binning, for reference
            path_full_hash = osp.join(folder_output, "no-binning", o_omitted,
                                      param['name'], s_param)
            kraken2_full_add_lib(folder_database, path_full_hash)
            kraken2_full_build_hash(path_taxonomy, path_full_hash, param)
            if k2_clean: kraken2_clean(path_full_hash, 1)

        else:
            #    KMER COUNTING
            # get kmer distribution for each window of each genome, parallel folder with same structure
            path_individual_kmer_counts = osp.join(folder_intermediate_files,
                                                   f"counts.k{k}_s{window}")
            scan_RefSeq_kmer_counts(folder_database,
                                    path_individual_kmer_counts)

            # combine all kmer distributions into one single file
            path_stacked_kmer_counts = osp.join(
                folder_intermediate_files,
                f"all-counts.k{k}_s{window}_{o_omitted}.csv")
            append_genome_kmer_counts(path_individual_kmer_counts,
                                      path_stacked_kmer_counts)

            #    CLUSTERING
            # From kmer distributions, use clustering to set the bins per segment
            string_param = f"{ml_model}_b{n_clusters}_k{main.k}_s{main.w}_{o_omitted}"
            folder_by_model = osp.join(folder_output, param_k_s, string_param)
            path_model = osp.join(folder_by_model, f"model.{string_param}.pkl")
            path_segments_clustering = osp.join(
                folder_by_model, f"segments-clustered.{string_param}.pd")
            clustering_segments(path_stacked_kmer_counts,
                                path_segments_clustering, path_model,
                                n_clusters, ml_model)

            #    CREATING THE DATABASES
            # create the DB for each bin (copy parts of each .fna genomes into a folder with taxonomy id)
            path_refseq_binned = osp.join(folder_by_model, f"RefSeq_binned")
            split_genomes_to_bins(path_segments_clustering, path_refseq_binned,
                                  n_clusters)

            # Run kraken2-build add library
            path_bins_hash = osp.join(folder_by_model, param['name'], s_param)
            add_library(path_refseq_binned, path_bins_hash, n_clusters,
                        param['name'])

            # Run kraken2-build make hash tables
            build_indexes(path_taxonomy, path_bins_hash, n_clusters, param)

            # Cleaning
            if k2_clean and "kraken2" in param['name']:
                kraken2_clean(path_bins_hash, n_clusters)

    except KeyboardInterrupt:
        check_step.timings.append(perf_counter())  # log time for the last step that has been interrupted
        logger.error("User interrupted")
        logger.error(traceback.format_exc())
    except Exception as e:
        check_step.timings.append(perf_counter())  # log time for the last step that has been interrupted
        logger.error(f"Full trace in the log file: {LOGS}")
        logger.exception(e)

    finally:
        # End
        times = check_step.timings
        for i in range(len(times) - 1):
            logger.info(f"timing for STEP {i} - {time_to_hms(times[i], times[i+1])}")
        logger.info(f"Script ended, total time of {time_to_hms(times[0], perf_counter())}. \n")
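
A hedged sketch of the output layout that `main` assembles from its parameters, using the same string construction as the snippet; the concrete values and the output root are made up, and "minikm" is only a placeholder for `ml_model`:

import os.path as osp

k, window, n_clusters, ml_model = 4, 10000, 10, "minikm"      # hypothetical parameters
omit_folders = ("plant", "vertebrate")
folder_output = "/data/plot-me"                                # hypothetical output root

param_k_s = f"k{k}_s{window}"                                  # "k4_s10000"
o_omitted = "" if len(omit_folders) == 0 else "o" + "-".join(omit_folders)   # "oplant-vertebrate"
string_param = f"{ml_model}_b{n_clusters}_k{k}_s{window}_{o_omitted}"
folder_by_model = osp.join(folder_output, param_k_s, string_param)
path_model = osp.join(folder_by_model, f"model.{string_param}.pkl")
print(path_model)
# -> /data/plot-me/k4_s10000/minikm_b10_k4_s10000_oplant-vertebrate/model.minikm_b10_k4_s10000_oplant-vertebrate.pkl

This model.*.pkl path is the file that `bin_classify` in Example #4 scans for and parses back into its parameters.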