"""
#############################################################################
Sylvain @ GIS / Biopolis / Singapore
Sylvain Jun-Zhe RIONDET <*****@*****.**>
Started on 2019-12-11
Reads Binning Project
#############################################################################
common resources for biology-related functions and Classes
"""
import traceback

# todo: check if this logger works
import ete3.ncbi_taxonomy

from plot_me.tools import init_logger


logger = init_logger("bio")


# #############################################################################
# Methods for nucleotides manipulations
nucleotides = "ACGT"


def kmers_dic(n, choice=nucleotides):
    """ Build the dict of all n-mers over `choice`, with counts initialized to 0.0 """
    return {a: 0.0 for a in combinaisons(choice, n)}


def combinaisons(combi, n, instances=nucleotides):
    """ Recursively generate all combinations of length n over `instances` """
    if n == 1:
        return combi
    else:
        # NOTE: the body of this branch was truncated; assumed reconstruction of the recursive case:
        # build each n-mer by appending every nucleotide to each (n-1)-mer.
        return [c + i for c in combinaisons(combi, n - 1, instances) for i in instances]
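# --- Hedged usage sketch (not part of the original module): how kmers_dic() is expected to be
# --- used downstream, with a naive sliding-window count over a toy sequence. The sequence and
# --- the printed message are illustrative only.
if __name__ == "__main__":
    counts = kmers_dic(2)                     # {'AA': 0.0, 'AC': 0.0, ..., 'TT': 0.0} -- 4**2 = 16 keys
    toy_seq = "ACGTAC"
    for kmer in (toy_seq[i:i + 2] for i in range(len(toy_seq) - 1)):
        counts[kmer] += 1                     # count each 2-mer of the toy sequence
    print(f"counted {int(sum(counts.values()))} 2-mers over {len(counts)} possible dinucleotides")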
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
# from Bio.Seq import Seq
from sklearn.cluster import KMeans, MiniBatchKMeans
# from sklearn.decomposition import PCA
from tqdm import tqdm

# Import paths and constants for the whole project
from plot_me import LOGS
from plot_me.tools import ScanFolder, is_valid_directory, init_logger, create_path, scale_df_by_length, \
    time_to_hms, delete_folder_if_exists, bash_process, f_size
from plot_me.bio import kmers_dic, ncbi, seq_count_kmer, combinaisons, nucleotides


logger = init_logger('parse_DB')
CLASSIFIERS = (('kraken2', 'k', '35', 'l', '31', 's', '7'),
               ("centrifuge", ))


class Genome:
    """ Genome from RefSeq. Methods to split it into plasmid/genome and into segments
        SET K BEFORE ANY INSTANCE IS CREATED, with set_k_kmers()
    """
    categories = ["plasmid", "chloroplast", "scaffold", "contig", "chromosome",
                  "complete genome", "whole genome shotgun sequence",
""" First attempt to add test to project, using pytest python3 -m pytest -v python3 -m pytest -v --color=yes --log-level=5 | less -r Testing both plot_me.bio and plot_me.cython_module.cyt_ext """ from plot_me import bio from plot_me.cython_module import cyt_ext from plot_me.tools import init_logger import logging import numpy as np import pytest _ = init_logger(__package__) logger = logging.getLogger(__name__) cyt_ext.set_verbosity(5) # ###################### TESTING COMBINATIONS ###################### combinations = [ (2, [ "AA", "AC", "AG", "AT", "CA", "CC", "CG", "CT", "GA",
def bin_classify(list_fastq, path_report, path_database, classifier, full_DB=False, threads=cpu_count(),
                 f_record="~/logs/classify_records.csv", clf_settings="", drop_bin_threshold=DROP_BIN_THRESHOLD,
                 skip_clas=False, force_binning=False, no_cython=False):
    """ Bin (unless full_DB) and classify the reads of each FASTQ/FASTA file in list_fastq,
        then record timings and hash sizes to a csv file """
    _ = init_logger(__package__)  # initialize the global logger
    logger.info("\n*********************************************************************************************************")
    logger.info("**** Starting script **** \n ")
    global THREADS
    THREADS = threads

    if not no_cython:
        global cyt_ext, cython_is_there
        cyt_ext, cython_is_there = import_cython_mod()

    # preparing csv record file
    if not osp.isfile(f_record):
        os.makedirs(osp.dirname(f_record), exist_ok=True)
        with open(f_record, 'w', newline='') as csv_file:
            csv_writer = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            headers = ("FILE", "BINS_vs_FULL", "BINNING", "CLASSIFY", "TOTAL", "HASHES_SIZE", "NB_BINS",
                       "HASH_PATH", "HASH_NAME")
            csv_writer.writerow(headers)

    logger.info("let's classify reads!")

    # Find the model
    global K, BIN_NB, DROP_BIN_THRESHOLD
    if full_DB:
        path_model = "full"
        K = 0
        BIN_NB = 1
        # clusterer, bin_nb, k, w, omitted = (None, 1, None, None, None)
        path_to_hash = path_database
        if "hash.k2d" in path_to_hash:
            path_to_hash = osp.dirname(path_to_hash)
        if "hash.k2d" not in os.listdir(path_to_hash):
            raise FileNotFoundError(f"hash.k2d not found in folder: {path_to_hash}")
    else:
        path_model = ""
        for file in os.scandir(path_database):
            if file.name.startswith("model.") and file.name.endswith(".pkl"):
                path_model = file.path
                break
        assert osp.isfile(path_model), FileNotFoundError(f"didn't find the ML model in {path_database}... {path_model}")

        # Parse the model name to find parameters:
        basename = path_model.split("/model.")[1]
        clusterer, bin_nb, k, w, omitted, _ = re.split('_b|_k|_s|_o|.pkl', basename)
        K = int(k)
        BIN_NB = int(bin_nb)
        DROP_BIN_THRESHOLD = drop_bin_threshold if drop_bin_threshold != -1 else 1. / BIN_NB
        path_to_hash = osp.join(path_database, classifier, clf_settings)
        logger.debug(f"path_to_hash: {path_to_hash}")
        logger.debug(f"Found parameters: clusterer={clusterer}, bin number={BIN_NB}, k={K}, w={w}, omitted={omitted}")

    if cython_is_there:
        cyt_ext.set_verbosity(logging.INFO)
        cyt_ext.init_variables(K)

    # Set the folder with hash tables
    param = osp.basename(path_database)
    if param == "":
        param = osp.basename(path_database[:-1])
    logger.info(f"Assuming parameters are: {param}")

    t = {}  # recording time at each step
    for i, file in enumerate(list_fastq):
        try:
            assert osp.isfile(file), FileNotFoundError(f"file number {i} not found: {file}")
            if file.lower().endswith(".fastq"):
                bin_classify.format = "fastq"
            elif file.lower().endswith(".fasta"):
                bin_classify.format = "fasta"
            else:
                raise NotImplementedError("The file is neither ending with .fasta nor with .fastq")

            # setting time
            base_name = osp.basename(file)
            key = base_name
            t[key] = {}
            t[key]["start"] = perf_counter()
            logger.info(f"Opening fastq file ({i+1}/{len(list_fastq)}) {f_size(file)}, {base_name}")

            # Binning
            if not full_DB:
                ReadToBin.set_fastq_model_and_param(file, path_model, param, force_binning)
                ReadToBin.bin_reads()
                ReadToBin.sort_bins_by_sizes_and_drop_smalls()
                t[key]["binning"] = perf_counter()
                t[key]["reads_nb"] = ReadToBin.NUMBER_BINNED

            if not skip_clas:
                fastq_classifier = MockCommunity(
                    path_original_fastq=file, db_path=path_to_hash, full_DB=full_DB, folder_report=path_report,
                    path_binned_fastq=ReadToBin.outputs, classifier_name=classifier, param=param)
                fastq_classifier.classify()
                t[key]["classify"] = perf_counter()
                t[key]["hashes"] = fastq_classifier.hash_size
            # todo: process reports to have one clean one

        except Exception as e:
            logger.exception(e)
            logger.error(f"script crashed for file: {file}")

    records = []
    for key in t.keys():
        if 'classify' not in t[key].keys():
            break
        if "binning" in t[key]:
            t_binning = time_to_hms(t[key]['start'], t[key]['binning'], short=True)
            t_classify = time_to_hms(t[key]['binning'], t[key]['classify'], short=True)
            t_total = time_to_hms(t[key]['start'], t[key]['classify'], short=True)
            hashes = t[key]["hashes"]
            h_size = sum(hashes.values())
            logger.info(f"timings for file {key} / binning : {t_binning}, for {t[key]['reads_nb']} reads")
            logger.info(f"timings for file {key} / classify: {t_classify}, "
                        f"{len(hashes)} bins, total size of hashes loaded: {f_size(h_size)}")
        else:
            t_binning = time_to_hms(t[key]['start'], t[key]['start'], short=True)
            t_classify = time_to_hms(t[key]['start'], t[key]['classify'], short=True)
            t_total = t_classify
            hashes = t[key]["hashes"]
            h_size = sum(hashes.values())
            logger.info(f"timings for file {key} / classify: {time_to_hms(t[key]['start'], t[key]['classify'])}")

        # to CSV
        # todo: add precision / sensitivity / abundance
        row = (key, "full" if full_DB else "bins", t_binning, t_classify, t_total,
               f"{h_size / 10 ** 9:.2f}GB", f"{len(hashes)}", path_database, osp.basename(path_database))
        records.append(row)

    # Timings and to csv
    with open(f_record, 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerows(records)
    logger.info(f"Script ended, {len(t)} files processed \n")
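# --- Hedged sketch (illustration only, not part of the original script): the model file name encodes
# --- the binning parameters, and bin_classify() recovers them with the same re.split() pattern used above.
# --- The file name below is hypothetical; the real name is built by main() in parse_DB as
# --- "model.{clusterer}_b{bins}_k{k}_s{window}_{omitted}.pkl".
if __name__ == "__main__":
    import re  # already imported by the module above; repeated so the sketch stands alone

    example = "minikm_b10_k4_s10000_oplant-vertebrate.pkl"      # basename after the "model." prefix
    clusterer, bin_nb, k, w, omitted, _ = re.split('_b|_k|_s|_o|.pkl', example)
    print(clusterer, int(bin_nb), int(k), int(w), omitted)      # -> minikm 10 4 10000 plant-vertebrate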
def main(folder_database, folder_output, n_clusters, k, window, cores=cpu_count(), skip_existing="111110",
         early_stop=len(check_step.can_skip) - 1, omit_folders=("plant", "vertebrate"), path_taxonomy="",
         full_DB=False, k2_clean=False, ml_model=clustering_segments.models[0], classifier_param=CLASSIFIERS[0],
         verbose_lvl=30, no_cython=False, combine_rc=True):
    """ Pre-processing of the RefSeq database: split genomes into windows, then count their k-mers.
        Second part: load all the k-mer counts into one single Pandas dataframe.
        Third part: train a clustering algorithm on the k-mer frequencies of these genomes' windows.
        folder_database : RefSeq root folder
        folder_output   : empty root folder to store kmer counts
    """
    _ = init_logger(__package__)  # initialize the global logger
    logger.info("\n*********************************************************************************************************")
    logger.info("**** Starting script **** \n ")
    try:
        if not no_cython:
            global cyt_ext, cython_is_there
            cyt_ext, cython_is_there = import_cython_mod()
            if cython_is_there:
                logger.info("Cython is available, initializing variables")
                cyt_ext.set_verbosity(verbose_lvl)
                cyt_ext.init_variables(k)
            else:
                logger.info("Falling back on pure python")

        # Common folder name keeping parameters
        param_k_s = f"k{k}_s{window}"
        o_omitted = "" if len(omit_folders) == 0 else "o" + "-".join(omit_folders)
        folder_intermediate_files = osp.join(folder_output, param_k_s, "kmer_counts")

        # Parameters
        main.folder_database = folder_database
        main.omit_folders = omit_folders
        main.k = k
        main.w = window
        main.cores = cores

        # Set all columns type
        cols_types = {
            "taxon": int, "category": 'category',
            "start": int, "end": int,
            "name": 'category', "description": 'category', "fna_path": 'category',
        }
        codons = codons_without_rev_comp(main.k) if combine_rc else combinations(main.k)
        for key in codons:
            cols_types[key] = float32
        main.cols_types = cols_types

        check_step.timings = [perf_counter(), ]  # log time spent
        check_step.step_nb = 0                   # For the decorator to know which steps have been done
        check_step.early_stop = early_stop
        check_step.can_skip = skip_existing      # Set the skip variable for the decorator of each step

        # Check classifier/kraken2's parameters
        param, s_param = classifier_param_checker(classifier_param)
        # Check that taxonomy wasn't forgotten
        if '0' in check_step.can_skip[5:] and check_step.early_stop >= 5:
            assert osp.isdir(path_taxonomy), NotADirectoryError

        if full_DB:
            # Run kraken2 on the full RefSeq, without binning, for reference
            path_full_hash = osp.join(folder_output, "no-binning", o_omitted, param['name'], s_param)
            kraken2_full_add_lib(folder_database, path_full_hash)
            kraken2_full_build_hash(path_taxonomy, path_full_hash, param)
            if k2_clean:
                kraken2_clean(path_full_hash, 1)

        else:
            # KMER COUNTING
            # get kmer distribution for each window of each genome, parallel folder with same structure
            path_individual_kmer_counts = osp.join(folder_intermediate_files, f"counts.k{k}_s{window}")
            scan_RefSeq_kmer_counts(folder_database, path_individual_kmer_counts)

            # combine all kmer distributions into one single file
            path_stacked_kmer_counts = osp.join(folder_intermediate_files, f"all-counts.k{k}_s{window}_{o_omitted}.csv")
            append_genome_kmer_counts(path_individual_kmer_counts, path_stacked_kmer_counts)

            # CLUSTERING
            # From kmer distributions, use clustering to set the bins per segment
            string_param = f"{ml_model}_b{n_clusters}_k{main.k}_s{main.w}_{o_omitted}"
            folder_by_model = osp.join(folder_output, param_k_s, string_param)
            path_model = osp.join(folder_by_model, f"model.{string_param}.pkl")
            path_segments_clustering = osp.join(folder_by_model, f"segments-clustered.{string_param}.pd")
            clustering_segments(path_stacked_kmer_counts, path_segments_clustering, path_model, n_clusters, ml_model)

            # CREATING THE DATABASES
            # create the DB for each bin (copy parts of each .fna genomes into a folder with taxonomy id)
            path_refseq_binned = osp.join(folder_by_model, "RefSeq_binned")
            split_genomes_to_bins(path_segments_clustering, path_refseq_binned, n_clusters)

            # Run kraken2-build add library
            path_bins_hash = osp.join(folder_by_model, param['name'], s_param)
            add_library(path_refseq_binned, path_bins_hash, n_clusters, param['name'])

            # Run kraken2-build make hash tables
            build_indexes(path_taxonomy, path_bins_hash, n_clusters, param)

            # Cleaning
            if k2_clean and "kraken2" in param['name']:
                kraken2_clean(path_bins_hash, n_clusters)

    except KeyboardInterrupt:
        check_step.timings.append(perf_counter())  # log time for the last step that has been interrupted
        logger.error("User interrupted")
        logger.error(traceback.format_exc())
    except Exception as e:
        check_step.timings.append(perf_counter())  # log time for the last step that has been interrupted
        logger.error(f"Trace the log file there: {LOGS}")
        logger.exception(e)

    finally:
        # End
        times = check_step.timings
        for i in range(len(times) - 1):
            logger.info(f"timing for STEP {i} - {time_to_hms(times[i], times[i+1])}")
        logger.info(f"Script ended, total time of {time_to_hms(times[0], perf_counter())}. \n")