Example #1
def kraken2_full_add_lib(path_refseq, path_output):
    """ Build the hash table with the same genomes, but without binning, for comparison """
    # todo: adapt for centrifuge as well
    delete_folder_if_exists(path_output)
    create_path(path_output)
    add_file_with_parameters(
        path_output, add_description="no binning database for comparison")

    logger.warning(
        "DO NOT INTERRUPT this process, or you will have to restart from scratch."
    )
    # Add genomes to the kraken2 library
    for folder in os.scandir(path_refseq):
        if not osp.isdir(folder.path):
            continue
        if any(to_omit in folder.name for to_omit in main.omit_folders):
            logger.info(f"skipping {folder.name}")
            continue
        cmd = [
            "find", folder.path, "-name", "'*.fna'", "-print0", "|",
            "xargs", "-P", f"{main.cores}", "-0", "-I{}", "-n1",
            "kraken2-build", "--add-to-library", "{}", "--db", path_output
        ]
        bash_process(" ".join(cmd), "adding genomes for kraken2 libraries")
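The find/xargs pipe above fans out one kraken2-build call per genome file. The same fan-out can be sketched in pure Python, assuming only that kraken2-build is on the PATH (the helper name and paths are hypothetical):

import subprocess
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

def add_all_fna_to_library(genome_dir, db_path, workers=4):
    """ Add every .fna under genome_dir to a kraken2 library, in parallel """
    def add_one(fna):
        # same call the find | xargs pipe issues for each file
        subprocess.run(["kraken2-build", "--add-to-library", str(fna),
                        "--db", str(db_path)], check=True)
    with ThreadPoolExecutor(max_workers=workers) as pool:
        list(pool.map(add_one, Path(genome_dir).rglob("*.fna")))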
Example #2
def scan_RefSeq_kmer_counts(scanning, folder_kmers, stop=-1):
    """ Scan through RefSeq, split genomes into segments, count their k-mer, save in similar structure
        Compatible with 2019 RefSeq format hopefully
    """
    create_path(folder_kmers)
    # scanning folder Class set up:
    # todo: change the kmer_count into the k_s_ notation
    ScanFolder.set_folder_scan_options(scanning=scanning,
                                       target=folder_kmers,
                                       ext_find=(".fastq", ".fq", ".fna"),
                                       ext_check=".taxon",
                                       ext_create=f".{main.k}mer_count.pd",
                                       skip_folders=main.omit_folders)

    logger.info(
        "scanning through all genomes in refseq to count kmer distributions " +
        scanning)

    # Count in parallel. islice() limits the iterable to the first 'stop' items (handy for test runs)
    Genome.set_k_kmers(main.k)
    with Pool(main.cores) as pool:
        results = list(
            tqdm(pool.imap(
                parallel_kmer_counting,
                islice(ScanFolder.tqdm_scan(with_tqdm=False),
                       stop if stop > 0 else None)),
                 total=ScanFolder.count_root_files(),
                 dynamic_ncols=True))

    logger.info(f"{len(results)} genomes have been scanned and kmer counted.")
Example #3
def create_n_folders(path, n, delete_existing=False):
    """ Create the sub-folders of bins from 0/ to n/ """
    logger.debug(f"creates {n} folder under {path}")
    for i in range(n):
        new_path = osp.join(path, str(i))
        if delete_existing and osp.isdir(new_path):
            shutil.rmtree(new_path)
        create_path(new_path)
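create_path is not shown in these examples; assuming it simply wraps os.makedirs, a self-contained equivalent of this helper would be:

import os
import shutil
import os.path as osp

def create_n_folders(path, n, delete_existing=False):
    """ Create the bin sub-folders 0/ to (n-1)/ under path """
    for i in range(n):
        new_path = osp.join(path, str(i))
        if delete_existing and osp.isdir(new_path):
            shutil.rmtree(new_path)           # wipe the stale bin first
        os.makedirs(new_path, exist_ok=True)  # assumed behaviour of create_path

For example, create_n_folders("/tmp/bins", 4) creates /tmp/bins/0 through /tmp/bins/3.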
Example #4
    def set_fastq_model_and_param(cls, path_fastq, path_model, param, force_binning):
        assert osp.isfile(path_fastq), f"{path_fastq} cannot be found"
        # todo: load the parameter file from parse_DB.py instead of parsing string.... parameters_RefSeq_binning.txt
        cls.PARAM = param
        cls.FASTQ_PATH = path_fastq
        folder, file_base = osp.split(osp.splitext(path_fastq)[0])
        # output folder, will host one file for each bin
        cls.FASTQ_BIN_FOLDER = osp.join(folder, param)

        # Compute expected length
        cls.DIM_COMBINED = n_dim_rc_combined(K)

        cls.total_reads = reads_in_file(cls.FASTQ_PATH)

        # skip if reads already binned
        if osp.isdir(cls.FASTQ_BIN_FOLDER):
            total_binned_reads = 0
            if not force_binning:
                # Count the reads already binned, since re-binning wasn't forced
                for path in Path(cls.FASTQ_BIN_FOLDER).rglob("*bin-*.fastq"):
                    str_path = path.as_posix()
                    total_binned_reads += reads_in_file(str_path)
                    # '.' matches any character here: splits "<base>.bin-<key>.fastq" around the bin index
                    _, key, _ = re.split('.bin-|.fastq', str_path)
                    cls.outputs[int(key)] = str_path
                cls.logger.debug(f"A folder has been detected, and holds in total {total_binned_reads} reads, "
                                 f"compared to the {cls.total_reads} in the original fastq file.")

            if force_binning or cls.total_reads != total_binned_reads:
                last_modif = dt.fromtimestamp(osp.getmtime(cls.FASTQ_BIN_FOLDER))
                save_folder = f"{cls.FASTQ_BIN_FOLDER}_{last_modif:%Y-%m-%d_%H-%M}"
                cls.logger.warning(f"Folder existing, renaming to avoid losing files: {save_folder}")
                os.rename(cls.FASTQ_BIN_FOLDER, save_folder)
            else:
                # Read counts match and re-binning is not forced: mark the file as already binned
                cls.file_has_been_binned = True
        create_path(cls.FASTQ_BIN_FOLDER)

        cls.FILEBASE = file_base
        if path_model != "full":
            cls.KMER = kmers_dic(K)
            with open(path_model, 'rb') as f:
                cls.MODEL = pickle.load(f)
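The bin index is recovered from each binned-fastq name with re.split above; a minimal sketch of that parsing step on a hypothetical path:

import re

str_path = "/data/sample/minikm/sample.bin-7.fastq"  # hypothetical binned file
# the unescaped '.' matches any character; r'\.bin-|\.fastq' would be stricter
_, key, _ = re.split('.bin-|.fastq', str_path)
assert int(key) == 7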
Example #5
def clustering_segments(path_kmer_counts,
                        output_pred,
                        path_model,
                        n_clusters,
                        model_name="minikm"):
    """ Given a database of segments of genomes in fastq files, split it in n clusters/bins """
    assert model_name in clustering_segments.models, f"model {model_name} is not implemented"
    # Paths
    create_path(output_pred)
    create_path(path_model)
    k = main.k
    w = main.w

    # https://www.codementor.io/@guidotournois/4-strategies-to-deal-with-large-datasets-using-pandas-qdw3an95k
    # filename = "data.csv"
    # n = sum(1 for line in open(filename)) - 1  # Calculate number of rows in file
    # s = n // 10  # sample size of 10%
    # skip = sorted(random.sample(range(1, n + 1), n - s))  # n+1 to compensate for header
    # df = pandas.read_csv(filename, skiprows=skip)

    path_pkl_kmer_counts = path_kmer_counts.replace(".csv", ".pd")
    if osp.isfile(path_pkl_kmer_counts):
        logger.info(
            f"Clustering the genomes' segments into {n_clusters} bins. Loading combined kmer counts "
            f"(file size: {osp.getsize(path_pkl_kmer_counts)/10**9:.2f} GB) ..."
        )
        df = pd.read_pickle(path_pkl_kmer_counts)
    else:
        logger.info(
            f"Clustering the genomes' segments into {n_clusters} bins. Loading combined kmer counts "
            f"(file size: {osp.getsize(path_kmer_counts)/10**9:.2f} GB) ...")
        df = pd.read_csv(path_kmer_counts, dtype=main.cols_types)
        logger.info(
            f"save pickle copy for faster loading {path_pkl_kmer_counts}")
        # Soon DEPRECATED: will be set by the dtypes at load time
        # (need to set these columns as categories again for now)
        df["category"] = df["category"].astype('category')
        df["name"] = df["name"].astype('category')
        df["fna_path"] = df["fna_path"].astype('category')
        df["description"] = df["description"].astype('category')
        df.to_pickle(path_pkl_kmer_counts)

    cols_kmers = df.columns[-4**k:]
    cols_spe = df.columns[:-4**k]
    logger.debug(f"cols_kmers={cols_kmers[:5]} {cols_kmers[-5:]}")

    # ## 1 ## Scaling by length and kmers
    df_mem = df.memory_usage(deep=False).sum()
    logger.info(
        f"Kmer counts loaded, scaling the values to the length of the segments. "
        f"DataFrame size: {df_mem/10**9:.2f} GB - shape: {df.shape}")

    # todo: save intermediate data
    scale_df_by_length(df, cols_kmers, k, w)

    # ## 2 ## Could add PCA

    # Model learning
    logger.info(f"Data takes {df_mem/10**9:.2f} GB. Training {model_name}...")
    if model_name == "kmeans":
        ml_model = KMeans(n_clusters=n_clusters,
                          n_jobs=main.cores,
                          random_state=3)
    elif model_name == "minikm":
        ml_model = MiniBatchKMeans(n_clusters=n_clusters,
                                   random_state=3,
                                   batch_size=1000,
                                   max_iter=100)
    else:
        logger.error(f"No model defined for {model_name}.")
        raise NotImplementedError

    ml_model.fit(df[cols_kmers])

    # Model saving
    with open(path_model, 'wb') as f:
        pickle.dump(ml_model, f)
    logger.info(
        f"{model_name} model saved for k={k} s={w} at {path_model}, now predicting bins for each segment..."
    )

    # ## 3 ## Predict the bin of each segment
    predicted = ml_model.predict(df[cols_kmers])
    df["cluster"] = predicted

    df[list(cols_spe) + ["cluster"]].to_pickle(output_pred)
    logger.info(
        f"Defined {n_clusters} clusters, assignments here: {output_pred} with ML model {model_name}."
    )
    return
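The fit / pickle / reload / predict cycle above can be exercised end to end on synthetic counts; a minimal sketch with made-up data (1000 segments, k=4, hence 4**4 = 256 k-mer columns; the model path is hypothetical):

import pickle
import numpy as np
import pandas as pd
from sklearn.cluster import MiniBatchKMeans

rng = np.random.default_rng(3)
df = pd.DataFrame(rng.random((1000, 256)))   # fake scaled k-mer frequencies

model = MiniBatchKMeans(n_clusters=10, random_state=3,
                        batch_size=1000, max_iter=100)
model.fit(df)

with open("minikm.pkl", "wb") as f:          # hypothetical model path
    pickle.dump(model, f)
with open("minikm.pkl", "rb") as f:
    reloaded = pickle.load(f)

df["cluster"] = reloaded.predict(df)
print(df["cluster"].value_counts().head())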