def branch( json_file: Path = typer.Option(..., "-j", "--json"), force: bool = typer.Option(False, help="force overwrite."), loglevel: LogLevel = LogLevel.INFO, name: str = typer.Argument(..., help="New name of branched project."), ): """ Branch to create a new named project JSON file. kmerkit branch -j test.json test2 """ typer.secho( "branch: write kmers and/or counts to a file.", fg=typer.colors.MAGENTA, bold=False, ) set_loglevel(loglevel) try: name = name.strip(".json") proj = Project.parse_file(json_file).dict() proj['name'] = name new_json_file = os.path.join(os.path.dirname(json_file), name + ".json") if os.path.exists(new_json_file): if not force: msg = "JSON file already exists. Use force." logger.error(msg) raise KmerkitError(msg) with open(new_json_file, 'w') as out: out.write(Project(**proj).json(indent=4)) logger.info(f"wrote new branched project to {new_json_file}") except KmerkitError: typer.Abort()
def kdump( json_file: Path = typer.Option(..., "-j", "--json"), min_depth: int = typer.Option(1, help="filter to >= min-depth"), max_depth: int = typer.Option(100000, help="filter to <= max-depth"), write_kmers: bool = typer.Option(True), write_counts: bool = typer.Option(True), loglevel: LogLevel = LogLevel.INFO, samples: List[str] = typer.Argument( ..., help="One or more sample names in kcount database"), ): """ Write kmers and/or counts to a file from a KMC database. kmerkit dump -j test.json --min-depth 5 sample1 """ typer.secho( "dump: write kmers and/or counts to a file.", fg=typer.colors.MAGENTA, bold=False, ) set_loglevel(loglevel) try: Kdump( json_file, samples, min_depth, max_depth, write_kmers, write_counts, ).run() except KmerkitError: typer.Abort()
def trim( json_file: Path = typer.Option(..., "-j", "--json"), subsample: float = typer.Option(None, help="subsample to N reads"), workers: int = typer.Option(None, help="N worker processes"), # threads: int = typer.Option(None, help="N threads per worker"), force: bool = typer.Option(False, help="overwrite existing"), loglevel: LogLevel = LogLevel.INFO, ): """ Trim, filter, or subsample reads using fastp. kmerkit trim -j test.json --subsample 1000000 --cores 20 """ typer.secho( "trim: trim and filter reads using fastp (default settings)", fg=typer.colors.MAGENTA, bold=False, ) set_loglevel(loglevel) try: ktr = Ktrim(json_file=json_file, subsample=subsample) ktr.run(force=force, workers=workers) #, threads=threads) except KmerkitError: typer.Abort()
def count( json_file: Path = typer.Option(..., "-j", "--json"), kmer_size: int = typer.Option(17, "-k", "--kmer-size", min=2), min_depth: int = typer.Option(1, min=1), max_depth: int = typer.Option(int(1e9), min=1), max_count: int = typer.Option(255), canonical: bool = typer.Option(False), workers: int = typer.Option(1, help="N worker processes"), threads: int = typer.Option(None, help="N threads per worker"), force: bool = typer.Option(False, help="overwrite existing"), max_ram_per_worker: int = typer.Option(12, help="max RAM in Gb"), loglevel: LogLevel = LogLevel.INFO, ): """ Count kmers in fastq/a files using KMC. kcount will write kmer database files for each sample to <workdir>/<name>_kcount_.kmc_[suf,pre]. Example: kmerkit count -j test.json --kmer-size 35 --min-depth 5 """ # report the module typer.secho( "count: counting kmers from fastq/a files using KMC", fg=typer.colors.MAGENTA, bold=False, ) # set the loglevel set_loglevel(loglevel) typer.secho( f"loglevel: {loglevel}, logfile: STDERR", fg=typer.colors.MAGENTA, bold=False, ) # run the command counter = Kcount( str(json_file), kmer_size=kmer_size, min_depth=min_depth, max_depth=max_depth, max_count=max_count, canonical=canonical, ) # print(counter.statsdf.T) try: counter.run( threads=threads, workers=workers, force=force, max_ram=max_ram_per_worker, ) except KmerkitError as exc: typer.Abort(exc)
def init( name: str = typer.Option("test", "-n", "--name", help="Project name prefix"), workdir: str = typer.Option(tempfile.gettempdir(), "-w", "--workdir", help="Project directory"), delim: str = typer.Option("_", help="sample name delimiter"), loglevel: LogLevel = typer.Option(LogLevel.INFO, help="logging level"), force: bool = typer.Option(False, help="overwrite existing"), data: List[Path] = typer.Argument( ..., show_default=False, exists=True, # <- prob should be false for moving json files. dir_okay=True, file_okay=True, resolve_path=False, allow_dash=True, help=("File path(s) to input fastq/a data files")), ): """ Initialize a kmerkit project from fastq/a input files. Creates a JSON project file in <workdir>/<name>.json. Sample names are parsed from input filenames by splitting on the last occurrence of the optional 'delim' character (default is '_'). Paired reads are automatically detected from _R1 and _R2 in names. Multiple files can be selected using regular expressions for the data filepath input, or by listing multiple filepaths. Examples: kmerkit init -n test -w /tmp ./data/fastqs/*.gz\n kmerkit init -n test -w /tmp ./data-1/A.fastq ./data-2/B.fastq """ # parse the fastq_dict from string set_loglevel(loglevel) fastq_dict = get_fastq_dict_from_path(None, data, delim) try: init_project(name=name, workdir=workdir, fastq_dict=fastq_dict, force=force) except KmerkitError: typer.Exit()
def extract( json_file: Path = typer.Option(..., '-j', '--json'), min_kmers_per_read: int = typer.Option(1), paired_union: bool = typer.Option(True), loglevel: LogLevel = LogLevel.INFO, force: bool = typer.Option(False, help="overwrite existing"), workers: int = typer.Option(1, help="N worker processes"), threads: int = typer.Option(None, help="N threads per worker"), samples: Optional[List[str]] = typer.Argument(None), ): """ Extract reads from fastq/a files containing target kmers. Reads must contain at least 'min-kmers-per-read' kmers in them. If 'keep-paired' then reads are returns as paired-end. Samples can be entered as arguments in three possible ways: (1) enter sample names that are in the init database; (2) enter an integer for group0 or group1 from the kfilter database; (3) enter a file path to one or more fastq files. kmerkit extract -j test.json A B C D # select from init\n kmerkit extract -j test.json 1 # select from filter group\n kmerkit extract -j test.json ./data/*.gz # select new files\n """ typer.secho( "extract: extract reads containing target kmers", fg=typer.colors.MAGENTA, bold=False, ) set_loglevel(loglevel) try: kex = Kextract( json_file=json_file, samples=samples, min_kmers_per_read=min_kmers_per_read, paired_union=paired_union, ) kex.run(force=force, workers=workers, threads=threads) except KmerkitError: typer.Abort() except KeyboardInterrupt: typer.Abort("interrupted")
def kfilter( json_file: Path = typer.Option(..., '-j', '--json'), group0: Optional[List[str]] = typer.Option(None, '--group0', '-0'), group1: Optional[List[str]] = typer.Option(None, '--group1', '-1'), traits_file: Optional[Path] = typer.Option(None, '--traits-file'), min_cov: float = typer.Option(0.0), min_map: Tuple[float, float] = typer.Option((0.0, 1.0)), max_map: Tuple[float, float] = typer.Option((0.0, 1.0)), loglevel: LogLevel = LogLevel.INFO, force: bool = typer.Option(False, help="overwrite existing"), # min_map_canon ): """ Filter kmers based on frequencies among grouped samples. The filter kmers (group 0) are subtracted from the target kmers (group 1) to find the final target kmer set. You must enter sample names (or regex patterns) to --group0 and --group1 to assign samples to each group. Kmers in each group are filtered by min-map and max-map ranges... """ # report the module typer.secho( "filter: filter kmers based on frequency in case/control groups", fg=typer.colors.MAGENTA, bold=False, ) # set the loglevel set_loglevel(loglevel) typer.secho( f"loglevel: {loglevel}, logfile: STDERR", fg=typer.colors.MAGENTA, bold=False, ) # fake data if traits_file: traits_dict = get_traits_dict_from_csv(traits_file) else: traits_dict = {0: [], 1: []} traits_dict[0].extend(group0) traits_dict[1].extend(group1) # load database with phenotypes data try: kgp = Kfilter( json_file=json_file, traits_dict=traits_dict, min_cov=min_cov, min_map={ 0: min_map[0], 1: min_map[1] }, max_map={ 0: max_map[0], 1: max_map[1] }, min_map_canon={ 0: 0.0, 1: 0.5 }, ) kgp.run(force=force) except KmerkitError: typer.Abort()