def train_model(working_dir, viral_ftrfile, nonviral_ftrfile, balanced, jobs,
                use_conda_off, snakemake_args):
    '''Training customized classifier model.

    '''
    # NOTE(review): the docstring above presumably doubles as CLI help text,
    # so parameter notes are kept in comments instead.
    #
    # Parameters (as used below):
    #   working_dir      -- passed to `snakemake --directory`
    #   viral_ftrfile    -- forwarded as config key `Viral_ftrfile`
    #   nonviral_ftrfile -- forwarded as config key `Nonviral_ftrfile`
    #   balanced         -- forwarded as config key `Balanced`; None means
    #                       False
    #   jobs             -- forwarded both as config key `Jobs` and `--jobs`
    #   use_conda_off    -- when true, `--use-conda`/`--conda-prefix` are
    #                       left out of the command
    #   snakemake_args   -- sequence of extra arguments appended verbatim
    #
    # Exits the process with status 1 when the snakemake command fails.
    DEFAULT_CONFIG = get_default_config()

    if balanced is None:
        balanced = False

    cmd = ('snakemake --snakefile {snakefile} '
           '--directory {working_dir} '
           '--config '
           'Viral_ftrfile={viral_ftrfile} '
           'Nonviral_ftrfile={nonviral_ftrfile} '
           'Balanced={balanced} '
           'Jobs={jobs} '
           '--jobs {jobs} --rerun-incomplete --latency-wait 600 '
           '--nolock --quiet {use_conda_off} {conda_prefix} '
           '{add_args} {args}').format(
        snakefile=get_snakefile('rules/train-model.smk'),
        working_dir=working_dir,
        viral_ftrfile=viral_ftrfile,
        nonviral_ftrfile=nonviral_ftrfile,
        balanced=balanced,
        jobs=jobs,
        use_conda_off='' if use_conda_off else '--use-conda',
        conda_prefix='' if use_conda_off else '--conda-prefix {}'.format(
            os.path.join(DEFAULT_CONFIG['DBDIR'], 'conda_envs')),
        # '--' is inserted unless the first extra argument is itself an
        # option (starts with '-').
        add_args=('' if snakemake_args
                  and snakemake_args[0].startswith('-') else '--'),
        args=' '.join(snakemake_args),
    )
    logging.info('Executing: %s', cmd)
    try:
        subprocess.run(cmd, check=True, shell=True)
    except subprocess.CalledProcessError:
        # Exit with a plain failure status instead of a Python traceback.
        # sys.exit is used because the bare `exit` helper is injected by the
        # `site` module and is not guaranteed to exist.
        sys.exit(1)
# Standard-library and third-party modules used by this script.
import os
import screed
import numpy as np
import pandas as pd
import click
from ruamel.yaml import YAML

# Walk up from this file: its directory, that directory's parent, and the
# parent of that; the last one is appended to sys.path so that the
# `virsorter` package import below can be resolved.
script_dir = os.path.dirname(os.path.abspath(__file__))
snakefile_dir = os.path.dirname(script_dir)
pkg_dir = os.path.dirname(snakefile_dir)
# NOTE(review): `sys` is used here but no `import sys` is visible in the
# import lines above -- confirm it is imported earlier in the file.
sys.path.append(pkg_dir)
from virsorter.config import get_default_config, set_logger

# Values read once from the default configuration at import time.
DEFAULT_CONFIG = get_default_config()
D = DEFAULT_CONFIG['GROUP_INFO']
DEFAULT_MIN_SIZE_ALLOWED_WO_HALLMARK_GENE = \
        DEFAULT_CONFIG['DEFAULT_MIN_SIZE_ALLOWED_WO_HALLMARK_GENE']

# Lets click accept both -h and --help.
CONTEXT_SETTINGS = {'help_option_names': ['-h', '--help']}
# Decorator stack declaring three boolean flags and three positional path
# arguments (config, intable, inseqfile).
# NOTE(review): the function these decorators are meant for is not visible
# in this view -- confirm against the `def` that follows.
@click.command(context_settings=CONTEXT_SETTINGS)
@click.option('--hallmark-required', is_flag=True, default=False,
              help='require hallmark gene')
@click.option('--hallmark-required-on-short', is_flag=True, default=False,
              help='require hallmark gene on short seqs')
@click.option('--viral-gene-required', is_flag=True, default=False,
              help='require viral gene')
@click.argument('config', type=click.Path())
@click.argument('intable', type=click.Path())
@click.argument('inseqfile', type=click.Path())
def config(show, show_source, init_source, db_dir, set, get):
    '''CLI for managing configurations.

    There are many configurations kept in "template-config.yaml" in source
    code directory or "~/.virsorter" (when source code directory is not
    writable for user). This file can be located with
    `virsorter config --show-source`. You can set the configurations with
    `virsorter config --set KEY=VAL`. Alternatively, you can edit in the
    configuration file ("template-config.yaml") directly.

    '''
    # Parameters (as used below):
    #   show        -- dump the whole configuration to stdout
    #   show_source -- print the path of the configuration file
    #   init_source -- initialize the configuration template; needs db_dir
    #   db_dir      -- database directory recorded by --init-source
    #   set         -- comma separated KEY=VAL items; nested keys use '.'
    #   get         -- comma separated keys; nested keys use '.'
    #
    # Every branch that handles an option ends with sys.exit(); status 1 is
    # used for every error reported through logging.critical.
    from virsorter.config import (TEMPLATE, SRC_CONFIG_DIR, USER_CONFIG_DIR,
                                  init_config_template)

    if init_source:
        if db_dir is None:
            logging.critical('--db-dir is required for --init-source')
            sys.exit(1)
        if not os.path.isdir(db_dir):
            mes = (f'--db-dir {db_dir} does NOT exist yet; Make sure it '
                   'is created later\n')
            logging.warning(mes)
        db_dir = os.path.abspath(db_dir)
        init_config_template(SRC_CONFIG_DIR, USER_CONFIG_DIR, db_dir)
        sys.exit(0)

    if not os.path.isfile(TEMPLATE):
        mes = ('config file "template-config.yaml" has not been '
               'initialized yet; Please use '
               '`virsorter config --init-source --db-dir PATH` to initialize')
        logging.critical(mes)
        sys.exit(1)

    cfg = get_default_config()

    if show:
        YAML().dump(cfg, sys.stdout)
        sys.exit(0)

    if show_source:
        sys.stdout.write(f'config file path: {TEMPLATE}\n')
        sys.exit(0)

    if get is not None:
        for var in (piece.strip() for piece in get.split(',')):
            node = cfg
            for key in var.split('.'):
                key = key.strip()
                try:
                    node = node[key]
                except (KeyError, TypeError):
                    # TypeError: the path descends into a value that is not
                    # a mapping (e.g. a string or a number).
                    mes = f'{key} is not a key in config file ({TEMPLATE})'
                    logging.critical(mes)
                    sys.exit(1)
            sys.stdout.write(f'{var}: {node}\n')
        sys.exit(0)

    if set is not None:
        for item in (piece.strip() for piece in set.split(',')):
            # partition() keeps any further '=' inside the value and lets a
            # missing '=' be reported instead of raising ValueError.
            var, sep, val = item.partition('=')
            if not sep:
                mes = f'"{item}" is not in KEY=VAL format'
                logging.critical(mes)
                sys.exit(1)
            var = var.strip()
            val = val.strip()
            keys = [key.strip() for key in var.split('.')]

            # Walk down to the mapping that holds the last key.
            node = cfg
            for key in keys[:-1]:
                try:
                    node = node[key]
                except (KeyError, TypeError):
                    mes = f'{key} is not a key in config file ({TEMPLATE})'
                    logging.critical(mes)
                    sys.exit(1)

            last_key = keys[-1]
            try:
                old_val = node[last_key]
            except (KeyError, TypeError):
                mes = f'{last_key} is not a key in config file ({TEMPLATE})'
                logging.critical(mes)
                sys.exit(1)

            # Keep the type of the value that is being replaced.
            # NOTE(review): bool is a subclass of int, so boolean settings
            # also go through the integer branch -- confirm that is wanted.
            if isinstance(old_val, int):
                try:
                    val = int(val)
                except ValueError:
                    mes = f'{var} is supposed to be an integer'
                    logging.critical(mes)
                    sys.exit(1)
            elif isinstance(old_val, float):
                try:
                    val = float(val)
                except ValueError:
                    mes = f'{var} is supposed to be a float'
                    logging.critical(mes)
                    sys.exit(1)
            # only convert to abspath when the old one exists
            # since sometimes just want to set relative path
            elif isinstance(old_val, str) and os.path.exists(old_val):
                val = os.path.abspath(val)

            node[last_key] = val
            sys.stdout.write(f'{var}: {old_val} ==> {val}\n')

        with open(TEMPLATE, 'w') as fw:
            YAML().dump(cfg, fw)

        sys.exit(0)
def train_feature(working_dir, seqfile, hmm, hallmark, prodigal_train,
                  frags_per_genome, min_length, max_orf_per_seq,
                  genome_as_bin, jobs, use_conda_off, snakemake_args):
    '''Training features for customized classifier.

    Executes a snakemake workflow to do the following:
    1) prepare random DNA fragments from viral and nonviral genome data
    2) extract feature from random DNA fragments to make ftrfile

    '''
    # Parameters (as used below):
    #   working_dir      -- passed to `snakemake --directory`
    #   seqfile          -- iterable of file name patterns (globbed here)
    #   hmm, hallmark, prodigal_train
    #                    -- forwarded as config keys `Hmm`, `Hallmark` and
    #                       `Rbs`; None is sent as the string 'NA'
    #   frags_per_genome, min_length, max_orf_per_seq, genome_as_bin
    #                    -- forwarded as config keys
    #   jobs             -- forwarded as `--jobs`
    #   use_conda_off    -- when true, `--use-conda`/`--conda-prefix` are
    #                       left out of the command
    #   snakemake_args   -- sequence of extra arguments appended verbatim
    #
    # Exits the process with status 1 when no file matches any pattern or
    # when the snakemake command fails.
    cwd = os.getcwd()
    matched_files = []
    abs_patterns = []
    for pat in seqfile:
        # Relative patterns are anchored at the current directory because
        # snakemake is run with a different --directory.
        abs_pat = pat if os.path.isabs(pat) else os.path.join(cwd, pat)
        matched_files.extend(glob.glob(pat))
        abs_patterns.append(abs_pat)

    # Validate the input before loading the configuration so that a bad
    # pattern is reported right away.
    if not matched_files:
        mes = 'No files match {}'.format(' '.join(seqfile))
        logging.critical(mes)
        sys.exit(1)

    mes = '{} seqfiles are used for training features'.format(
        len(matched_files))
    logging.info(mes)

    DEFAULT_CONFIG = get_default_config()

    if hmm is None:
        hmm = 'NA'
    if hallmark is None:
        hallmark = 'NA'
    if prodigal_train is None:
        prodigal_train = 'NA'

    cmd = ('snakemake --snakefile {snakefile} '
           '--directory {working_dir} '
           '--config Viral_seqfile="{seqfile}" '
           'Hmm={hmm} '
           'Hallmark={hallmark} '
           'Rbs={prodigal_train} '
           'Min_length={min_length} '
           'Max_orf_per_seq={max_orf_per_seq} '
           'Viral_genome_as_bin={genome_as_bin} '
           'Fragments_per_genome={frags_per_genome} '
           '--jobs {jobs} --rerun-incomplete --latency-wait 600 '
           '--nolock --quiet {use_conda_off} {conda_prefix} '
           '{add_args} {args}').format(
        snakefile=get_snakefile('rules/train-feature.smk'),
        working_dir=working_dir,
        # The patterns themselves (not the matched files) are handed over.
        seqfile=' '.join(abs_patterns),
        hmm=hmm,
        hallmark=hallmark,
        prodigal_train=prodigal_train,
        min_length=min_length,
        max_orf_per_seq=max_orf_per_seq,
        genome_as_bin=genome_as_bin,
        frags_per_genome=frags_per_genome,
        jobs=jobs,
        use_conda_off='' if use_conda_off else '--use-conda',
        conda_prefix='' if use_conda_off else '--conda-prefix {}'.format(
            os.path.join(DEFAULT_CONFIG['DBDIR'], 'conda_envs')),
        # '--' is inserted unless the first extra argument is itself an
        # option (starts with '-').
        add_args=('' if snakemake_args
                  and snakemake_args[0].startswith('-') else '--'),
        args=' '.join(snakemake_args),
    )
    logging.info('Executing: %s', cmd)
    try:
        subprocess.run(cmd, check=True, shell=True)
    except subprocess.CalledProcessError:
        # Exit with a plain failure status instead of a Python traceback.
        sys.exit(1)
def config(show, show_source, init_source, db_dir, set, get):
    '''CLI for managing configurations.

    There are many configurations kept in "template-config.yaml" in source
    code directory or "~/.virsorter" (when source code directory is not
    writable for user). This file can be located with
    `virsorter config --show-source`. You can set the configurations with
    `virsorter config --set KEY=VAL`. Alternatively, you can edit in the
    configuration file ("template-config.yaml") directly.

    '''
    # Parameters (as used below):
    #   show        -- dump the whole configuration to stdout
    #   show_source -- print the path of the configuration file
    #   init_source -- initialize the configuration template; needs db_dir
    #   db_dir      -- database directory handed to init_config_template
    #   set         -- comma separated KEY=VAL items; nested keys use '.'
    #   get         -- comma separated keys; nested keys use '.'
    #
    # Every branch that handles an option ends with sys.exit(); status 1 is
    # used for every error reported through logging.critical.
    from virsorter.config import (TEMPLATE, SRC_CONFIG_DIR, USER_CONFIG_DIR,
                                  init_config_template)

    if init_source:
        if db_dir is None:
            logging.critical('--db-dir is required for --init-source')
            sys.exit(1)
        init_config_template(SRC_CONFIG_DIR, USER_CONFIG_DIR, db_dir)
        sys.exit(0)

    if not os.path.isfile(TEMPLATE):
        mes = ('config file "template-config.yaml" has not been '
               'initialized yet; Please use '
               '`virsorter config --init-source --db-dir PATH` to initialize')
        logging.critical(mes)
        sys.exit(1)

    settings = get_default_config()

    if show:
        YAML().dump(settings, sys.stdout)
        sys.exit(0)

    if show_source:
        sys.stdout.write(f'config file path: {TEMPLATE}\n')
        sys.exit(0)

    if get is not None:
        for var in (piece.strip() for piece in get.split(',')):
            node = settings
            for key in var.split('.'):
                key = key.strip()
                try:
                    node = node[key]
                except (KeyError, TypeError):
                    # TypeError: the path descends into a value that is not
                    # a mapping (e.g. a string or a number).
                    mes = f'{key} is not a key in config file ({TEMPLATE})'
                    logging.critical(mes)
                    sys.exit(1)
            sys.stdout.write(f'{var}: {node}\n')
        sys.exit(0)

    if set is not None:
        for item in (piece.strip() for piece in set.split(',')):
            # partition() keeps any further '=' inside the value and lets a
            # missing '=' be reported instead of raising ValueError.
            var, sep, val = item.partition('=')
            if not sep:
                mes = f'"{item}" is not in KEY=VAL format'
                logging.critical(mes)
                sys.exit(1)
            var = var.strip()
            val = val.strip()
            keys = [key.strip() for key in var.split('.')]

            # Walk down to the mapping that holds the last key.
            node = settings
            for key in keys[:-1]:
                try:
                    node = node[key]
                except (KeyError, TypeError):
                    mes = f'{key} is not a key in config file ({TEMPLATE})'
                    logging.critical(mes)
                    sys.exit(1)

            last_key = keys[-1]
            try:
                old_val = node[last_key]
            except (KeyError, TypeError):
                mes = f'{last_key} is not a key in config file ({TEMPLATE})'
                logging.critical(mes)
                sys.exit(1)

            # The command line always delivers text; keep numeric settings
            # numeric so the dumped YAML does not turn them into strings.
            # Booleans are left alone (bool is a subclass of int).
            if isinstance(old_val, int) and not isinstance(old_val, bool):
                try:
                    val = int(val)
                except ValueError:
                    mes = f'{var} is supposed to be an integer'
                    logging.critical(mes)
                    sys.exit(1)
            elif isinstance(old_val, float):
                try:
                    val = float(val)
                except ValueError:
                    mes = f'{var} is supposed to be a float'
                    logging.critical(mes)
                    sys.exit(1)

            node[last_key] = val
            sys.stdout.write(f'{var}: {old_val} ==> {val}\n')

        with open(TEMPLATE, 'w') as fw:
            YAML().dump(settings, fw)

        sys.exit(0)