def __init__(self, pipeline):
    """Declare the ``mock_table`` and ``mock_pileup`` options of this module.

    Both options are annotated as ``str``. ``mock_table`` is converted with
    ``cv.rel_file_rw_validator`` and ``mock_pileup`` with
    ``cv.rel_file_r_validator``; each converter is bound to the pipeline's
    ``cfg_path``.

    Args:
        pipeline: object providing ``cfg_path``; forwarded to the parent
            initializer together with the option list.
    """
    cfg_path = pipeline.cfg_path
    rw_validator = partial(cv.rel_file_rw_validator, cfg_path=cfg_path)
    r_validator = partial(cv.rel_file_r_validator, cfg_path=cfg_path)
    options = [
        ('mock_table', cv.Annot(str, converter=rw_validator)),
        ('mock_pileup', cv.Annot(str, converter=r_validator)),
    ]
    super().__init__(pipeline, cfg_req=options)
def __init__(self, pipeline):
    """Declare the options of this module and pass them to the parent.

    Options: ``kmer_length`` (``int``, default 7, converted with
    ``cv.nonneg_integer``), ``extra_flags`` (``list``, default ``[]``) and
    ``outdir_name`` (``str``).

    Args:
        pipeline: forwarded unchanged to the parent initializer.
    """
    super().__init__(
        pipeline,
        cfg_req=[
            ('kmer_length', cv.Annot(int, default=7, converter=cv.nonneg_integer)),
            ('extra_flags', cv.Annot(list, default=[])),
            # NOTE(review): 'fastQC' is passed positionally, unlike the
            # ``default=`` keyword used for the other options -- presumably
            # it is the default value; confirm against ``cv.Annot``.
            ('outdir_name', cv.Annot(str, 'fastQC')),
        ],
    )
def __init__(self, pipeline):
    """Declare the ``clip_len`` and ``clipped_5prime_bc`` options.

    ``clip_len`` is an ``int`` with default 10, converted with
    ``cv.nonneg_integer``; ``clipped_5prime_bc`` is a ``bool`` with default
    ``False``, converted with ``cv.boolean``.

    Args:
        pipeline: forwarded unchanged to the parent initializer.
    """
    clip_len = cv.Annot(int, default=10, converter=cv.nonneg_integer)
    clipped_bc = cv.Annot(bool, default=False, converter=cv.boolean)
    super().__init__(
        pipeline,
        cfg_req=[
            ('clip_len', clip_len),
            ('clipped_5prime_bc', clipped_bc),
        ],
    )
def __init__(self, pipeline):
    """Declare the options of this module and pass them to the parent.

    Options: ``output_prefix`` (``str``, no default given), ``min_cov``
    (``int``, default 5, converted with ``cv.nonneg_integer``),
    ``y_axis_limit`` (``float``, default 0) and ``remove_tmp_files``
    (``bool``, default ``True``).

    Args:
        pipeline: forwarded unchanged to the parent initializer.
    """
    options = []
    options.append(('output_prefix', cv.Annot(str)))
    options.append(('min_cov', cv.Annot(int, default=5, converter=cv.nonneg_integer)))
    options.append(('y_axis_limit', cv.Annot(float, default=0)))
    options.append(('remove_tmp_files', cv.Annot(bool, default=True)))
    super().__init__(pipeline, cfg_req=options)
def __init__(self, pipeline):
    """Declare the options of this module and pass them to the parent.

    Options: ``file_postfix`` (``str``, default ``'fil'``), ``padding_bp``
    (``int``, default 10, converted with ``cv.nonneg_integer``),
    ``features`` (``list``, default ``[]``) and ``filter_gff`` (``str``,
    converted with ``cv.rel_file_r_validator`` bound to the pipeline's
    ``cfg_path``).

    Args:
        pipeline: object providing ``cfg_path``; also forwarded to the
            parent initializer.
    """
    gff_validator = partial(cv.rel_file_r_validator, cfg_path=pipeline.cfg_path)
    nonneg = cv.nonneg_integer
    super().__init__(
        pipeline,
        cfg_req=[
            ('file_postfix', cv.Annot(str, default='fil')),
            ('padding_bp', cv.Annot(int, default=10, converter=nonneg)),
            ('features', cv.Annot(list, default=[])),
            ('filter_gff', cv.Annot(str, converter=gff_validator)),
        ],
    )
def __init__(self, pipeline):
    """Declare the options of this module and pass them to the parent.

    Options (all annotated as ``str``):

    * ``sites_file`` -- converted with ``cv.rel_file_rw_validator``
    * ``gff_file`` -- default ``''``, converted with
      ``cv.rel_file_r_validator``
    * ``fasta_file`` -- converted with ``cv.rel_file_r_validator``
    * ``transition_nucleotide`` -- default ``'T'``, converted with
      ``cv.in_set_validator`` using ``item_set='ACGT'``

    The file converters are bound to the pipeline's ``cfg_path``.

    Args:
        pipeline: object providing ``cfg_path``; also forwarded to the
            parent initializer.
    """
    cfg_path = pipeline.cfg_path
    rw_validator = partial(cv.rel_file_rw_validator, cfg_path=cfg_path)
    r_validator = partial(cv.rel_file_r_validator, cfg_path=cfg_path)
    nucleotide_check = partial(cv.in_set_validator, item_set='ACGT')

    options = [
        ('sites_file', cv.Annot(str, converter=rw_validator)),
        ('gff_file', cv.Annot(str, default='', converter=r_validator)),
        ('fasta_file', cv.Annot(str, converter=r_validator)),
        ('transition_nucleotide',
         cv.Annot(str, default='T', converter=nucleotide_check)),
    ]
    super().__init__(pipeline, cfg_req=options)
def __init__(self, pipeline):
    """Declare the options of this module and pass them to the parent.

    None of the annotations names an explicit type; only converters (and one
    default) are given:

    * ``mock_model`` -- ``cv.rel_file_rw_validator``
    * ``mock_statistics`` -- ``cv.rel_file_r_validator``
    * ``n_mixture_components`` -- ``cv.nonneg_integer``
    * ``em_iterations`` -- ``cv.nonneg_integer``, default 250

    The file converters are bound to the pipeline's ``cfg_path``.

    Args:
        pipeline: object providing ``cfg_path``; also forwarded to the
            parent initializer.
    """
    cfg_path = pipeline.cfg_path
    model_validator = partial(cv.rel_file_rw_validator, cfg_path=cfg_path)
    stats_validator = partial(cv.rel_file_r_validator, cfg_path=cfg_path)
    nonneg = cv.nonneg_integer
    super().__init__(
        pipeline,
        cfg_req=[
            ('mock_model', cv.Annot(converter=model_validator)),
            ('mock_statistics', cv.Annot(converter=stats_validator)),
            ('n_mixture_components', cv.Annot(converter=nonneg)),
            ('em_iterations', cv.Annot(default=250, converter=nonneg)),
        ],
    )
def __init__(self, pipeline):
    """Declare the options of this module and pass them to the parent.

    Options: ``genome_index`` (``str``, converted with
    ``cv.rel_mapindex_validator`` bound to the pipeline's ``cfg_path``),
    ``n_mismatch`` and ``n_multimap`` (``int``, default 1, converted with
    ``cv.nonneg_integer``) and ``extra_flags`` (``str``, default ``[]``,
    converted with ``cv.comma_sep_args``).

    Args:
        pipeline: object providing ``cfg_path``; also forwarded to the
            parent initializer.
    """
    index_validator = partial(cv.rel_mapindex_validator, cfg_path=pipeline.cfg_path)
    nonneg = cv.nonneg_integer

    options = [('genome_index', cv.Annot(str, converter=index_validator))]
    options.append(('n_mismatch', cv.Annot(int, default=1, converter=nonneg)))
    options.append(('n_multimap', cv.Annot(int, default=1, converter=nonneg)))
    options.append(
        ('extra_flags', cv.Annot(str, default=[], converter=cv.comma_sep_args))
    )
    super().__init__(pipeline, cfg_req=options)
def __init__(self, pipeline):
    """Declare the options of this module and pass them to the parent.

    Declared options and their defaults:

    * ``genome_fasta`` -- converted with ``cv.rel_genome_validator``
    * ``output_prefix``
    * ``kmer_k`` (3), ``first_index`` (0), ``last_index`` (1500),
      ``width`` (50), ``gff_padding`` (20) -- all ``int``, converted with
      ``cv.nonneg_integer``
    * ``sort_key`` (``'occupancy'``) -- converted with
      ``sort_key_validator``
    * ``gff_exclude_path`` (``''``) -- converted with
      ``opt_file_validator``; declared with ``warn_if_missing=False``
    * ``remove_tmp_files`` (``True``)

    The genome and exclude-path converters are bound to the pipeline's
    ``cfg_path``.

    Args:
        pipeline: object providing ``cfg_path``; also forwarded to the
            parent initializer.
    """
    cfg_path = pipeline.cfg_path
    exclude_validator = partial(opt_file_validator, cfg_path=cfg_path)
    genome_validator = partial(cv.rel_genome_validator, cfg_path=cfg_path)
    nonneg = cv.nonneg_integer

    options = [
        ('genome_fasta', cv.Annot(str, converter=genome_validator)),
        ('output_prefix', cv.Annot(str)),
        ('kmer_k', cv.Annot(int, default=3, converter=nonneg)),
        ('first_index', cv.Annot(int, default=0, converter=nonneg)),
        ('last_index', cv.Annot(int, default=1500, converter=nonneg)),
        ('width', cv.Annot(int, default=50, converter=nonneg)),
        ('sort_key',
         cv.Annot(str, default='occupancy', converter=sort_key_validator)),
        ('gff_exclude_path',
         cv.Annot(str, default='', converter=exclude_validator,
                  warn_if_missing=False)),
        ('gff_padding', cv.Annot(int, default=20, converter=nonneg)),
        ('remove_tmp_files', cv.Annot(bool, default=True)),
    ]
    super().__init__(pipeline, cfg_req=options)
def __init__(self, pipeline):
    """Declare the options of this module and pass them to the parent.

    Declared options and their defaults:

    * ``genome_fasta`` -- converted with ``cv.rel_genome_validator``
    * ``output_prefix``
    * ``kmer_k`` (3) and ``n_negative_seqs`` (20000) -- ``int``, converted
      with ``cv.nonneg_integer``
    * ``sort_key`` (``'occ'``) -- converted with ``sort_key_validator``
    * ``gff_exclude_path`` (``''``) -- converted with
      ``opt_file_validator``; declared with ``warn_if_missing=False``
    * ``use_quantiles`` (``True``) and ``remove_tmp_files`` (``True``)
    * ``negative_set_gff`` -- converted with ``cv.rel_file_r_validator``

    All three path converters are bound to the pipeline's ``cfg_path``.

    Args:
        pipeline: object providing ``cfg_path``; also forwarded to the
            parent initializer.
    """
    cfg_path = pipeline.cfg_path
    exclude_validator = partial(opt_file_validator, cfg_path=cfg_path)
    gff_validator = partial(cv.rel_file_r_validator, cfg_path=cfg_path)
    genome_validator = partial(cv.rel_genome_validator, cfg_path=cfg_path)
    nonneg = cv.nonneg_integer

    super().__init__(
        pipeline,
        cfg_req=[
            ('genome_fasta', cv.Annot(str, converter=genome_validator)),
            ('output_prefix', cv.Annot(str)),
            ('kmer_k', cv.Annot(int, default=3, converter=nonneg)),
            ('sort_key',
             cv.Annot(str, default='occ', converter=sort_key_validator)),
            ('gff_exclude_path',
             cv.Annot(str, default='', converter=exclude_validator,
                      warn_if_missing=False)),
            ('use_quantiles', cv.Annot(bool, default=True)),
            ('negative_set_gff', cv.Annot(str, converter=gff_validator)),
            ('n_negative_seqs', cv.Annot(int, default=20000, converter=nonneg)),
            ('remove_tmp_files', cv.Annot(bool, default=True)),
        ],
    )
def __init__(self, pipeline, keep_all=False, cfg_req=None):
    """Initializes a new module with empty list of queued commands

    Args:
        pipeline: the pipeline the module is queued in
        keep_all (:obj:`boolean`): do not remove any temporary files
        cfg_req (:obj:`list`): list of ``(key, annotation)`` configuration
            options. ``None`` (the default) means no module-specific
            options. The passed list is copied and never modified.
    """
    # Work on a private copy. Extending the argument in place would grow a
    # shared default list on every instantiation and would also modify the
    # list owned by the caller.
    requirements = [] if cfg_req is None else list(cfg_req)

    # Options every module understands; they are appended after the
    # module-specific ones, so they come last in the ordered mapping
    # (or replace the value of an equally named module-specific key).
    requirements.extend([
        ('keep_all', cv.Annot(cv.boolean, default=False, warn_if_missing=False)),
        ('module_info', cv.Annot(str, default='', warn_if_missing=False)),
        ('skip', cv.Annot(cv.boolean, default=False, warn_if_missing=False)),
    ])
    self._default_parameters = {'keep_all', 'module_info', 'skip'}

    self._cfg_req = OrderedDict()
    for key, value in requirements:
        self._cfg_req[key] = value

    self._keep_all = keep_all
    self._tmp_files = []
    self._intermed_files = []
    self._cmds = []
    self._pipeline = pipeline
def __init__(self, pipeline):
    """Declare the options of this module and pass them to the parent.

    Options: ``plot_dir`` (``str``, default ``'mockinbird_plots'``),
    ``max_k_mock`` (``int``, default 10) and ``min_k`` (``int``, default 2),
    both converted with ``cv.nonneg_integer``, ``min_post`` (``float``,
    default 0.1), ``extra_args`` (``list``, default ``[]``) and
    ``null_fraction`` (``float``, default 1).

    Args:
        pipeline: forwarded unchanged to the parent initializer.
    """
    nonneg = cv.nonneg_integer
    super().__init__(
        pipeline,
        cfg_req=[
            ('plot_dir', cv.Annot(str, default='mockinbird_plots')),
            ('max_k_mock', cv.Annot(int, default=10, converter=nonneg)),
            ('min_k', cv.Annot(int, default=2, converter=nonneg)),
            ('min_post', cv.Annot(float, default=0.1)),
            ('extra_args', cv.Annot(list, default=[])),
            ('null_fraction', cv.Annot(float, default=1)),
        ],
    )
def __init__(self, pipeline):
    """Declare the options of this module and pass them to the parent.

    Five ``int`` options converted with ``cv.nonneg_integer``:
    ``remove_n_edge_mut`` (default 0), ``max_mut_per_read`` (1),
    ``min_base_quality`` (0), ``min_avg_ali_quality`` (20) and
    ``min_mismatch_quality`` (20). Additionally ``dump_raw_data``
    (annotated with ``cv.boolean``, default ``False``) and ``outdir_name``
    (``str``, default ``'bam_analysis'``).

    Args:
        pipeline: forwarded unchanged to the parent initializer.
    """
    nonneg = cv.nonneg_integer
    options = [
        ('remove_n_edge_mut', cv.Annot(int, default=0, converter=nonneg)),
        ('max_mut_per_read', cv.Annot(int, default=1, converter=nonneg)),
        ('min_base_quality', cv.Annot(int, default=0, converter=nonneg)),
        ('min_avg_ali_quality', cv.Annot(int, default=20, converter=nonneg)),
        ('min_mismatch_quality', cv.Annot(int, default=20, converter=nonneg)),
        ('dump_raw_data', cv.Annot(cv.boolean, default=False)),
        ('outdir_name', cv.Annot(str, default='bam_analysis')),
    ]
    super().__init__(pipeline, cfg_req=options)
def __init__(self, pipeline):
    """Declare the options of this module and pass them to the parent.

    Options: ``genome_index`` (``str``, converted with
    ``cv.rel_mapindex_validator`` bound to the pipeline's ``cfg_path``),
    ``n_mismatch`` and ``n_multimap`` (``int``, default 1, converted with
    ``cv.nonneg_integer``), ``extra_flags`` (``list``, default ``[]``),
    ``allow_soft_clipping`` (annotated with ``cv.boolean``, default
    ``True``) and ``outdir_name`` (``str``, default ``'star_out'``).

    Args:
        pipeline: object providing ``cfg_path``; also forwarded to the
            parent initializer.
    """
    index_validator = partial(cv.rel_mapindex_validator, cfg_path=pipeline.cfg_path)
    nonneg = cv.nonneg_integer
    super().__init__(
        pipeline,
        cfg_req=[
            ('genome_index', cv.Annot(str, converter=index_validator)),
            ('n_mismatch', cv.Annot(int, default=1, converter=nonneg)),
            ('n_multimap', cv.Annot(int, default=1, converter=nonneg)),
            ('extra_flags', cv.Annot(list, default=[])),
            ('allow_soft_clipping', cv.Annot(cv.boolean, default=True)),
            ('outdir_name', cv.Annot(str, default='star_out')),
        ],
    )
def __init__(self, pipeline):
    """Declare the single ``min_transitions`` option (default 2).

    The option is annotated with ``cv.nonneg_integer`` as the first
    positional argument of ``cv.Annot``.

    Args:
        pipeline: forwarded unchanged to the parent initializer.
    """
    min_transitions = cv.Annot(cv.nonneg_integer, default=2)
    super().__init__(pipeline, cfg_req=[('min_transitions', min_transitions)])
def __init__(self, pipeline):
    """Declare the single ``gff_file`` option of this module.

    ``gff_file`` is a ``str`` with default ``''`` and is converted with
    ``cv.rel_file_r_validator`` bound to the pipeline's ``cfg_path``.

    Args:
        pipeline: object providing ``cfg_path``; also forwarded to the
            parent initializer.
    """
    gff_validator = partial(cv.rel_file_r_validator, cfg_path=pipeline.cfg_path)
    gff_option = cv.Annot(str, default='', converter=gff_validator)
    super().__init__(pipeline, cfg_req=[('gff_file', gff_option)])
def run(args):
    """Run the preprocessing pipeline described by the parsed arguments.

    Steps, in order: prepare the output directory, attach a stdout handler
    and a ``preprocess.log`` file handler to the root logger, parse the YAML
    config file, validate its ``general`` and ``reads`` sections, collect
    the initial files, then build, queue and run the pipeline. The pipeline's
    ``cleanup()`` is called afterwards if the ``rmTemp`` option is truthy.

    Args:
        args: parsed command line; the attributes read here are
            ``parclip_fastq``, ``output_dir``, ``prefix``, ``log_level``
            and ``config_file``.

    The process is terminated with ``sys.exit(1)`` when a mandatory config
    section is missing or when validating a section raises
    ``cv.ConfigError``.
    """
    fastq_path = args.parclip_fastq
    out_dir = args.output_dir
    run_prefix = args.prefix

    prepare_dir_or_die(out_dir)

    # activate logging
    log_path = os.path.join(out_dir, 'preprocess.log')
    logger = logging.getLogger()
    logger.setLevel(LOG_LEVEL_MAP[args.log_level])
    log_format = logging.Formatter(LOG_DEFAULT_FORMAT)

    stdout_handler = logging.StreamHandler(stream=sys.stdout)
    stdout_handler.setFormatter(log_format)
    logger.addHandler(stdout_handler)

    logfile_handler = logging.FileHandler(log_path, mode='w')
    logfile_handler.setFormatter(log_format)
    logger.addHandler(logfile_handler)

    logger.info('mockinbird version: %s', __version__)
    logger.info('working directory: %s', os.getcwd())
    logger.info('started preprocessing via %r', ' '.join(sys.argv))

    config = mu.parse_yaml(args.config_file)

    def _rel_file(file_path):
        return cv.rel_file_r_validator(file_path, args.config_file)

    def _rel_genome(file_path):
        return cv.rel_genome_validator(file_path, args.config_file)

    nonneg = cv.nonneg_integer
    nucleotide = cv.dnanuc_validator

    general_fmt = OrderedDict([
        ('adapter5prime', cv.Annot(str, converter=cv.dnastr_validator)),
        ('adapter3prime', cv.Annot(str, converter=cv.dnastr_validator)),
        ('genomefasta', cv.Annot(str, converter=_rel_genome)),
        ('normalization_pileup', cv.Annot(str, converter=_rel_file)),
        ('rmTemp', cv.Annot(cv.boolean, default=True)),
        ('n_threads', cv.Annot(int, default=2)),
    ])

    reads_fmt = OrderedDict([
        ('bc_5prime', cv.Annot(int, default=0, converter=nonneg)),
        ('bc_3prime', cv.Annot(int, default=0, converter=nonneg)),
        ('min_len', cv.Annot(int, default=20, converter=nonneg)),
        ('reference_nucleotide', cv.Annot(str, default='T', converter=nucleotide)),
        ('mutation_nucleotide', cv.Annot(str, default='C', converter=nucleotide)),
    ])

    # Abort on the first mandatory section that the config does not define.
    for section in ('pipeline', 'general', 'reads'):
        if section in config:
            continue
        logger.error('the config file does not define the mandatory section %s',
                     section)
        sys.exit(1)

    def _validated(section, section_fmt):
        # Validate one config section; log and terminate on a ConfigError.
        try:
            return cv.validate_section(config[section], section_fmt)
        except cv.ConfigError:
            logger.error('Error while parsing section %r', section)
            sys.exit(1)

    general_cfg = _validated('general', general_fmt)
    general_cfg['prefix'] = run_prefix
    general_cfg['output_dir'] = out_dir

    reads_cfg = _validated('reads', reads_fmt)

    initial_files = {'fastq': fastq_path}
    if 'custom_files' in config:
        for fmt, path in config['custom_files'].items():
            # Entries whose value is not a string are skipped without notice.
            if not isinstance(path, str):
                continue
            try:
                initial_files[fmt] = _rel_file(path)
            except ValueError:
                logger.warning('key %r: invalid file path %r', fmt, path)

    pipeline = pl.Pipeline(
        initial_files=initial_files,
        general_cfg={'reads': reads_cfg, 'general': general_cfg},
        cfg_path=args.config_file,
    )
    mu.queue_pipeline(config, pipeline,
                      def_lookup_path='mockinbird.utils.preprocess_modules')
    mu.run_pipeline(pipeline)

    if general_cfg['rmTemp']:
        pipeline.cleanup()

    logger.info('all done. See you soon!')
def __init__(self, pipeline):
    """Declare the single ``max_quantile`` option (``float``, default 0.95).

    Args:
        pipeline: forwarded unchanged to the parent initializer.
    """
    max_quantile = cv.Annot(float, default=0.95)
    super().__init__(pipeline, cfg_req=[('max_quantile', max_quantile)])
def __init__(self, pipeline):
    """Declare the single ``outdir_name`` option (``str``) of this module.

    Args:
        pipeline: forwarded unchanged to the parent initializer.
    """
    # NOTE(review): 'bam_analysis' is passed positionally rather than via
    # ``default=`` -- presumably it is the default value; confirm against
    # ``cv.Annot``.
    outdir_option = cv.Annot(str, 'bam_analysis')
    super().__init__(pipeline, cfg_req=[('outdir_name', outdir_option)])
def __init__(self, pipeline):
    """Declare the single ``quality_cutoff`` option of this module.

    ``quality_cutoff`` is an ``int`` with default 30, converted with
    ``cv.nonneg_integer``.

    Args:
        pipeline: forwarded unchanged to the parent initializer.
    """
    cutoff_option = cv.Annot(int, default=30, converter=cv.nonneg_integer)
    super().__init__(pipeline, cfg_req=[('quality_cutoff', cutoff_option)])
def __init__(self, pipeline):
    """Declare the single ``extra_args`` option (``list``, default ``[]``).

    Args:
        pipeline: forwarded unchanged to the parent initializer.
    """
    extra_args = cv.Annot(list, default=[])
    super().__init__(pipeline, cfg_req=[('extra_args', extra_args)])
def __init__(self, pipeline):
    """Declare the options of this module and pass them to the parent.

    Declared options and their defaults:

    * ``gff_file`` -- ``str``, converted with ``cv.rel_file_r_validator``
      bound to the pipeline's ``cfg_path``
    * ``output_prefix``, ``labelCenterA``, ``labelCenterB``, ``labelBody``
      -- ``str``, no default given
    * ``downstream_bp`` (1000), ``upstream_bp`` (1000), ``gene_bp`` (750),
      ``min_trscr_size_bp`` (1500), ``max_trscr_size_bp`` (100000),
      ``smoothing_window`` (20), ``bootstrap_iter`` (2500),
      ``n_processes`` (4) -- ``int``, converted with ``cv.nonneg_integer``
    * ``remove_tmp_files`` (``True``) -- ``bool``

    Args:
        pipeline: object providing ``cfg_path``; also forwarded to the
            parent initializer.
    """
    gff_validator = partial(cv.rel_file_r_validator, cfg_path=pipeline.cfg_path)
    nonneg = cv.nonneg_integer

    options = [
        ('gff_file', cv.Annot(str, converter=gff_validator)),
        ('output_prefix', cv.Annot(str)),
        ('downstream_bp', cv.Annot(int, default=1000, converter=nonneg)),
        ('upstream_bp', cv.Annot(int, default=1000, converter=nonneg)),
        ('gene_bp', cv.Annot(int, default=750, converter=nonneg)),
        ('min_trscr_size_bp', cv.Annot(int, default=1500, converter=nonneg)),
        ('max_trscr_size_bp', cv.Annot(int, default=100000, converter=nonneg)),
        ('smoothing_window', cv.Annot(int, default=20, converter=nonneg)),
        ('labelCenterA', cv.Annot(str)),
        ('labelCenterB', cv.Annot(str)),
        ('labelBody', cv.Annot(str)),
        ('remove_tmp_files', cv.Annot(bool, default=True)),
        ('bootstrap_iter', cv.Annot(int, default=2500, converter=nonneg)),
        ('n_processes', cv.Annot(int, default=4, converter=nonneg)),
    ]
    super().__init__(pipeline, cfg_req=options)
def __init__(self, pipeline):
    """Declare the ``pval_threshold`` and ``min_cov`` options.

    ``pval_threshold`` is a ``float`` with default 0.005; ``min_cov`` is an
    ``int`` with default 2, converted with ``cv.nonneg_integer``.

    Args:
        pipeline: forwarded unchanged to the parent initializer.
    """
    pval_option = cv.Annot(float, default=0.005)
    cov_option = cv.Annot(int, default=2, converter=cv.nonneg_integer)
    super().__init__(
        pipeline,
        cfg_req=[
            ('pval_threshold', pval_option),
            ('min_cov', cov_option),
        ],
    )
def __init__(self, pipeline):
    """Declare the single ``mut_snp_ratio`` option (``float``, default 0.75).

    Args:
        pipeline: forwarded unchanged to the parent initializer.
    """
    ratio_option = cv.Annot(float, default=0.75)
    super().__init__(pipeline, cfg_req=[('mut_snp_ratio', ratio_option)])
def __init__(self, pipeline):
    """Declare the options of this module and pass them to the parent.

    Declared options and their defaults:

    * ``gff_file`` -- ``str``, converted with ``cv.rel_file_r_validator``
      bound to the pipeline's ``cfg_path``
    * ``output_prefix`` -- ``str``, no default given
    * ``downstream_bp`` (500), ``upstream_bp`` (1000),
      ``min_trscr_size_bp`` (0), ``max_trscr_size_bp`` (5000), ``xbins``
      (500), ``ybins`` (500), ``x_pixels`` (500), ``y_pixels`` (500) --
      ``int``, converted with ``cv.nonneg_integer``
    * ``remove_tmp_files`` (``True``) -- ``bool``

    Args:
        pipeline: object providing ``cfg_path``; also forwarded to the
            parent initializer.
    """
    gff_validator = partial(cv.rel_file_r_validator, cfg_path=pipeline.cfg_path)
    nonneg = cv.nonneg_integer

    super().__init__(
        pipeline,
        cfg_req=[
            ('gff_file', cv.Annot(str, converter=gff_validator)),
            ('output_prefix', cv.Annot(str)),
            ('downstream_bp', cv.Annot(int, default=500, converter=nonneg)),
            ('upstream_bp', cv.Annot(int, default=1000, converter=nonneg)),
            ('min_trscr_size_bp', cv.Annot(int, default=0, converter=nonneg)),
            ('max_trscr_size_bp', cv.Annot(int, default=5000, converter=nonneg)),
            ('xbins', cv.Annot(int, default=500, converter=nonneg)),
            ('ybins', cv.Annot(int, default=500, converter=nonneg)),
            ('x_pixels', cv.Annot(int, default=500, converter=nonneg)),
            ('y_pixels', cv.Annot(int, default=500, converter=nonneg)),
            ('remove_tmp_files', cv.Annot(bool, default=True)),
        ],
    )