def main(): """main function """ default_parser = default_argparser(CFG_DIR) parser = argparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()), parents=[default_parser]) parser._optionals.title = "Arguments" # pipeline specific args parser.add_argument('--sample-bam-map', required=True, help="Yaml file listing BAM file input (value)" " per sample (key; reused for output filenames here)") args = parser.parse_args() # FIXME how to remove the arguments froma argparser in the first place? assert not args.sample_cfg, ("Usual sample config not supported. Replaced in this pipeline with --sample-bam-map") # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose) aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # turn arguments into cfg_dict (gets merged with other configs late) # cfg_dict = dict() cfg_dict['readunits'] = dict() cfg_dict['samples'] = dict() with open(args.sample_bam_map) as fh: sample_bam_map = dict(yaml.safe_load(fh)) for sample, bam in sample_bam_map.items(): assert os.path.exists(bam) # if we have relative paths, make them abs relative to cfgfile if not os.path.isabs(bam): bam = os.path.abspath(os.path.join(os.path.dirname(args.sample_bam_map), bam)) sample_bam_map[sample] = bam cfg_dict['sample_bam_map'] = sample_bam_map pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() pipeline_handler.submit(args.no_run)
def LDRefine(args):
    logger.info("Preparing LD-based genotype refinement pipeline...")

    print_parameters_given(args)

    assert os.path.exists("varCall"), \
        "Cannot detect the directory of variant detection.\nWEScall varCall has to be run before LD-based genotype refinement."
    assert args.num_record_per_file > 0, "Number of records per file has to be larger than 0!"
    assert args.num_overlap_record >= 0, "Number of overlapping records cannot be negative!"
    assert args.num_record_per_file > args.num_overlap_record, \
        "Number of records per file has to be larger than the number of overlapping records."

    if not os.path.exists("LDRefine"):
        os.mkdir("LDRefine")

    LDRefine_cfg = dict()
    LDRefine_cfg["num_record_per_file"] = args.num_record_per_file
    LDRefine_cfg["num_overlap_record"] = args.num_overlap_record

    PIPELINE_BASEDIR = os.path.join(os.path.dirname(sys.argv[0]))
    CFG_DIR = os.path.join(PIPELINE_BASEDIR, "cfg")

    path_cluster_cfg = os.path.join(PIPELINE_BASEDIR, "cfg", "cluster.LDRefine.yaml")
    # has to merge cluster config
    with open(path_cluster_cfg, 'r') as fh:
        cluster_cfg = yaml.safe_load(fh)

    # turn arguments into user_data that gets merged into pipeline config
    #
    # generic data first
    user_data = dict()
    user_data['cluster'] = cluster_cfg
    user_data['LDRefine'] = LDRefine_cfg

    pipeline_handler = PipelineHandler(
        "WEScall_LDRefine", PIPELINE_BASEDIR,
        "LDRefine", user_data,
        Snakefile="pipelines/LDRefine/Snakefile.beagle." + get_seq_type_from_user_cfg(args.userCfg),
        cluster_cfgfile=path_cluster_cfg,
        user_cfgfile=args.userCfg)

    pipeline_handler.setup_env()
    pipeline_handler.submit(no_run=True)
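# Hedged illustration (an assumption derived from the asserts above, not the actual
# Snakefile logic): records appear to be split into windows of num_record_per_file
# records overlapping by num_overlap_record. The helper below is hypothetical and
# only meant to show how the two parameters interact.
def _chunk_boundaries(n_records, num_record_per_file, num_overlap_record):
    """Yield (start, end) index pairs covering n_records with the given overlap."""
    step = num_record_per_file - num_overlap_record
    start = 0
    while True:
        end = min(start + num_record_per_file, n_records)
        yield start, end
        if end >= n_records:
            break
        start += step

# e.g. list(_chunk_boundaries(25, 10, 2)) -> [(0, 10), (8, 18), (16, 25)]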
def varCall(args):
    logger.info("Preparing varCall pipeline...")

    print_parameters_given(args)

    logger.info("Validating sample index ...")
    validate_sample_list_file(args)

    logger.info("Validating user config file ...")
    validate_user_cfg(args)

    logger.info("Checking existence of essential resource files...")
    check_resource_files_for_varCall()

    logger.info("Checking dependencies...")
    check_dependencies()

    pipeline_handler = PipelineHandler(
        "WEScall_varCall", PIPELINE_BASEDIR,
        Snakefile="pipelines/varCall/Snakefile." + get_seq_type_from_user_cfg(args.userCfg),
        outdir="./varCall",
        user_data="",
        user_cfgfile=args.userCfg,
        cluster_cfgfile=CFG_DIR + "/cluster.varCall.yaml")

    os.system("mkdir -p ./varCall/data")
    shutil.copy2(args.sample_list, "./varCall/data/samples.index")

    # automatically generate the pedigree file for the user
    # Since WEScall does not utilize pedigree information, the pedigree file
    # is just a formality so as to let the pipeline run
    with open(args.sample_list) as f_in, open("./varCall/data/samples.ped", "w") as f_out:
        for line in f_in:
            record = line.strip().split("\t")
            f_out.write("{smp}\t{smp}\t{smp}\t0\t0\n".format(smp=record[0]))

    pipeline_handler.setup_env()
    pipeline_handler.submit(no_run=True)
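# Worked example (illustrative): if the first column of samples.index is the sample
# name, e.g. a line starting with "NA12878\t", the loop above writes
# "NA12878\tNA12878\tNA12878\t0\t0" to samples.ped -- a placeholder pedigree entry,
# since WEScall ignores pedigree content.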
def main(): """main function """ parser = argparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version())) # generic args parser.add_argument('-o', "--outdir", required=True, help="Output directory (may not exist)") parser.add_argument('--name', help="Give this analysis run a name (used in email and report)") parser.add_argument('--no-mail', action='store_true', help="Don't send mail on completion") #site = get_site() default = get_default_queue('slave') parser.add_argument('-w', '--slave-q', default=default, help="Queue to use for slave jobs (default: {})".format(default)) default = get_default_queue('master') parser.add_argument('-m', '--master-q', default=default, help="Queue to use for master job (default: {})".format(default)) parser.add_argument('-n', '--no-run', action='store_true') parser.add_argument('-v', '--verbose', action='count', default=0, help="Increase verbosity") parser.add_argument('-q', '--quiet', action='count', default=0, help="Decrease verbosity") cfg_group = parser.add_argument_group('Configuration files (advanced)') cfg_group.add_argument('--sample-cfg', help="Config-file (YAML) listing samples and readunits." " Collides with -1, -2 and -s") for name, descr in [("references", "reference sequences"), ("params", "parameters"), ("modules", "modules")]: default = os.path.abspath(os.path.join(CFG_DIR, "{}.yaml".format(name))) cfg_group.add_argument('--{}-cfg'.format(name), default=default, help="Config-file (yaml) for {}. (default: {})".format(descr, default)) # pipeline specific args parser.add_argument('-1', "--fq1", nargs="+", help="FastQ file/s (gzip only)." " Multiple input files supported (auto-sorted)." " Note: each file (or pair) gets a unique read-group id." " Collides with --sample-cfg.") parser.add_argument('-2', "--fq2", nargs="+", help="FastQ file/s (if paired) (gzip only). See also --fq1") parser.add_argument('-s', "--sample", help="Sample name. Collides with --sample-cfg.") parser.add_argument('-t', "--seqtype", required=True, choices=['WGS', 'WES', 'targeted'], help="Sequencing type") parser.add_argument('-l', "--intervals", help="Intervals file (e.g. bed file) listing regions of interest." " Required for WES and targeted sequencing.") parser.add_argument('-D', '--dont-mark-dups', action='store_true', help="Don't mark duplicate reads") args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose) aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.sample_cfg: if any([args.fq1, args.fq2, args.sample]): logger.fatal("Config file overrides fastq and sample input arguments." 
" Use one or the other") sys.exit(1) if not os.path.exists(args.sample_cfg): logger.fatal("Config file %s does not exist", args.sample_cfg) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile(args.sample_cfg) else: if not all([args.fq1, args.sample]): logger.fatal("Need at least fq1 and sample without config file") sys.exit(1) readunits = get_readunits_from_args(args.fq1, args.fq2) # all readunits go into this one sample specified on the command-line samples = dict() samples[args.sample] = list(readunits.keys()) # FIXME how to? #for p in ['bwa', 'samtools']: # if not ref_is_indexed(args.reffa, p): # logger.fatal("Reference '%s' doesn't appear to be indexed with %s", args.reffa, p) # sys.exit(1) if args.seqtype in ['WES', 'targeted']: if not args.intervals: logger.fatal("Analysis of exome and targeted sequence runs requires a bed file") sys.exit(1) else: if not os.path.exists(args.intervals): logger.fatal("Intervals file %s does not exist", args.sample_cfg) sys.exit(1) logger.warning("Compatilibity between interval file and" " reference not checked")# FIXME # turn arguments into user_data that gets merged into pipeline config # # generic data first user_data = dict() user_data['mail_on_completion'] = not args.no_mail user_data['readunits'] = readunits user_data['samples'] = samples if args.name: user_data['analysis_name'] = args.name user_data['seqtype'] = args.seqtype user_data['intervals'] = args.intervals user_data['mark_dups'] = not args.dont_mark_dups pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args.outdir, user_data, master_q=args.master_q, slave_q=args.slave_q, params_cfgfile=args.params_cfg, modules_cfgfile=args.modules_cfg, refs_cfgfile=args.references_cfg, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() pipeline_handler.submit(args.no_run)
def main(): """main function """ default_parser = default_argparser(CFG_DIR) parser = configargparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()), parents=[default_parser]) parser._optionals.title = "Arguments" # pipeline specific args parser.add_argument( "--control-fq1", nargs="+", help="Control FastQ file/s (gzip only)." " Multiple input files supported (auto-sorted)." " Note: each file (or pair) gets a unique read-group id." " Collides with --sample-cfg.") parser.add_argument( '--control-fq2', nargs="+", help= "Control FastQ file/s (if paired) (gzip only). See also --control-fq1") parser.add_argument( "--treatment-fq1", nargs="+", help="Treatment FastQ file/s (gzip only)." " Multiple input files supported (auto-sorted)." " Note: each file (or pair) gets a unique read-group id." " Collides with --sample-cfg.") parser.add_argument( '--treatment-fq2', nargs="+", help= "Treatment FastQ file/s (if paired) (gzip only). See also --treatment-fq1" ) parser.add_argument( '--control-bam', help="Advanced: Injects control BAM (overwrites control-fq options)." " WARNING: reference and postprocessing need to match pipeline requirements" ) parser.add_argument( '--treatment-bam', help="Advanced: Injects treatment BAM (overwrites treatment-fq options)." " WARNING: reference and postprocessing need to match pipeline requirements" ) choices = ['bwa-aln', 'bwa-mem'] default = choices[0] parser.add_argument('--mapper', default=default, choices=choices, help="Mapper to use. One of {}. Default {}".format( ",".join(choices), default)) choices = ['TF', 'histone-narrow', 'histone-broad'] #, 'open-chromatin'] parser.add_argument('-t', '--peak-type', required=True, choices=choices, help="Peak type. One of {}".format(",".join(choices))) parser.add_argument('--skip-macs2', action='store_true', help="Don't run MACS2") parser.add_argument('--skip-dfilter', action='store_true', help="Don't run DFilter") args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.sample_cfg: if any([ args.control_fq1, args.control_fq2, args.treatment_fq1, args.treatment_fq2, args.control_bam, args.treatment_bam ]): logger.fatal( "Config file overrides fastq and sample input arguments." 
" Use one or the other") sys.exit(1) if not os.path.exists(args.sample_cfg): logger.fatal("Config file %s does not exist", args.sample_cfg) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile( args.sample_cfg) else: samples = dict() if args.control_bam: control_readunits = dict() samples["control"] = [] assert os.path.exists(args.control_bam) else: if not all([args.control_fq1, args.treatment_fq1]): logger.fatal( "Need at least fq1 and sample without config file") sys.exit(1) control_readunits = get_readunits_from_args( args.control_fq1, args.control_fq2) samples["control"] = list(control_readunits.keys()) if args.treatment_bam: treatment_readunits = dict() samples["treatment"] = [] assert os.path.exists(args.treatment_bam) else: treatment_readunits = get_readunits_from_args( args.treatment_fq1, args.treatment_fq2) samples["treatment"] = list(treatment_readunits.keys()) readunits = dict(control_readunits) readunits.update(treatment_readunits) assert sorted(samples) == sorted(["control", "treatment"]) # turn arguments into cfg_dict that gets merged into pipeline config # cfg_dict = dict() cfg_dict['readunits'] = readunits cfg_dict['samples'] = samples # either paired end or not, but no mix allows if all([ru.get('fq2') for ru in readunits.values()]): cfg_dict['paired_end'] = True elif not any([ru.get('fq2') for ru in readunits.values()]): cfg_dict['paired_end'] = False else: logger.fatal("Mixed paired-end and single-end not allowed") sys.exit(1) cfg_dict['peak_type'] = args.peak_type cfg_dict['mapper'] = args.mapper cfg_dict['skip_macs2'] = args.skip_macs2 cfg_dict['skip_dfilter'] = args.skip_dfilter pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() if args.control_bam or args.treatment_bam: raise NotImplementedError("BAM injection not implemented yet") pipeline_handler.submit(args.no_run)
def main(): """main function """ default_parser = default_argparser(CFG_DIR) parser = configargparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()), parents=[default_parser]) parser._optionals.title = "Arguments" # pipeline specific args parser.add_argument( "--normal-fq1", nargs="+", help="Normal FastQ file/s (gzip only)." " Multiple input files supported (auto-sorted)." " Note: each file (or pair) gets a unique read-group id." " Collides with --sample-cfg.") parser.add_argument( '--normal-fq2', nargs="+", help= "Normal FastQ file/s (if paired) (gzip only). See also --normal-fq1") parser.add_argument( "--tumor-fq1", nargs="+", help="Tumor FastQ file/s (gzip only)." " Multiple input files supported (auto-sorted)." " Note: each file (or pair) gets a unique read-group id." " Collides with --sample-cfg.") parser.add_argument( '--tumor-fq2', nargs="+", help="Tumor FastQ file/s (if paired) (gzip only). See also --tumor-fq1" ) parser.add_argument('-t', "--seqtype", required=True, choices=['WGS', 'WES', 'targeted'], help="Sequencing type") parser.add_argument('-l', "--bed", help="Bed file listing regions of interest." " Required for WES and targeted sequencing.") parser.add_argument('-D', '--dont-mark-dups', action='store_true', help="Don't mark duplicate reads") parser.add_argument( '--normal-bam', help="Advanced: Injects normal BAM (overwrites normal-fq options)." " WARNING: reference and postprocessing need to match pipeline requirements" ) parser.add_argument( '--tumor-bam', help="Advanced: Injects tumor BAM (overwrites tumor-fq options)." " WARNING: reference and postprocessing need to match pipeline requirements" ) args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.sample_cfg: if any([ args.normal_fq1, args.normal_fq2, args.tumor_fq1, args.tumor_fq2, args.normal_bam, args.tumor_bam ]): logger.fatal( "Config file overrides fastq and sample input arguments." 
" Use one or the other") sys.exit(1) if not os.path.exists(args.sample_cfg): logger.fatal("Config file %s does not exist", args.sample_cfg) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile( args.sample_cfg) else: samples = dict() if args.normal_bam: normal_readunits = dict() samples["normal"] = [] assert os.path.exists(args.normal_bam) else: if not all([args.normal_fq1, args.tumor_fq1]): logger.fatal( "Need at least fq1 and sample without config file") sys.exit(1) normal_readunits = get_readunits_from_args(args.normal_fq1, args.normal_fq2) samples["normal"] = list(normal_readunits.keys()) if args.tumor_bam: tumor_readunits = dict() samples["tumor"] = [] assert os.path.exists(args.tumor_bam) else: tumor_readunits = get_readunits_from_args(args.tumor_fq1, args.tumor_fq2) samples["tumor"] = list(tumor_readunits.keys()) readunits = dict(normal_readunits) readunits.update(tumor_readunits) assert sorted(samples) == sorted(["normal", "tumor"]) # FIXME howt to # if not os.path.exists(reffa): # logger.fatal("Reference '%s' doesn't exist", reffa) # sys.exit(1) # #for p in ['bwa', 'samtools']: # if not ref_is_indexed(reffa, p): # logger.fatal("Reference '%s' doesn't appear to be indexed with %s", reffa, p) # sys.exit(1) if args.seqtype in ['WES', 'targeted']: if not args.bed: logger.fatal( "Analysis of exome and targeted sequence runs requires a bed file" ) sys.exit(1) else: if not os.path.exists(args.bed): logger.fatal("Bed file %s does not exist", args.sample_cfg) sys.exit(1) logger.warning("Compatilibity between bed file and" " reference not checked") # FIXME # turn arguments into cfg_dict that gets merged into pipeline config # cfg_dict = dict() cfg_dict['readunits'] = readunits cfg_dict['samples'] = samples cfg_dict['seqtype'] = args.seqtype cfg_dict['intervals'] = os.path.abspath(args.bed) if args.bed else None cfg_dict['mark_dups'] = not args.dont_mark_dups pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() # inject existing BAM by symlinking (everything upstream is temporary anyway) for sample, bam in [("normal", args.normal_bam), ("tumor", args.tumor_bam)]: if bam: # target as defined in Snakefile! target = os.path.join( args.outdir, "out", sample, "{}.bwamem.lofreq.dedup.lacer.bam".format(sample)) os.makedirs(os.path.dirname(target)) os.symlink(os.path.abspath(bam), target) pipeline_handler.submit(args.no_run)
def main(): """main function """ parser = argparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version())) # generic args parser.add_argument('-c', "--config", help="Config file (YAML) listing samples and readunits." " Collides with -1, -2 and -s") parser.add_argument('-o', "--outdir", required=True, help="Output directory (must not exist)") parser.add_argument('--no-mail', action='store_true', help="Don't send mail on completion") site = get_site() default = DEFAULT_SLAVE_Q.get(site, None) parser.add_argument('-w', '--slave-q', default=default, help="Queue to use for slave jobs (default: {})".format(default)) default = DEFAULT_MASTER_Q.get(site, None) parser.add_argument('-m', '--master-q', default=default, help="Queue to use for master job (default: {})".format(default)) parser.add_argument('-n', '--no-run', action='store_true') parser.add_argument('-v', '--verbose', action='count', default=0, help="Increase verbosity") parser.add_argument('-q', '--quiet', action='count', default=0, help="Decrease verbosity") # pipeline specific args parser.add_argument('-1', "--fq1", nargs="+", help="FastQ file/s (gzip only)." " Multiple input files supported (auto-sorted)." " Note: each file (or pair) gets a unique read-group id." " Collides with -c.") parser.add_argument('-2', "--fq2", nargs="+", help="FastQ file/s (if paired) (gzip only). See also --fq1") parser.add_argument('-s', "--sample", help="Sample name. Collides with -c.") fake_pipeline_handler = PipelineHandler("FAKE", PIPELINE_BASEDIR, "FAKE", None) default_cfg = fake_pipeline_handler.read_default_config() default = default_cfg['references']['genome'] parser.add_argument('-r', "--reffa", default=default, help=argparse.SUPPRESS) # WARN do not change. this is just to set args.reffa (used later). # any change here would require changes in dbsnp, hapmap, g1k, omni and mills as well parser.add_argument('-t', "--seqtype", required=True, choices=['WGS', 'WES', 'targeted'], help="Sequencing type") parser.add_argument('-l', "--intervals", help="Intervals file (e.g. bed file) listing regions of interest." " Required for WES and targeted sequencing.") args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose) aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.config: if any([args.fq1, args.fq2, args.sample]): logger.fatal("Config file overrides fastq and sample input arguments." 
" Use one or the other") sys.exit(1) if not os.path.exists(args.config): logger.fatal("Config file %s does not exist", args.config) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile(args.config) else: if not all([args.fq1, args.sample]): logger.fatal("Need at least fq1 and sample without config file") sys.exit(1) readunits = get_readunits_from_args(args.fq1, args.fq2) # all readunits go into this one sample specified on the command-line samples = dict() samples[args.sample] = list(readunits.keys()) if args.seqtype in ['WES', 'targeted']: if not args.intervals: logger.fatal("Analysis of exome and targeted sequence runs requires a bed file") sys.exit(1) else: if not os.path.exists(args.intervals): logger.fatal("Intervals file %s does not exist", args.config) sys.exit(1) logger.warning("Compatilibity between interval file and" " reference not checked")# FIXME # turn arguments into user_data that gets merged into pipeline config # # generic data first user_data = dict() user_data['mail_on_completion'] = not args.no_mail user_data['readunits'] = readunits user_data['samples'] = samples user_data['num_chroms'] = len(list(chroms_and_lens_from_from_fasta(args.reffa))) user_data['seqtype'] = args.seqtype user_data['intervals'] = args.intervals# always safe, might be used for WGS as well user_data['mark_dups'] = MARK_DUPS pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args.outdir, user_data, site=site, master_q=args.master_q, slave_q=args.slave_q) pipeline_handler.setup_env() pipeline_handler.submit(args.no_run)
def main(): """main function """ default_parser = default_argparser(CFG_DIR) parser = argparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()), parents=[default_parser]) parser._optionals.title = "Arguments" # pipeline specific args parser.add_argument( '-1', "--fq1", nargs="+", help="FastQ file/s (gzip only)." " Multiple input files supported (auto-sorted)." " Note: each file (or pair) gets a unique read-group id." " Collides with --sample-cfg.") parser.add_argument( '-2', "--fq2", nargs="+", help="FastQ file/s (if paired) (gzip only). See also --fq1") parser.add_argument('-s', "--sample", help="Sample name. Collides with --sample-cfg.") args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.sample_cfg: if any([args.fq1, args.fq2, args.sample]): logger.fatal( "Config file overrides fastq and sample input arguments." " Use one or the other") sys.exit(1) if not os.path.exists(args.sample_cfg): logger.fatal("Config file %s does not exist", args.sample_cfg) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile( args.sample_cfg) else: if not all([args.fq1, args.sample]): logger.fatal("Need at least fq1 and sample without config file") sys.exit(1) readunits = get_readunits_from_args(args.fq1, args.fq2) # all readunits go into this one sample specified on the command-line samples = dict() samples[args.sample] = list(readunits.keys()) # FIXME now exported to ref.cfg. how to auto check there? #if not os.path.exists(args.reffa): # logger.fatal("Reference '%s' doesn't exist", args.reffa) # sys.exit(1) # #for p in ['bwa', 'samtools']: # if not ref_is_indexed(args.reffa, p): # logger.fatal("Reference '%s' doesn't appear to be indexed" # " with %s", args.reffa, p) # sys.exit(1) # turn arguments into cfg_dict that gets merged into pipeline config # cfg_dict = dict() cfg_dict['readunits'] = readunits cfg_dict['samples'] = samples cfg_dict['mark_dups'] = MARK_DUPS pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() pipeline_handler.submit(args.no_run)
def main(): """main function """ parser = argparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version())) # generic args parser.add_argument('-c', "--config", help="Config file (YAML) listing samples and readunits." " Collides with -1, -2 and -s") parser.add_argument('-o', "--outdir", required=True, help="Output directory (must not exist)") parser.add_argument('--no-mail', action='store_true', help="Don't send mail on completion") site = get_site() default = DEFAULT_SLAVE_Q.get(site, None) parser.add_argument('-w', '--slave-q', default=default, help="Queue to use for slave jobs (default: {})".format(default)) default = DEFAULT_MASTER_Q.get(site, None) parser.add_argument('-m', '--master-q', default=default, help="Queue to use for master job (default: {})".format(default)) parser.add_argument('-n', '--no-run', action='store_true') parser.add_argument('-v', '--verbose', action='count', default=0, help="Increase verbosity") parser.add_argument('-q', '--quiet', action='count', default=0, help="Decrease verbosity") # pipeline specific args parser.add_argument('-1', "--fq1", nargs="+", help="FastQ file/s (gzip only)." " Multiple input files supported (auto-sorted)." " Note: each file (or pair) gets a unique read-group id." " Collides with -c.") parser.add_argument('-2', "--fq2", nargs="+", help="FastQ file/s (if paired) (gzip only). See also --fq1") parser.add_argument('-s', "--sample", help="Sample name. Collides with -c.") parser.add_argument('-C', "--cuffdiff", action='store_true', dest="run_cuffdiff", help="Also run cuffdiff") parser.add_argument('-S', '--stranded', action='store_true', help="Stranded library prep (default is unstranded)") args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose) aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.config: if any([args.fq1, args.fq2, args.sample]): logger.fatal("Config file overrides fastq and sample input arguments." 
" Use one or the other") sys.exit(1) if not os.path.exists(args.config): logger.fatal("Config file %s does not exist", args.config) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile(args.config) else: if not all([args.fq1, args.sample]): logger.fatal("Need at least fq1 and sample without config file") sys.exit(1) readunits = get_readunits_from_args(args.fq1, args.fq2) # all readunits go into this one sample specified on the command-line samples = dict() samples[args.sample] = list(readunits.keys()) # FIXME checks on reffa index (currently not exposed via args) # turn arguments into user_data that gets merged into pipeline config # # generic data first user_data = dict() user_data['mail_on_completion'] = not args.no_mail user_data['readunits'] = readunits user_data['samples'] = samples user_data['stranded'] = args.stranded user_data['run_cuffdiff'] = args.run_cuffdiff user_data['paired_end'] = any(ru.get('fq2') for ru in readunits.values()) if user_data['paired_end']: assert all(ru.get('fq2') for ru in readunits.values()), ( "Can't handle mix of paired-end and single-end") pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args.outdir, user_data, site=site, master_q=args.master_q, slave_q=args.slave_q) pipeline_handler.setup_env() pipeline_handler.submit(args.no_run)
def main(): """main function """ default_parser = default_argparser(CFG_DIR, with_readunits=True) parser = configargparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()), parents=[default_parser]) parser._optionals.title = "Arguments" # pipeline specific args parser.add_argument("--cuffdiff", action='store_true', dest="run_cuffdiff", help="Also run cuffdiff") choices = ["none", "forward", "reverse"] default = "none" parser.add_argument( '--stranded', choices=choices, default=default, help= "Stranded library prep (default is {}; Following RSEM definition but see also" " http://chipster.csc.fi/manual/library-type-summary.html)".format( default)) parser.add_argument( '--rsem-estimate-rspd', action='store_true', help="Estimate read start position distribution in RSEM") args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.sample_cfg: if any([args.fq1, args.fq2, args.sample]): logger.fatal( "Config file overrides fastq and sample input arguments." " Use one or the other") sys.exit(1) if not os.path.exists(args.sample_cfg): logger.fatal("Config file %s does not exist", args.sample_cfg) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile( args.sample_cfg) else: if not all([args.fq1, args.sample]): logger.fatal("Need at least fq1 and sample without config file") sys.exit(1) readunits = get_readunits_from_args(args.fq1, args.fq2) # all readunits go into this one sample specified on the command-line samples = dict() samples[args.sample] = list(readunits.keys()) # FIXME add checks on reffa index (currently not exposed via args) # turn arguments into cfg_dict that gets merged into pipeline config # cfg_dict = dict() cfg_dict['readunits'] = readunits cfg_dict['samples'] = samples cfg_dict['rsem_extra_args'] = '' if args.rsem_estimate_rspd: cfg_dict['rsem_extra_args'] += ' --estimate-rspd' cfg_dict['stranded'] = args.stranded cfg_dict['run_cuffdiff'] = args.run_cuffdiff cfg_dict['paired_end'] = any(ru.get('fq2') for ru in readunits.values()) if cfg_dict['paired_end']: assert all(ru.get('fq2') for ru in readunits.values()), ( "Can't handle mix of paired-end and single-end") pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() pipeline_handler.submit(args.no_run)
def main(): """main function """ # FIXME ugly and code duplication in bcl2fastq_dbupdate.py mongo_status_script = os.path.abspath( os.path.join(os.path.dirname(sys.argv[0]), "mongo_status.py")) assert os.path.exists(mongo_status_script) default_parser = default_argparser(CFG_DIR, allow_missing_cfgfile=True, allow_missing_outdir=True, default_db_logging=True) parser = argparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()), parents=[default_parser]) parser._optionals.title = "Arguments" # pipeline specific args parser.add_argument('-r', "--runid", help="Run ID plus flowcell ID (clashes with -d)") parser.add_argument( '-d', "--rundir", help= "BCL input directory (clashes with -r; you also probably want to disable logging)" ) parser.add_argument('-t', "--testing", action='store_true', help="Use MongoDB test server") parser.add_argument('--no-archive', action='store_true', help="Don't archieve this analysis") parser.add_argument( '-l', '--lanes', type=int, nargs="*", help="Limit run to given lane/s (multiples separated by space") parser.add_argument( '-i', '--mismatches', type=int, help="Max. number of allowed barcode mismatches (0>=x<=2)" " setting a value here overrides the default settings read from ELM)") args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) if args.mismatches is not None: if args.mismatches > 2 or args.mismatches < 0: logger.fatal("Number of mismatches must be between 0-2") sys.exit(1) lane_info = '' lane_nos = [] if args.lanes: lane_info = '--tiles ' for lane in args.lanes: if lane > 8 or lane < 1: logger.fatal("Lane number must be between 1-8") sys.exit(1) else: lane_info += 's_{}'.format(lane) + ',' lane_info = lane_info.rstrip() lane_info = lane_info[:-1] lane_nos = list(args.lanes) if args.runid and args.rundir: logger.fatal( "Cannot use run-id and input directory arguments simultaneously") sys.exit(1) elif args.runid: rundir = run_folder_for_run_id(args.runid) elif args.rundir: rundir = os.path.abspath(args.rundir) else: logger.fatal("Need either run-id or input directory") sys.exit(1) if not os.path.exists(rundir): logger.fatal("Expected run directory %s does not exist", rundir) logger.info("Rundir is %s", rundir) if not args.outdir: outdir = get_bcl2fastq_outdir(args.runid) args.outdir = outdir else: outdir = args.outdir if os.path.exists(outdir): logger.fatal("Output directory %s already exists", outdir) sys.exit(1) # create now so that generate_bcl2fastq_cfg.py can run os.makedirs(outdir) # catch cases where rundir was user provided and looks weird try: _, runid, flowcellid = get_machine_run_flowcell_id(rundir) run_num = runid + "_" + flowcellid except: run_num = "UNKNOWN-" + rundir.split("/")[-1] # call generate_bcl2fastq_cfg # # FIXME ugly assumes same directory (just like import above). better to import and run main()? 
generate_bcl2fastq = os.path.join(os.path.dirname(sys.argv[0]), "generate_bcl2fastq_cfg.py") assert os.path.exists(generate_bcl2fastq) cmd = [generate_bcl2fastq, '-r', rundir, '-o', outdir] if args.testing: cmd.append("-t") logger.debug("Executing %s", ' '.join(cmd)) try: res = subprocess.check_output(cmd, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: logger.fatal("The following command failed with return code %s: %s", e.returncode, ' '.join(cmd)) logger.fatal("Output: %s", e.output.decode()) logger.fatal("Exiting") os.rmdir(outdir) sys.exit(1) # generate_bcl2fastq is normally quiet. if there's output, make caller aware of it # use sys instead of logger to avoid double logging if res: sys.stderr.write(res.decode()) # just created files muxinfo_cfg = os.path.join(outdir, MUXINFO_CFG) status_cfg = os.path.join(outdir, STATUS_CFG) # NOTE: signal for failed runs is exit 0 from generate_bcl2fastq and missing output files # if any([not os.path.exists(x) for x in [muxinfo_cfg]]): # one missing means all should be missing assert all([not os.path.exists(x) for x in [muxinfo_cfg]]) #Check status as seqrunfailed or non-bcl run with open(status_cfg, 'r') as fh: status = fh.read().strip() update_run_status(mongo_status_script, run_num, outdir, status, args.testing) sys.exit(0) # turn arguments into cfg_dict that gets merged into pipeline config cfg_dict = { 'rundir': rundir, 'lanes_arg': lane_info, 'no_archive': args.no_archive, 'run_num': run_num } mux_units = get_mux_units_from_cfgfile(muxinfo_cfg, lane_nos) if args.mismatches is not None: mux_units = [ mu._replace(barcode_mismatches=args.mismatches) for mu in mux_units ] os.unlink(muxinfo_cfg) cfg_dict['units'] = dict() for mu in mux_units: # special case: mux split across multiple lanes. make lanes a list # and add in extra lanes if needed. k = mu.mux_dir mu_dict = dict(mu._asdict()) cfg_dict['units'][k] = mu_dict # create mongodb update command, used later, after submission mongo_update_cmd = "{} -r {} -s STARTED".format(mongo_status_script, cfg_dict['run_num']) mongo_update_cmd += " -a $ANALYSIS_ID -o {}".format( outdir) # set in run.sh if args.testing: mongo_update_cmd += " -t" pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict, logger_cmd=mongo_update_cmd, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() pipeline_handler.submit(args.no_run)
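# Worked example (illustrative) of the lane handling in main() above: with
# "--lanes 1 2" the loop builds "--tiles s_1,s_2," and the trailing comma is then
# stripped, so cfg_dict['lanes_arg'] ends up as "--tiles s_1,s_2" and lane_nos as [1, 2].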
def main(): """main function """ default_parser = default_argparser(CFG_DIR, with_readunits=True) parser = configargparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()), parents=[default_parser]) parser._optionals.title = "Arguments" # pipeline specific args parser.add_argument('-t', "--seqtype", required=True, choices=['WGS', 'WES', 'targeted'], help="Sequencing type") parser.add_argument('-l', "--bed", help="Bed file listing regions of interest." " Required for WES and targeted sequencing.") default = 4 parser.add_argument( "-c", "--hc-nct", default=default, type=int, help="Number of Haplotype Caller threads (per region cluster)." " Values>1 reported to make Haplotype Caller unstable (default={})". format(default)) default = 100 parser.add_argument( '-i', "--interval-padding", default=default, help="Interval padding (for non-WGS only; default = {})".format( default)) parser.add_argument( '-j', "--joint-calls", action='store_true', help="Perform joint/cohort calling (requires multisample input)") parser.add_argument( '--raw-bam', help= "Advanced: Injects raw (pre-dedup, pre-BQSR etc.) BAM (overwrites fq options)." " WARNING: reference needs to match pipeline requirements") parser.add_argument( '--proc-bam', help= "Advanced: Injects processed (post-dedup, post-BQSR etc.) BAM (overwrites fq options)." " WARNING: reference and pre-processing need to match pipeline requirements" ) # FIXME can be achieved with --until rule as well parser.add_argument('--bam-only', action='store_true', help="Only process up until BAM file") parser.add_argument('--gvcf-only', action='store_true', help="Only process up until GVCF file") args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.sample_cfg: if any([args.fq1, args.fq2, args.sample, args.raw_bam, args.proc_bam]): logger.fatal( "Config file overrides fastq, sample and BAM arguments." 
" Use one or the other") sys.exit(1) if not os.path.exists(args.sample_cfg): logger.fatal("Config file %s does not exist", args.sample_cfg) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile( args.sample_cfg) else: # no sample config, so input is either fastq or existing bam samples = dict() if not args.sample: logger.fatal("Need sample name if not using config file") sys.exit(1) if args.raw_bam or args.proc_bam: assert not args.fq1, ("BAM injection overwrites fastq arguments") if args.raw_bam: assert os.path.exists(args.raw_bam) assert not args.proc_bam, ( "Cannot inject raw and processed BAM") if args.proc_bam: assert os.path.exists(args.proc_bam) assert not args.raw_bam, ( "Cannot inject raw and processed BAM") readunits = dict() samples[args.sample] = [] elif args.fq1: readunits = get_readunits_from_args(args.fq1, args.fq2) # all readunits go into this one sample specified on the command-line samples[args.sample] = list(readunits.keys()) else: logger.fatal( "Need at least one fastq files as argument if not using config file" ) sys.exit(1) if args.seqtype in ['WES', 'targeted']: if not args.bed: logger.fatal( "Analysis of exome and targeted sequence runs requires a bed file" ) sys.exit(1) else: if not os.path.exists(args.bed): logger.fatal("Bed file %s does not exist", args.sample_cfg) sys.exit(1) if args.joint_calls: if len(samples) < 2: logger.fatal("Need at least two samples for joint calling") sys.exit(1) # turn arguments into cfg_dict (gets merged with other configs late) # cfg_dict = dict() cfg_dict['readunits'] = readunits cfg_dict['samples'] = samples cfg_dict['seqtype'] = args.seqtype cfg_dict['intervals'] = os.path.abspath( args.bed ) if args.bed else None # always safe, might be used for WGS as well cfg_dict['mark_dups'] = MARK_DUPS cfg_dict['bam_only'] = args.bam_only cfg_dict['gvcf_only'] = args.gvcf_only cfg_dict['hc_nct'] = args.hc_nct cfg_dict['joint_calls'] = args.joint_calls cfg_dict['interval_padding'] = args.interval_padding pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() # Inject existing BAM by symlinking (everything upstream is temporary anyway) # WARNING: filename has to match definition in Snakefile! if args.raw_bam: target = os.path.join(args.outdir, "out", args.sample, "{}.bwamem.bam".format(args.sample)) os.makedirs(os.path.dirname(target)) os.symlink(os.path.abspath(args.raw_bam), target) src_bai = os.path.abspath(args.raw_bam) + ".bai" if os.path.exists(src_bai): os.symlink(src_bai, target + ".bai") elif args.proc_bam: target = os.path.join(args.outdir, "out", args.sample, "{}.bwamem".format(args.sample)) if cfg_dict['mark_dups']: target += ".dedup" if cfg_dict['seqtype'] != 'targeted': target += ".bqsr" target += ".bam" os.makedirs(os.path.dirname(target)) os.symlink(os.path.abspath(args.proc_bam), target) if os.path.exists(os.path.abspath(args.proc_bam) + ".bai"): os.symlink( os.path.abspath(args.proc_bam) + ".bai", target + ".bai") pipeline_handler.submit(args.no_run)
def main(): """main function """ default_parser = default_argparser(CFG_DIR, with_readunits=True) parser = configargparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()), parents=[default_parser]) parser._optionals.title = "Arguments" # pipeline specific args default = 2000 parser.add_argument( "--fragment-length", type=int, default=default, help="Fragment length argument for Bowtie (default {})".format( default)) default = 200 parser.add_argument( "--extsize", type=int, default=default, help= "extsize argument for MACS2; only used for single-end reads (default {})" .format(default)) default = -100 parser.add_argument( "--shift", type=int, default=default, help= "shift argument for MACS2; only used for single-end reads (default {})" .format(default)) default = 250 parser.add_argument( "--peak-ext-bp", type=int, default=default, help="Extension around peaks for bed creation (default {})".format( default)) args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.sample_cfg: if any([args.fq1, args.fq2, args.sample]): logger.fatal( "Config file overrides fastq and sample input arguments." " Use one or the other") sys.exit(1) if not os.path.exists(args.sample_cfg): logger.fatal("Config file %s does not exist", args.sample_cfg) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile( args.sample_cfg) else: if not all([args.fq1, args.sample]): logger.fatal("Need at least fq1 and sample without config file") sys.exit(1) readunits = get_readunits_from_args(args.fq1, args.fq2) # all readunits go into this one sample specified on the command-line samples = dict() samples[args.sample] = list(readunits.keys()) # turn arguments into cfg_dict that gets merged into pipeline config # cfg_dict = dict() cfg_dict['readunits'] = readunits cfg_dict['samples'] = samples cfg_dict['paired_end'] = any(ru.get('fq2') for ru in readunits.values()) if cfg_dict['paired_end']: assert all(ru.get('fq2') for ru in readunits.values()), ( "Can't handle mix of paired-end and single-end") cfg_dict['mapper'] = 'bowtie2' # FIXME fixed for now # cfg_dict["bowtie2_custom_args"] # cfg_dict['platform'] # cfg_dict['center'] # cfg_dict["macs2_custom_args"] cfg_dict['fragment_length'] = args.fragment_length cfg_dict['shift'] = args.shift cfg_dict['extsize'] = args.extsize cfg_dict["peak_ext_bp"] = args.peak_ext_bp pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() pipeline_handler.submit(args.no_run)
def main(): """main function """ parser = argparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version())) # generic args parser.add_argument('-o', "--outdir", required=True, help="Output directory (must not exist)") parser.add_argument('--name', help="Give this analysis run a name (used in email and report)") parser.add_argument('--no-mail', action='store_true', help="Don't send mail on completion") site = get_site() default = DEFAULT_SLAVE_Q.get(site, None) parser.add_argument('-w', '--slave-q', default=default, help="Queue to use for slave jobs (default: {})".format(default)) default = DEFAULT_MASTER_Q.get(site, None) parser.add_argument('-m', '--master-q', default=default, help="Queue to use for master job (default: {})".format(default)) parser.add_argument('-n', '--no-run', action='store_true') parser.add_argument('-v', '--verbose', action='count', default=0, help="Increase verbosity") parser.add_argument('-q', '--quiet', action='count', default=0, help="Decrease verbosity") cfg_group = parser.add_argument_group('Configuration files (advanced)') cfg_group.add_argument('--prev-cfg', help="Previously used config. Also used to infer path to precalculated BAM files") for name, descr in [("references", "reference sequences"), ("params", "parameters"), ("modules", "modules")]: default = os.path.abspath(os.path.join(CFG_DIR, "{}.yaml".format(name))) cfg_group.add_argument('--{}-cfg'.format(name), default=default, help="Config-file (yaml) for {}. (default: {})".format(descr, default)) # pipeline specific args #parser.add_argument('-1', "--fq1", nargs="+", # help="FastQ file/s (gzip only)." # " Multiple input files supported (auto-sorted)." # " Note: each file (or pair) gets a unique read-group id." # " Collides with --sample-cfg.") #parser.add_argument('-2', "--fq2", nargs="+", # help="FastQ file/s (if paired) (gzip only). See also --fq1") #parser.add_argument('-s', "--sample", # help="Sample name. Collides with --sample-cfg.") #parser.add_argument('-t', "--seqtype", required=True, # choices=['WGS', 'WES', 'targeted'], # help="Sequencing type") #parser.add_argument('-l', "--intervals", # help="Intervals file (e.g. bed file) listing regions of interest." # " Required for WES and targeted sequencing.") args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose) aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value #if args.sample_cfg: # if any([args.fq1, args.fq2, args.sample]): # logger.fatal("Config file overrides fastq and sample input arguments." 
# " Use one or the other") # sys.exit(1) # if not os.path.exists(args.sample_cfg): # logger.fatal("Config file %s does not exist", args.sample_cfg) # sys.exit(1) # samples, readunits = get_samples_and_readunits_from_cfgfile(args.sample_cfg) #else: # if not all([args.fq1, args.sample]): # logger.fatal("Need at least fq1 and sample without config file") # sys.exit(1) # # readunits = get_readunits_from_args(args.fq1, args.fq2) # # all readunits go into this one sample specified on the command-line # samples = dict() # samples[args.sample] = list(readunits.keys()) # #if args.seqtype in ['WES', 'targeted']: # if not args.intervals: # logger.fatal("Analysis of exome and targeted sequence runs requires a bed file") # sys.exit(1) # else: # if not os.path.exists(args.intervals): # logger.fatal("Intervals file %s does not exist", args.sample_cfg) # sys.exit(1) # logger.warning("Compatilibity between interval file and" # " reference not checked")# FIXME with open(args.prev_cfg, 'r') as stream: try: prev_cfg = yaml.load(stream) except yaml.YAMLError as exc: logger.fatal("Error loading %s", REST_CFG) raise #import pdb; pdb.set_trace() #sys.stderr.write("TMP DEBUG {}\n".format(prev_cfg)) # turn arguments into user_data that gets merged into pipeline config # # generic data first user_data = dict() user_data['mail_on_completion'] = not args.no_mail #user_data['readunits'] = prev_cfg['readunits'] user_data['readunits'] = dict()# None won't work #user_data['samples'] = samples user_data['samples'] = prev_cfg['samples'] if args.name: user_data['analysis_name'] = args.name #user_data['seqtype'] = args.seqtype user_data['seqtype'] = 'WGS'# SG10K #user_data['intervals'] = args.intervals# always safe, might be used for WGS as well user_data['intervals'] = None#SG10K user_data['mark_dups'] = None# SG10K doesn't matter user_data['precalc_bam_dir'] = os.path.join( os.path.abspath(os.path.dirname(args.prev_cfg)), "out") pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args.outdir, user_data, site=site, master_q=args.master_q, slave_q=args.slave_q, params_cfgfile=args.params_cfg, modules_cfgfile=args.modules_cfg, refs_cfgfile=args.references_cfg, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() pipeline_handler.submit(args.no_run)
def main(): """main function """ default_parser = default_argparser(CFG_DIR, allow_missing_cfgfile=True) parser = argparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()), parents=[default_parser]) parser._optionals.title = "Arguments" # pipeline specific args parser.add_argument( '-1', "--fq1", nargs="+", help="FastQ file/s (gzip only)." " Multiple input files supported (auto-sorted)." " Note: each file (or pair) gets a unique read-group id." " Collides with --sample-cfg.") parser.add_argument( '-2', "--fq2", nargs="+", help="FastQ file/s (if paired) (gzip only). See also --fq1") parser.add_argument('-s', "--sample", help="Sample name. Collides with --sample-cfg.") parser.add_argument( '-r', "--reffa", required=True, help="Reference genome") # FIXME create local copy for indexing? args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.sample_cfg: if any([args.fq1, args.fq2, args.sample]): logger.fatal("Config file overrides fastq and sample arguments." " Use one or the other") sys.exit(1) if not os.path.exists(args.sample_cfg): logger.fatal("Config file %s does not exist", args.sample_cfg) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile( args.sample_cfg) else: # no sample config, so input is either fastq or existing bam samples = dict() if not args.sample: logger.fatal("Need sample name if not using config file") sys.exit(1) if args.fq1: readunits = get_readunits_from_args(args.fq1, args.fq2) # all readunits go into this one sample specified on the command-line samples[args.sample] = list(readunits.keys()) else: logger.fatal( "Need at least one fastq files as argument if not using config file" ) sys.exit(1) for ru in readunits.values(): assert ru['fq2'], ( "FastQ R2 missing, but assemblers assume paired-end reads") # turn arguments into cfg_dict (gets merged with other configs late) # cfg_dict = dict() cfg_dict['readunits'] = readunits cfg_dict['samples'] = samples assert os.path.exists(args.reffa) # FIXME only works because yaml missing and thus not overwritten cfg_dict['references'] = {'genome': os.path.abspath(args.reffa)} pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() pipeline_handler.submit(args.no_run)
def main(): """main function """ default_parser = default_argparser(CFG_DIR) parser = argparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()), parents=[default_parser]) parser._optionals.title = "Arguments" # pipeline specific args parser.add_argument( '-1', "--fq1", nargs="+", help="FastQ file/s (gzip only)." " Multiple input files supported (auto-sorted)." " Note: each file (or pair) gets a unique read-group id." " Collides with --sample-cfg.") parser.add_argument( '-2', "--fq2", nargs="+", help="FastQ file/s (if paired) (gzip only). See also --fq1") parser.add_argument('-s', "--sample", help="Sample name. Collides with --sample-cfg.") parser.add_argument('-C', "--cuffdiff", action='store_true', dest="run_cuffdiff", help="Also run cuffdiff") parser.add_argument('-S', '--stranded', action='store_true', help="Stranded library prep (default is unstranded)") args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.sample_cfg: if any([args.fq1, args.fq2, args.sample]): logger.fatal( "Config file overrides fastq and sample input arguments." " Use one or the other") sys.exit(1) if not os.path.exists(args.sample_cfg): logger.fatal("Config file %s does not exist", args.sample_cfg) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile( args.sample_cfg) else: if not all([args.fq1, args.sample]): logger.fatal("Need at least fq1 and sample without config file") sys.exit(1) readunits = get_readunits_from_args(args.fq1, args.fq2) # all readunits go into this one sample specified on the command-line samples = dict() samples[args.sample] = list(readunits.keys()) # FIXME checks on reffa index (currently not exposed via args) # turn arguments into cfg_dict that gets merged into pipeline config # cfg_dict = dict() cfg_dict['readunits'] = readunits cfg_dict['samples'] = samples cfg_dict['stranded'] = args.stranded cfg_dict['run_cuffdiff'] = args.run_cuffdiff cfg_dict['paired_end'] = any(ru.get('fq2') for ru in readunits.values()) if cfg_dict['paired_end']: assert all(ru.get('fq2') for ru in readunits.values()), ( "Can't handle mix of paired-end and single-end") pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() pipeline_handler.submit(args.no_run)
def main(): """main function """ default_parser = default_argparser(CFG_DIR, with_readunits=True) parser = configargparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()), parents=[default_parser]) parser._optionals.title = "Arguments" # pipeline specific args default = ['kraken', 'metaphlan2'] parser.add_argument("-p", "--profilers", nargs='+', default=default, help="Profilers to run (default = {}".format( ", ".join(default))) args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.sample_cfg: if any([args.fq1, args.fq2, args.sample]): logger.fatal("Config file overrides fastq and sample arguments." " Use one or the other") sys.exit(1) if not os.path.exists(args.sample_cfg): logger.fatal("Config file %s does not exist", args.sample_cfg) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile( args.sample_cfg) else: # no sample config, so input is either fastq or existing bam samples = dict() if not args.sample: logger.fatal("Need sample name if not using config file") sys.exit(1) if args.fq1: readunits = get_readunits_from_args(args.fq1, args.fq2) # all readunits go into this one sample specified on the command-line samples[args.sample] = list(readunits.keys()) else: logger.fatal( "Need at least one fastq files as argument if not using config file" ) sys.exit(1) for ru in readunits.values(): assert ru['fq2'], ( "FastQ R2 missing, but pipelines requires paired-end reads") # turn arguments into cfg_dict (gets merged with other configs late) # cfg_dict = dict() cfg_dict['readunits'] = readunits cfg_dict['samples'] = samples cfg_dict['profilers'] = args.profilers pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() pipeline_handler.submit(args.no_run)
def main(): """main function """ default_parser = default_argparser(CFG_DIR, with_readunits=True) parser = configargparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()), parents=[default_parser]) parser._optionals.title = "Arguments" # pipeline specific args parser.add_argument('-t', "--seqtype", required=True, choices=['WGS', 'WES', 'targeted'], help="Sequencing type") parser.add_argument('-l', "--bed", help="Bed file listing regions of interest." " Required for WES and targeted sequencing.") parser.add_argument('-D', '--dont-mark-dups', action='store_true', help="Don't mark duplicate reads") # raw bam not possible because the pipeline splits on the fly into chromosomes parser.add_argument( '--proc-bam', help="Advanced: Injects processed BAM (overwrites fq options)." " WARNING: reference and pre-processing need to match pipeline requirements" ) parser.add_argument('--bam-only', action='store_true', help="Don't call variants, just process BAM file") args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.sample_cfg: if any([args.fq1, args.fq2, args.sample, args.proc_bam]): logger.fatal("Config file overrides fastq and sample arguments." 
" Use one or the other") sys.exit(1) if not os.path.exists(args.sample_cfg): logger.fatal("Config file %s does not exist", args.sample_cfg) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile( args.sample_cfg) else: # no sample config, so input is either fastq or existing bam samples = dict() if not args.sample: logger.fatal("Need sample name if not using config file") sys.exit(1) if args.proc_bam: assert not args.fq1, ("BAM injection overwrites fastq arguments") if args.proc_bam: assert os.path.exists(args.proc_bam) readunits = dict() samples[args.sample] = [] elif args.fq1: readunits = get_readunits_from_args(args.fq1, args.fq2) # all readunits go into this one sample specified on the command-line samples[args.sample] = list(readunits.keys()) else: logger.fatal( "Need at least one fastq files as argument if not using config file" ) sys.exit(1) if args.seqtype in ['WES', 'targeted']: if not args.bed: logger.fatal( "Analysis of exome and targeted sequence runs requires a bed file" ) sys.exit(1) else: if not os.path.exists(args.bed): logger.fatal("Bed file %s does not exist", args.sample_cfg) sys.exit(1) # turn arguments into cfg_dict (gets merged with other configs late) # cfg_dict = dict() cfg_dict['readunits'] = readunits cfg_dict['samples'] = samples cfg_dict['seqtype'] = args.seqtype cfg_dict['intervals'] = os.path.abspath( args.bed ) if args.bed else None # always safe, might be used for WGS as well cfg_dict['mark_dups'] = not args.dont_mark_dups cfg_dict['bam_only'] = args.bam_only pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() # Inject existing BAM by symlinking (everything upstream is temporary anyway) # WARNING: filename has to match definition in Snakefile! if args.proc_bam: target = os.path.join(args.outdir, "out", args.sample, "{}.bwamem.lofreq".format(args.sample)) if cfg_dict['mark_dups']: target += ".dedup" target += ".lacer.bam" os.makedirs(os.path.dirname(target)) os.symlink(os.path.abspath(args.proc_bam), target) pipeline_handler.submit(args.no_run)
def main(): """main function """ default_parser = default_argparser(CFG_DIR, with_readunits=True) parser = configargparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()), parents=[default_parser]) parser._optionals.title = "Arguments" # pipeline specific args #/ args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.sample_cfg: if any([args.fq1, args.fq2, args.sample]): logger.fatal( "Config file overrides fastq and sample input arguments." " Use one or the other") sys.exit(1) if not os.path.exists(args.sample_cfg): logger.fatal("Config file %s does not exist", args.sample_cfg) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile( args.sample_cfg) else: if not all([args.fq1, args.sample]): logger.fatal("Need at least fq1 and sample without config file") sys.exit(1) readunits = get_readunits_from_args(args.fq1, args.fq2) # all readunits go into this one sample specified on the command-line samples = dict() samples[args.sample] = list(readunits.keys()) # FIXME implement checks on reffa index (currently not exposed via args) # turn arguments into cfg_dict that gets merged into pipeline config # cfg_dict = dict() cfg_dict['readunits'] = readunits cfg_dict['samples'] = samples pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() pipeline_handler.submit(args.no_run)
def main(): """main function """ parser = argparse.ArgumentParser( description=__doc__.format(PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()) ) # generic args parser.add_argument("-o", "--outdir", required=True, help="Output directory (may not exist)") parser.add_argument("--name", help="Give this analysis run a name (used in email and report)") parser.add_argument("--no-mail", action="store_true", help="Don't send mail on completion") # site = get_site() default = get_default_queue("slave") parser.add_argument( "-w", "--slave-q", default=default, help="Queue to use for slave jobs (default: {})".format(default) ) default = get_default_queue("master") parser.add_argument( "-m", "--master-q", default=default, help="Queue to use for master job (default: {})".format(default) ) parser.add_argument("-n", "--no-run", action="store_true") parser.add_argument("-v", "--verbose", action="count", default=0, help="Increase verbosity") parser.add_argument("-q", "--quiet", action="count", default=0, help="Decrease verbosity") cfg_group = parser.add_argument_group("Configuration files (advanced)") cfg_group.add_argument( "--sample-cfg", help="Config-file (YAML) listing samples and readunits." " Collides with -1, -2 and -s" ) for name, descr in [("references", "reference sequences"), ("params", "parameters"), ("modules", "modules")]: default = os.path.abspath(os.path.join(CFG_DIR, "{}.yaml".format(name))) cfg_group.add_argument( "--{}-cfg".format(name), default=default, help="Config-file (yaml) for {}. (default: {})".format(descr, default), ) # pipeline specific args parser.add_argument( "--normal-fq1", nargs="+", help="Normal FastQ file/s (gzip only)." " Multiple input files supported (auto-sorted)." " Note: each file (or pair) gets a unique read-group id." " Collides with --sample-cfg.", ) parser.add_argument("--normal-fq2", nargs="+", help="Normal FastQ file/s (if paired) (gzip only). See also --fq1") parser.add_argument( "--tumor-fq1", nargs="+", help="Tumor FastQ file/s (gzip only)." " Multiple input files supported (auto-sorted)." " Note: each file (or pair) gets a unique read-group id." " Collides with --sample-cfg.", ) parser.add_argument("--tumor-fq2", nargs="+", help="Tumor FastQ file/s (if paired) (gzip only). See also --fq1") parser.add_argument("-t", "--seqtype", required=True, choices=["WGS", "WES", "targeted"], help="Sequencing type") parser.add_argument( "-l", "--intervals", help="Intervals file (e.g. bed file) listing regions of interest." " Required for WES and targeted sequencing.", ) parser.add_argument("-D", "--dont-mark-dups", action="store_true", help="Don't mark duplicate reads") parser.add_argument( "--normal-bam", help="Advanced: Injects normal BAM (overwrites normal-fq options)." " WARNING: reference and postprocessing need to match pipeline requirements", ) parser.add_argument( "--tumor-bam", help="Advanced: Injects tumor BAM (overwrites tumor-fq options)." " WARNING: reference and postprocessing need to match pipeline requirements", ) args = parser.parse_args() # Repeateable -v and -q for setting logging level. 
# See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.sample_cfg: if any([args.normal_fq1, args.normal_fq2, args.tumor_fq1, args.tumor_fq2, args.normal_bam, args.tumor_bam]): logger.fatal("Config file overrides fastq and sample input arguments." " Use one or the other") sys.exit(1) if not os.path.exists(args.sample_cfg): logger.fatal("Config file %s does not exist", args.sample_cfg) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile(args.sample_cfg) else: samples = dict() if args.normal_bam: normal_readunits = dict() samples["normal"] = [] assert os.path.exists(args.normal_bam) else: if not all([args.normal_fq1, args.tumor_fq1]): logger.fatal("Need at least normal and tumor fq1 without config file") sys.exit(1) normal_readunits = get_readunits_from_args(args.normal_fq1, args.normal_fq2) samples["normal"] = list(normal_readunits.keys()) if args.tumor_bam: tumor_readunits = dict() samples["tumor"] = [] assert os.path.exists(args.tumor_bam) else: tumor_readunits = get_readunits_from_args(args.tumor_fq1, args.tumor_fq2) samples["tumor"] = list(tumor_readunits.keys()) readunits = dict(normal_readunits) readunits.update(tumor_readunits) assert sorted(samples) == sorted(["normal", "tumor"]) # FIXME how to do the reference checks here: # if not os.path.exists(reffa): # logger.fatal("Reference '%s' doesn't exist", reffa) # sys.exit(1) # # for p in ['bwa', 'samtools']: # if not ref_is_indexed(reffa, p): # logger.fatal("Reference '%s' doesn't appear to be indexed with %s", reffa, p) # sys.exit(1) if args.seqtype in ["WES", "targeted"]: if not args.intervals: logger.fatal("Analysis of exome and targeted sequence runs requires a bed file") sys.exit(1) else: if not os.path.exists(args.intervals): logger.fatal("Intervals file %s does not exist", args.intervals) sys.exit(1) logger.warning("Compatibility between interval file and" " reference not checked") # FIXME # turn arguments into user_data that gets merged into pipeline config # # generic data first user_data = dict() user_data["mail_on_completion"] = not args.no_mail user_data["readunits"] = readunits user_data["samples"] = samples if args.name: user_data["analysis_name"] = args.name user_data["seqtype"] = args.seqtype user_data["intervals"] = args.intervals user_data["mark_dups"] = not args.dont_mark_dups pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args.outdir, user_data, master_q=args.master_q, slave_q=args.slave_q, params_cfgfile=args.params_cfg, modules_cfgfile=args.modules_cfg, refs_cfgfile=args.references_cfg, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR), ) pipeline_handler.setup_env() # inject existing BAM by symlinking (everything upstream is temporary anyway) for sample, bam in [("normal", args.normal_bam), ("tumor", args.tumor_bam)]: if bam: # target as defined in Snakefile!
target = os.path.join(args.outdir, "out", sample, "{}.bwamem.lofreq.dedup.lacer.bam".format(sample)) os.makedirs(os.path.dirname(target)) os.symlink(os.path.abspath(bam), target) pipeline_handler.submit(args.no_run)
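# Toy illustration of how the somatic wrapper above combines per-sample
# readunits into one dict while keeping the fixed 'normal'/'tumor' sample
# keys; the input dicts are placeholders.
def _example_merge_somatic_readunits(normal_readunits, tumor_readunits):
    """Merge normal and tumor readunits and build the samples mapping."""
    samples = {"normal": list(normal_readunits.keys()),
               "tumor": list(tumor_readunits.keys())}
    readunits = dict(normal_readunits)
    readunits.update(tumor_readunits)
    assert sorted(samples) == sorted(["normal", "tumor"])
    return samples, readunits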
def main(): """main function """ # FIXME ugly and code duplication in bcl2fastq_dbupdate.py mongo_status_script = os.path.abspath(os.path.join( os.path.dirname(sys.argv[0]), "mongo_status.py")) assert os.path.exists(mongo_status_script) parser = argparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version())) parser.add_argument('-r', "--runid", help="Run ID plus flowcell ID (clashes with -d)") parser.add_argument('-d', "--rundir", help="BCL input directory (clashes with -r)") parser.add_argument('-o', "--outdir", help="Output directory (must not exist; required if called by user)") parser.add_argument('-t', "--testing", action='store_true', help="Use MongoDB test server") parser.add_argument('--no-archive', action='store_true', help="Don't archieve this analysis") parser.add_argument('--no-mail', action='store_true', help="Don't send mail on completion") site = get_site() default = DEFAULT_SLAVE_Q.get(site, None) parser.add_argument('-w', '--slave-q', default=default, help="Queue to use for slave jobs (default: {})".format(default)) default = DEFAULT_MASTER_Q.get(site, None) parser.add_argument('-m', '--master-q', default=default, help="Queue to use for master job (default: {})".format(default)) parser.add_argument('-l', '--lanes', type=int, nargs="*", help="Limit run to given lane/s (multiples separated by space") parser.add_argument('-i', '--mismatches', type=int, help="Max. number of allowed barcode mismatches (0>=x<=2)" " setting a value here overrides the default settings read from ELM)") parser.add_argument('-n', '--no-run', action='store_true') parser.add_argument('-v', '--verbose', action='count', default=0, help="Increase verbosity") parser.add_argument('-q', '--quiet', action='count', default=0, help="Decrease verbosity") args = parser.parse_args() # Repeateable -v and -q for setting logging level. 
# See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose) aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose) if args.mismatches is not None: if args.mismatches > 2 or args.mismatches < 0: logger.fatal("Number of mismatches must be between 0-2") sys.exit(1) lane_info = '' lane_nos = [] if args.lanes: lane_info = '--tiles ' for lane in args.lanes: if lane > 8 or lane < 1: logger.fatal("Lane number must be between 1-8") sys.exit(1) else: lane_info += 's_{}'.format(lane)+',' lane_info = lane_info.rstrip() lane_info = lane_info[:-1] lane_nos = list(args.lanes) if args.runid and args.rundir: logger.fatal("Cannot use run-id and input directory arguments simultaneously") sys.exit(1) elif args.runid: rundir = run_folder_for_run_id(args.runid) elif args.rundir: rundir = os.path.abspath(args.rundir) else: logger.fatal("Need either run-id or input directory") sys.exit(1) if not os.path.exists(rundir): logger.fatal("Expected run directory {} does not exist".format(rundir)) logger.info("Rundir is {}".format(rundir)) if not args.outdir: outdir = get_bcl2fastq_outdir(args.runid) else: outdir = args.outdir if os.path.exists(outdir): logger.fatal("Output directory %s already exists", outdir) sys.exit(1) # create now so that generate_bcl2fastq_cfg.py can run os.makedirs(outdir) # catch cases where rundir was user provided and looks weird try: _, runid, flowcellid = get_machine_run_flowcell_id(rundir) run_num = runid + "_" + flowcellid except: run_num = "UNKNOWN-" + rundir.split("/")[-1] # call generate_bcl2fastq_cfg # # FIXME ugly assumes same directory (just like import above). better to import and run main()? generate_bcl2fastq = os.path.join( os.path.dirname(sys.argv[0]), "generate_bcl2fastq_cfg.py") assert os.path.exists(generate_bcl2fastq) cmd = [generate_bcl2fastq, '-r', rundir, '-o', outdir] if args.testing: cmd.append("-t") logger.debug("Executing {}".format(' ' .join(cmd))) try: res = subprocess.check_output(cmd, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: logger.fatal("The following command failed with return code {}: {}".format( e.returncode, ' '.join(cmd))) logger.fatal("Output: {}".format(e.output.decode())) logger.fatal("Exiting") sys.exit(1) # generate_bcl2fastq is normally quiet. 
# If there's output, make the caller aware of it # use sys instead of logger to avoid double logging if res: sys.stderr.write(res.decode()) # just created files muxinfo_cfg = os.path.join(outdir, MUXINFO_CFG) samplesheet_csv = os.path.join(outdir, SAMPLESHEET_CSV) usebases_cfg = os.path.join(outdir, USEBASES_CFG) # NOTE: signal for failed runs is exit 0 from generate_bcl2fastq and missing output files # if any([not os.path.exists(x) for x in [muxinfo_cfg, samplesheet_csv, usebases_cfg]]): # one missing means all should be missing assert all([not os.path.exists(x) for x in [muxinfo_cfg, samplesheet_csv, usebases_cfg]]) seqrunfailed(mongo_status_script, run_num, outdir, args.testing) sys.exit(0) # turn arguments into user_data that gets merged into pipeline config user_data = {'rundir': rundir, 'lanes_arg': lane_info, 'samplesheet_csv': samplesheet_csv, 'no_archive': args.no_archive, 'mail_on_completion': not args.no_mail, 'run_num': run_num} usebases_arg = '' with open(usebases_cfg, 'r') as stream: try: d = yaml.safe_load(stream) assert 'usebases' in d assert len(d) == 1  # make sure usebases is the only key for ub in d['usebases']: usebases_arg += '--use-bases-mask {} '.format(ub) except yaml.YAMLError as exc: logger.fatal(exc) raise user_data['usebases_arg'] = usebases_arg os.unlink(usebases_cfg) mux_units = get_mux_units_from_cfgfile(muxinfo_cfg, lane_nos) if args.mismatches is not None: mux_units = [mu._replace(barcode_mismatches=args.mismatches) for mu in mux_units] os.unlink(muxinfo_cfg) user_data['units'] = dict() for mu in mux_units: # special case: mux split across multiple lanes. make lanes a list # and add in extra lanes if needed. k = mu.mux_dir mu_dict = dict(mu._asdict()) user_data['units'][k] = mu_dict # create mongodb update command, used later, after queueing mongo_update_cmd = "{} -r {} -s STARTED".format(mongo_status_script, user_data['run_num']) mongo_update_cmd += " -a $ANALYSIS_ID -o {}".format(outdir)# set in run.sh if args.testing: mongo_update_cmd += " -t" # NOTE: bcl2fastq has a special run template, so we need to # interfere with the default pipeline_handler. plenty of # opportunity to shoot yourself in the foot pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, outdir, user_data, site=site, master_q=args.master_q, slave_q=args.slave_q) # use local run template pipeline_handler.run_template = os.path.join( PIPELINE_BASEDIR, "run.template.{}.sh".format(pipeline_handler.site)) assert os.path.exists(pipeline_handler.run_template) pipeline_handler.setup_env() # final mongo update line in run_out tmp_run_out = pipeline_handler.run_out + ".tmp" with open(pipeline_handler.run_out) as fh_in, \ open(tmp_run_out, 'w') as fh_out: for line in fh_in: line = line.replace("@MONGO_UPDATE_CMD@", mongo_update_cmd) fh_out.write(line) shutil.move(tmp_run_out, pipeline_handler.run_out) pipeline_handler.submit(args.no_run)
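# Hedged, slightly tidier sketch of the '--tiles' lane restriction string
# assembled earlier in this function (the build-then-trim loop replaced by a
# join); lane numbers are examples.
def _example_lanes_arg(lanes):
    """Turn lane numbers (1-8) into a bcl2fastq --tiles argument,
    e.g. [1, 2] -> '--tiles s_1,s_2'."""
    for lane in lanes:
        assert 1 <= lane <= 8, "Lane number must be between 1-8"
    return "--tiles " + ",".join("s_{}".format(lane) for lane in lanes)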
def main(): """main function """ default_parser = default_argparser(CFG_DIR) parser = argparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()), parents=[default_parser]) parser._optionals.title = "Arguments" # pipeline specific args parser.add_argument('-1', "--fq1", nargs="+", help="FastQ file/s (gzip only)." " Multiple input files supported (auto-sorted)." " Note: each file (or pair) gets a unique read-group id." " Collides with --sample-cfg.") parser.add_argument('-2', "--fq2", nargs="+", help="FastQ file/s (if paired) (gzip only). See also --fq1") parser.add_argument('-s', "--sample", help="Sample name. Collides with --sample-cfg.") parser.add_argument('-D', '--dont-mark-dups', action='store_true', help="Don't mark duplicate reads") args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose) aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.sample_cfg: if any([args.fq1, args.fq2, args.sample]): logger.fatal("Config file overrides fastq and sample input arguments." " Use one or the other") sys.exit(1) if not os.path.exists(args.sample_cfg): logger.fatal("Config file %s does not exist", args.sample_cfg) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile(args.sample_cfg) else: if not all([args.fq1, args.sample]): logger.fatal("Need at least fq1 and sample without config file") sys.exit(1) readunits = get_readunits_from_args(args.fq1, args.fq2) # all readunits go into this one sample specified on the command-line samples = dict() samples[args.sample] = list(readunits.keys()) # turn arguments into cfg_dict that gets merged into pipeline config # cfg_dict = dict() cfg_dict['readunits'] = readunits cfg_dict['samples'] = samples cfg_dict['mark_dups'] = not args.dont_mark_dups # create mongodb update command, used later, after submission #mongo_update_cmd = "true"{} -r {} -s STARTED".format(mongo_status_script, cfg_dict['run_num']) #mongo_update_cmd += " -a $ANALYSIS_ID -o {}".format(outdir)# set in run.sh #if args.testing: # mongo_update_cmd += " -t" pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() pipeline_handler.submit(args.no_run)
def main(): """main function """ default_parser = default_argparser(CFG_DIR, with_readunits=True) parser = configargparse.ArgumentParser(description=__doc__.format( PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()), parents=[default_parser]) parser._optionals.title = "Arguments" # pipeline specific args #/ parser.add_argument('-c', "--cell-barcodes", required=True, help="File listing cell barcodes") d = 200 parser.add_argument( "--frag-len", default=d, type=int, help="Estimated fragment length (default={})".format(d)) d = 20.0 parser.add_argument( '--frag-len-sd', default=d, type=float, help="Estimated fragment length standard deviation (default={})". format(d)) parser.add_argument( '--dedup', action="store_true", help="Run UMI-based deduplication (slow for large data-sets!)") args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) if os.path.exists(args.outdir): logger.fatal("Output directory %s already exists", args.outdir) sys.exit(1) # samples is a dictionary with sample names as key (mostly just # one) and readunit keys as value. readunits is a dict with # readunits (think: fastq pairs with attributes) as value if args.sample_cfg: if any([args.fq1, args.fq2, args.sample]): logger.fatal( "Config file overrides fastq and sample input arguments." " Use one or the other") sys.exit(1) if not os.path.exists(args.sample_cfg): logger.fatal("Config file '%s' does not exist", args.sample_cfg) sys.exit(1) samples, readunits = get_samples_and_readunits_from_cfgfile( args.sample_cfg) else: if not all([args.fq1, args.sample]): logger.fatal("Need at least fq1 and sample without config file") sys.exit(1) readunits = get_readunits_from_args(args.fq1, args.fq2) # all readunits go into this one sample specified on the command-line samples = dict() samples[args.sample] = list(readunits.keys()) for ru in readunits.values(): assert ru['fq2'], ( "FastQ R2 missing, but pipeline requires paired-end reads") # turn arguments into cfg_dict that gets merged into pipeline config # cfg_dict = dict() cfg_dict['readunits'] = readunits cfg_dict['samples'] = samples if not os.path.exists(args.cell_barcodes): logger.fatal("Cellular barcodes file '%s' does not exist", args.cell_barcodes) sys.exit(1) cfg_dict['cell_barcodes'] = os.path.abspath(args.cell_barcodes) cfg_dict['frag_len'] = args.frag_len cfg_dict['frag_len_sd'] = args.frag_len_sd cfg_dict['no_dedup'] = not args.dedup cfg_dict['scrnapipe_transform'] = os.path.abspath( os.path.join(PIPELINE_BASEDIR, 'aux/transform.json')) cfg_dict['scrna_conf_template'] = os.path.abspath( os.path.join(PIPELINE_BASEDIR, 'aux/scrna.conf.template')) cfg_dict['adapters'] = os.path.abspath( os.path.join(PIPELINE_BASEDIR, 'aux/adapters.fa')) pipeline_handler = PipelineHandler( PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict, cluster_cfgfile=get_cluster_cfgfile(CFG_DIR)) pipeline_handler.setup_env() pipeline_handler.submit(args.no_run)