def run_folder_for_run_id(runid_and_flowcellid, site=None, basedir_map=SEQDIR_BASE):
    """Derive the run folder (AKA $RAWSEQDIR) from a run id that contains the flowcell id.

    >>> run_folder_for_run_id('HS004-PE-R00139_BC6A7HANXX')
    '/mnt/seq/userrig/HS004/HS004-PE-R00139_BC6A7HANXX'

    Machines with ids starting with MS00 (MiSeq) get an extra MiSeqOutput path component.
    """
    if not site:
        site = get_site()
    if site not in basedir_map:
        raise ValueError(site)
    basedir = basedir_map[site]
    machineid, runid, flowcellid = get_machine_run_flowcell_id(
        runid_and_flowcellid)
    if machineid.startswith('MS00'):
        # FIXME untested and unclear for NSCC
        rundir = "{}/{}/MiSeqOutput/{}_{}".format(basedir, machineid, runid, flowcellid)
    else:
        rundir = "{}/{}/{}_{}".format(basedir, machineid, runid, flowcellid)
    return rundir
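# Usage sketch only (not part of the pipeline code): shows how a raw run
# directory might be derived for one run. The run id is the example from the
# docstring above; the resulting path depends on SEQDIR_BASE for the current site.
def _example_run_folder():
    rundir = run_folder_for_run_id('HS004-PE-R00139_BC6A7HANXX')
    print(rundir)  # e.g. /mnt/seq/userrig/HS004/HS004-PE-R00139_BC6A7HANXX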
def get_sample_info(child, rows, mux_analysis_list, mux_id, fastq_data_dir,
                    run_num_flowcell, sample_info):
    """Collect sample info from the ELM JSON"""
    sample_cfg = {}
    site = get_site()
    ctime, _ = generate_window(1)
    _, _, flowcellid = get_machine_run_flowcell_id(run_num_flowcell)
    mux_analysis_list.add(mux_id)
    sample_id = child['libraryId']
    sample_cfg['requestor'] = rows['requestor']
    sample_cfg['ctime'] = ctime
    sample_cfg['site'] = site
    try:
        sample_cfg['pipeline_name'] = legacy_mapper['pipeline_mapper'][
            child['Analysis']]
    except KeyError as e:
        sample_cfg['pipeline_name'] = child['Analysis']
        logger.warning(str(e) + " Pipeline not mapped to newer version")
        return sample_info
    pipeline_version = get_pipeline_version(child['pipeline_version'] \
        if 'pipeline_version' in rows else None)
    sample_cfg['pipeline_version'] = pipeline_version
    #sample_cfg['pipeline_params'] = 'params'
    ref_info = get_reference_info(child['Analysis'],
                                  sample_cfg['pipeline_version'], child['genome'])
    if not ref_info:
        logger.info("ref_info not available")
        return sample_info
    cmdline_info = get_cmdline_info(child)
    sample_cfg['references_cfg'] = ref_info
    if cmdline_info:
        sample_cfg['cmdline'] = cmdline_info
    readunits_dict = {}
    status, fq1, fq2 = check_fastq(fastq_data_dir, child['libraryId'],
                                   rows['laneId'])
    if status:
        ru = ReadUnit(run_num_flowcell, flowcellid, child['libraryId'],
                      rows['laneId'], None, fq1, fq2)
        k = key_for_readunit(ru)
        readunits_dict[k] = dict(ru._asdict())
        sample_cfg['readunits'] = readunits_dict
    if sample_info.get(sample_id, {}).get('readunits', {}):
        sample_info[sample_id]['readunits'].update(readunits_dict)
    else:
        sample_info[sample_id] = sample_cfg
    return sample_info
def mongodb_conn(use_test_server=False):
    """Return connection to MongoDB server"""
    site = get_site()
    assert site in mongo_conns
    if use_test_server:
        logger.info("Using test MongoDB server")
        constr = mongo_conns[site]['test']
    else:
        logger.info("Using production MongoDB server")
        constr = mongo_conns[site]['production']
    try:
        connection = pymongo.MongoClient(constr)
    except pymongo.errors.ConnectionFailure:
        logger.fatal("Could not connect to the MongoDB server")
        return None
    logger.debug("Database connection established")
    return connection
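# Usage sketch only: query recent pipeline runs through mongodb_conn(). The
# collection gisds.pipeline_runs and the ctime window query mirror the
# dbupdate main() further down; the 14-day window is just an example value.
def _example_mongodb_query():
    connection = mongodb_conn(use_test_server=True)
    if connection is None:
        return
    dbcol = connection.gisds.pipeline_runs
    epoch_now, epoch_then = generate_window(14)  # look back 14 days
    for job in dbcol.find({"ctime": {"$gt": epoch_then, "$lt": epoch_now}}):
        print(job['_id'])
    connection.close()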
def get_downstream_outdir(requestor, pipeline_version, pipeline_name,
                          site=None, basedir_map=OUTDIR_BASE,
                          base_pipelinedir_map=PRODUCTION_PIPELINE_VERSION):
    """generate downstream output directory"""
    if not site:
        site = get_site()
    if site not in basedir_map:
        raise ValueError(site)
    if site not in base_pipelinedir_map:
        raise ValueError(site)
    if is_devel_version():
        basedir = basedir_map[site]['devel']
        if not pipeline_version:
            pipeline_version = base_pipelinedir_map[site]['devel']
    else:
        basedir = basedir_map[site]['production']
        if not pipeline_version:
            pipeline_version = os.readlink(base_pipelinedir_map[site]['production'])
    outdir = "{basedir}/{requestor}/{pversion}/{pname}/{ts}".format(
        basedir=basedir, requestor=requestor, pversion=pipeline_version,
        pname=pipeline_name, ts=generate_timestamp())
    return outdir
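# Usage sketch only: the requestor, version and pipeline name below are made
# up for illustration; the base directory comes from OUTDIR_BASE for the site.
def _example_downstream_outdir():
    outdir = get_downstream_outdir("userrig", "1.0.0", "variant-calling-gatk")
    print(outdir)  # <basedir>/userrig/1.0.0/variant-calling-gatk/<timestamp>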
def get_bcl2fastq_outdir(runid_and_flowcellid, site=None, basedir_map=OUTDIR_BASE):
    """Generate the bcl2fastq output directory for a run"""
    if not site:
        site = get_site()
    if site not in basedir_map:
        raise ValueError(site)
    if is_devel_version():
        basedir = basedir_map[site]['devel']
    else:
        basedir = basedir_map[site]['production']
    machineid, runid, flowcellid = get_machine_run_flowcell_id(
        runid_and_flowcellid)
    outdir = "{basedir}/{mid}/{rid}_{fid}/bcl2fastq_{ts}".format(
        basedir=basedir, mid=machineid, rid=runid, fid=flowcellid,
        ts=generate_timestamp())
    return outdir
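# Usage sketch only: reuses the run id from the run_folder_for_run_id
# docstring; the exact path produced depends on OUTDIR_BASE and the site.
def _example_bcl2fastq_outdir():
    outdir = get_bcl2fastq_outdir("HS004-PE-R00139_BC6A7HANXX")
    print(outdir)  # <basedir>/<machineid>/<runid>_<flowcellid>/bcl2fastq_<timestamp>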
def get_reference_info(analysis, pipeline_version, ref, site=None):
    """Look up the reference yaml for each library"""
    if not site:
        site = get_site()
    basedir = legacy_mapper['cronjob_base'][site]
    if ref == 'human_g1k_v37':
        ref = 'b37'
    try:
        new_analysis = legacy_mapper['pipeline_mapper'][analysis]
    except KeyError as e:
        logger.warning(str(e) + " Pipeline not mapped to newer version")
        return None
    ref_info = glob.glob(os.path.join(basedir, pipeline_version,
                                      new_analysis, 'cfg', '*' + ref + '*.yaml'))
    if ref_info:
        with open(ref_info[0], 'r') as f:
            try:
                doc = yaml.load(f)
                return doc
            except yaml.YAMLError as exc:
                print(exc)
                return None
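# Usage sketch only: the analysis label and pipeline version are illustrative
# (not taken from ELM); 'human_g1k_v37' is the genome name handled above and
# gets mapped to 'b37' before the cfg glob.
def _example_reference_info():
    ref_cfg = get_reference_info("Whole Genome Sequencing", "1.0.0", "human_g1k_v37")
    if ref_cfg:
        print(sorted(ref_cfg.keys()))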
def main():
    """main function"""
    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()))

    # generic args
    parser.add_argument('-c', "--config",
                        help="Config file (YAML) listing samples and readunits."
                        " Collides with -1, -2 and -s")
    parser.add_argument('-o', "--outdir", required=True,
                        help="Output directory (must not exist)")
    parser.add_argument('--no-mail', action='store_true',
                        help="Don't send mail on completion")
    site = get_site()
    default = DEFAULT_SLAVE_Q.get(site, None)
    parser.add_argument('-w', '--slave-q', default=default,
                        help="Queue to use for slave jobs (default: {})".format(default))
    default = DEFAULT_MASTER_Q.get(site, None)
    parser.add_argument('-m', '--master-q', default=default,
                        help="Queue to use for master job (default: {})".format(default))
    parser.add_argument('-n', '--no-run', action='store_true')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")

    # pipeline specific args
    parser.add_argument('-1', "--fq1", nargs="+",
                        help="FastQ file/s (gzip only)."
                        " Multiple input files supported (auto-sorted)."
                        " Note: each file (or pair) gets a unique read-group id."
                        " Collides with -c.")
    parser.add_argument('-2', "--fq2", nargs="+",
                        help="FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument('-s', "--sample",
                        help="Sample name. Collides with -c.")
    fake_pipeline_handler = PipelineHandler("FAKE", PIPELINE_BASEDIR, "FAKE", None)
    default_cfg = fake_pipeline_handler.read_default_config()
    default = default_cfg['references']['genome']
    parser.add_argument('-r', "--reffa", default=default,
                        help=argparse.SUPPRESS)
    # WARN do not change. this is just to set args.reffa (used later).
    # any change here would require changes in dbsnp, hapmap, g1k, omni and mills as well
    parser.add_argument('-t', "--seqtype", required=True,
                        choices=['WGS', 'WES', 'targeted'],
                        help="Sequencing type")
    parser.add_argument('-l', "--intervals",
                        help="Intervals file (e.g. bed file) listing regions of interest."
                        " Required for WES and targeted sequencing.")
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.config:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal("Config file overrides fastq and sample input arguments."
                         " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.config):
            logger.fatal("Config file %s does not exist", args.config)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(args.config)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)
        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    if args.seqtype in ['WES', 'targeted']:
        if not args.intervals:
            logger.fatal("Analysis of exome and targeted sequence runs requires a bed file")
            sys.exit(1)
        else:
            if not os.path.exists(args.intervals):
                logger.fatal("Intervals file %s does not exist", args.intervals)
                sys.exit(1)
            logger.warning("Compatibility between interval file and"
                           " reference not checked")  # FIXME

    # turn arguments into user_data that gets merged into pipeline config
    #
    # generic data first
    user_data = dict()
    user_data['mail_on_completion'] = not args.no_mail
    user_data['readunits'] = readunits
    user_data['samples'] = samples
    user_data['num_chroms'] = len(list(chroms_and_lens_from_from_fasta(args.reffa)))
    user_data['seqtype'] = args.seqtype
    user_data['intervals'] = args.intervals  # always safe, might be used for WGS as well
    user_data['mark_dups'] = MARK_DUPS

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR,
        args.outdir, user_data,
        site=site,
        master_q=args.master_q,
        slave_q=args.slave_q)
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
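# Illustration only: the shape of the samples/readunits dicts described in the
# comment above. The readunit keys shown here are an assumption inferred from
# the positional ReadUnit(...) calls elsewhere in this module; all values are
# made up.
def _example_samples_and_readunits():
    readunits = {
        "unit-1": {
            "run_id": "HS004-PE-R00139_BC6A7HANXX",   # run id plus flowcell id
            "flowcell_id": "BC6A7HANXX",
            "library_id": "LIB001",
            "lane_id": "1",
            "rg_id": None,
            "fq1": "/path/to/LIB001_R1.fastq.gz",
            "fq2": "/path/to/LIB001_R2.fastq.gz",
        },
    }
    samples = {"SAMPLE1": list(readunits.keys())}
    return samples, readunits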
def get_lib_details(run_num_flowcell, mux_list, testing):
    """Lib info collection from ELM per run"""
    _, run_num, flowcellid = get_machine_run_flowcell_id(run_num_flowcell)
    # call rest service to get component libraries
    if testing:
        print(run_num)
        rest_url = rest_services['run_details']['testing'].replace("run_num", run_num)
        logger.info("development server")
    else:
        rest_url = rest_services['run_details']['production'].replace("run_num", run_num)
        logger.info("production server")
    response = requests.get(rest_url)
    if response.status_code != requests.codes.ok:
        response.raise_for_status()
    rest_data = response.json()
    logger.debug("rest_data from %s: %s", rest_url, rest_data)
    sample_info = {}
    if rest_data.get('runId') is None:
        logger.info("JSON data is empty for run num %s", run_num)
        return sample_info
    for mux_id, out_dir in mux_list:
        fastq_data_dir = os.path.join(out_dir[0], 'out', "Project_" + mux_id)
        if os.path.exists(fastq_data_dir):
            for rows in rest_data['lanes']:
                if mux_id in rows['libraryId']:
                    if "MUX" in rows['libraryId']:
                        for child in rows['Children']:
                            if child['Analysis'] != "Sequence only":
                                ctime, _ = generate_window(1)
                                sample_dict = {}
                                sample = child['libraryId']
                                sample_dict['requestor'] = rows['requestor']
                                sample_dict['ctime'] = ctime
                                sample_dict['pipeline_name'] = child['Analysis']
                                if 'pipeline_version' in rows:
                                    sample_dict['pipeline_version'] = child['pipeline_version']
                                else:
                                    sample_dict['pipeline_version'] = None
                                sample_dict['pipeline_params'] = 'params'
                                sample_dict['site'] = get_site()
                                out_dir = get_downstream_outdir(
                                    sample_dict['requestor'],
                                    sample_dict['pipeline_version'],
                                    sample_dict['pipeline_name'])
                                sample_dict['out_dir'] = out_dir
                                readunits_dict = {}
                                status, fq1, fq2 = check_fastq(fastq_data_dir, child['libraryId'],
                                                               rows['laneId'])
                                if status:
                                    ru = ReadUnit(run_num_flowcell, flowcellid, child['libraryId'],
                                                  rows['laneId'], None, fq1, fq2)
                                    k = key_for_read_unit(ru)
                                    readunits_dict[k] = dict(ru._asdict())
                                    sample_dict['readunits'] = readunits_dict
                                if sample_info.get(sample, {}).get('readunits'):
                                    sample_info[sample]['readunits'].update(readunits_dict)
                                else:
                                    sample_info[sample] = sample_dict
                    else:
                        if rows['Analysis'] != "Sequence only":
                            sample = rows['libraryId']
                            status, fq1, fq2 = check_fastq(fastq_data_dir, rows['libraryId'],
                                                           rows['laneId'])
                            if status:
                                ctime, _ = generate_window(1)
                                sample_dict = {}
                                readunits_dict = {}
                                ru = ReadUnit(run_num_flowcell, flowcellid, rows['libraryId'],
                                              rows['laneId'], None, fq1, fq2)
                                k = key_for_read_unit(ru)
                                readunits_dict[k] = dict(ru._asdict())
                                sample_dict['readunits'] = readunits_dict
                                sample_info[sample] = sample_dict
    return sample_info
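# Usage sketch only: the run id and MUX id are made up. mux_list entries are
# (mux_id, out_dir) pairs where out_dir[0] holds the bcl2fastq output
# directory, matching how fastq_data_dir is built above.
def _example_lib_details():
    run_num_flowcell = "HS004-PE-R00139_BC6A7HANXX"
    mux_list = [("MUX1234", ["/path/to/bcl2fastq_out"])]
    sample_info = get_lib_details(run_num_flowcell, mux_list, testing=True)
    for sample, cfg in sample_info.items():
        print(sample, sorted(cfg.get('readunits', {})))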
def main():
    """main function"""
    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()))

    # generic args
    parser.add_argument('-c', "--config",
                        help="Config file (YAML) listing samples and readunits."
                        " Collides with -1, -2 and -s")
    parser.add_argument('-o', "--outdir", required=True,
                        help="Output directory (must not exist)")
    parser.add_argument('--no-mail', action='store_true',
                        help="Don't send mail on completion")
    site = get_site()
    default = DEFAULT_SLAVE_Q.get(site, None)
    parser.add_argument('-w', '--slave-q', default=default,
                        help="Queue to use for slave jobs (default: {})".format(default))
    default = DEFAULT_MASTER_Q.get(site, None)
    parser.add_argument('-m', '--master-q', default=default,
                        help="Queue to use for master job (default: {})".format(default))
    parser.add_argument('-n', '--no-run', action='store_true')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")

    # pipeline specific args
    parser.add_argument('-1', "--fq1", nargs="+",
                        help="FastQ file/s (gzip only)."
                        " Multiple input files supported (auto-sorted)."
                        " Note: each file (or pair) gets a unique read-group id."
                        " Collides with -c.")
    parser.add_argument('-2', "--fq2", nargs="+",
                        help="FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument('-s', "--sample",
                        help="Sample name. Collides with -c.")
    parser.add_argument('-C', "--cuffdiff", action='store_true', dest="run_cuffdiff",
                        help="Also run cuffdiff")
    parser.add_argument('-S', '--stranded', action='store_true',
                        help="Stranded library prep (default is unstranded)")
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.config:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal("Config file overrides fastq and sample input arguments."
                         " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.config):
            logger.fatal("Config file %s does not exist", args.config)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(args.config)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)
        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    # FIXME checks on reffa index (currently not exposed via args)

    # turn arguments into user_data that gets merged into pipeline config
    #
    # generic data first
    user_data = dict()
    user_data['mail_on_completion'] = not args.no_mail
    user_data['readunits'] = readunits
    user_data['samples'] = samples
    user_data['stranded'] = args.stranded
    user_data['run_cuffdiff'] = args.run_cuffdiff
    user_data['paired_end'] = any(ru.get('fq2') for ru in readunits.values())
    if user_data['paired_end']:
        assert all(ru.get('fq2') for ru in readunits.values()), (
            "Can't handle mix of paired-end and single-end")

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR,
        args.outdir, user_data,
        site=site,
        master_q=args.master_q,
        slave_q=args.slave_q)
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
def main():
    """main function"""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-n', "--dryrun", action='store_true',
                        help="Don't actually update DB (best used in conjunction with -v -v)")
    parser.add_argument('-t', "--testing", action='store_true',
                        help="Use MongoDB test-server. Don't do anything")
    default = 14
    parser.add_argument('-w', '--win', type=int, default=default,
                        help="Number of days to look back (default {})".format(default))
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every
    LOGGER.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if not is_production_user():
        LOGGER.warning("Not a production user. Exiting")
        sys.exit(1)

    connection = mongodb_conn(args.testing)
    if connection is None:
        sys.exit(1)
    #LOGGER.info("Database connection established")
    dbcol = connection.gisds.pipeline_runs
    site = get_site()
    epoch_now, epoch_then = generate_window(args.win)
    cursor = dbcol.find({"ctime": {"$gt": epoch_then, "$lt": epoch_now},
                         "site": site})
    LOGGER.info("Looping through {} jobs".format(cursor.count()))
    for job in cursor:
        dbid = job['_id']
        # only set here to avoid code duplication below
        try:
            out_dir = job['execution']['out_dir']
        except KeyError:
            out_dir = None
        # no execution dict means start a new analysis
        if not job.get('execution'):
            LOGGER.info('Job {} to be started'.format(dbid))
            # determine out_dir and set in DB.
            # out_dir_override takes precedence over generating out_dir via get_downstream_outdir
            if job.get('out_dir_override'):
                out_dir = job.get('out_dir_override')
                if os.path.exists(out_dir):
                    mux = os.path.basename(out_dir)
                    if not args.dryrun:
                        LOGGER.critical(
                            "Analysis for {} already exists under {}."
                            " Please start the analysis manually".format(mux, out_dir))
                        res = dbcol.update_one(
                            {"_id": ObjectId(dbid)},
                            {"$set": {"execution.status": "MANUAL"}})
                        assert res.modified_count == 1, (
                            "Modified {} documents instead of 1".format(res.modified_count))
                        sys.exit(1)
                #assert not os.path.exists(out_dir), ("Directory already exists {}").format(out_dir)
            else:
                out_dir = get_downstream_outdir(job['requestor'], job['pipeline_name'],
                                                job['pipeline_version'])
            # Note, since execution (key) exists, accidental double
            # starts are prevented even before start time etc is
            # logged via flagfiles. No active logging here so that
            # flag files logging just works.
            if args.dryrun:
                LOGGER.info("Skipping job start (dry run)")
                continue
            status = start_cmd_execution(job, site, out_dir, args.testing)
            if status:
                res = dbcol.update_one(
                    {"_id": ObjectId(dbid)},
                    {"$set": {"execution.out_dir": out_dir}})
                assert res.modified_count == 1, (
                    "Modified {} documents instead of 1".format(res.modified_count))
            else:
                LOGGER.warning("Job {} could not be started".format(dbid))
        elif job['execution'].get('status') == "MANUAL":
            continue
        elif list_starterflags(out_dir):
            # out_dir cannot be None because it's part of the execution dict
            LOGGER.info('Job {} in {} started but not yet logged as such in DB'.format(
                dbid, out_dir))
            matches = list_starterflags(out_dir)
            assert len(matches) == 1, (
                "Got several starter flags in {}".format(out_dir))
            sflag = StarterFlag(matches[0])
            assert sflag.dbid == str(dbid)
            set_started(dbcol, sflag.dbid, str(sflag.timestamp), dryrun=args.dryrun)
            os.unlink(sflag.filename)
        elif job['execution'].get('status') in ['STARTED', 'RESTART']:
            LOGGER.info('Job %s in %s set as re|started so checking on completion',
                        dbid, out_dir)
            set_completion_if(dbcol, dbid, out_dir, dryrun=args.dryrun)
        else:
            # job complete
            LOGGER.debug('Job %s in %s should be completed', dbid, out_dir)
    LOGGER.info("Successful program exit")
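# Illustration only: a minimal pipeline_runs document carrying the fields the
# loop above reads (ctime, site, requestor, pipeline_name, pipeline_version,
# out_dir_override, execution.*). All values are made up; ctime is whatever
# epoch value generate_window() compares against.
def _example_pipeline_runs_doc():
    return {
        "ctime": 1480000000000,          # must fall inside the query window
        "site": "GIS",                   # hypothetical site name
        "requestor": "userrig",
        "pipeline_name": "variant-calling-gatk",
        "pipeline_version": "1.0.0",
        #"out_dir_override": "/explicit/outdir",  # optional, takes precedence
        "execution": {                   # absent until the job has been started
            "out_dir": "/some/outdir",
            "status": "STARTED",         # later MANUAL, RESTART, or completed
        },
    }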
def main():
    """main function"""
    # FIXME ugly and code duplication in bcl2fastq_dbupdate.py
    mongo_status_script = os.path.abspath(os.path.join(
        os.path.dirname(sys.argv[0]), "mongo_status.py"))
    assert os.path.exists(mongo_status_script)

    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()))
    parser.add_argument('-r', "--runid",
                        help="Run ID plus flowcell ID (clashes with -d)")
    parser.add_argument('-d', "--rundir",
                        help="BCL input directory (clashes with -r)")
    parser.add_argument('-o', "--outdir",
                        help="Output directory (must not exist; required if called by user)")
    parser.add_argument('-t', "--testing", action='store_true',
                        help="Use MongoDB test server")
    parser.add_argument('--no-archive', action='store_true',
                        help="Don't archive this analysis")
    parser.add_argument('--no-mail', action='store_true',
                        help="Don't send mail on completion")
    site = get_site()
    default = DEFAULT_SLAVE_Q.get(site, None)
    parser.add_argument('-w', '--slave-q', default=default,
                        help="Queue to use for slave jobs (default: {})".format(default))
    default = DEFAULT_MASTER_Q.get(site, None)
    parser.add_argument('-m', '--master-q', default=default,
                        help="Queue to use for master job (default: {})".format(default))
    parser.add_argument('-l', '--lanes', type=int, nargs="*",
                        help="Limit run to given lane/s (multiples separated by space)")
    parser.add_argument('-i', '--mismatches', type=int,
                        help="Max. number of allowed barcode mismatches (0<=x<=2);"
                        " setting a value here overrides the default settings read from ELM")
    parser.add_argument('-n', '--no-run', action='store_true')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    if args.mismatches is not None:
        if args.mismatches > 2 or args.mismatches < 0:
            logger.fatal("Number of mismatches must be between 0-2")
            sys.exit(1)

    lane_info = ''
    lane_nos = []
    if args.lanes:
        lane_info = '--tiles '
        for lane in args.lanes:
            if lane > 8 or lane < 1:
                logger.fatal("Lane number must be between 1-8")
                sys.exit(1)
            else:
                lane_info += 's_{}'.format(lane) + ','
        lane_info = lane_info.rstrip()
        lane_info = lane_info[:-1]
        lane_nos = list(args.lanes)

    if args.runid and args.rundir:
        logger.fatal("Cannot use run-id and input directory arguments simultaneously")
        sys.exit(1)
    elif args.runid:
        rundir = run_folder_for_run_id(args.runid)
    elif args.rundir:
        rundir = os.path.abspath(args.rundir)
    else:
        logger.fatal("Need either run-id or input directory")
        sys.exit(1)
    if not os.path.exists(rundir):
        logger.fatal("Expected run directory {} does not exist".format(rundir))
    logger.info("Rundir is {}".format(rundir))

    if not args.outdir:
        outdir = get_bcl2fastq_outdir(args.runid)
    else:
        outdir = args.outdir
    if os.path.exists(outdir):
        logger.fatal("Output directory %s already exists", outdir)
        sys.exit(1)
    # create now so that generate_bcl2fastq_cfg.py can run
    os.makedirs(outdir)

    # catch cases where rundir was user provided and looks weird
    try:
        _, runid, flowcellid = get_machine_run_flowcell_id(rundir)
        run_num = runid + "_" + flowcellid
    except:
        run_num = "UNKNOWN-" + rundir.split("/")[-1]

    # call generate_bcl2fastq_cfg
    #
    # FIXME ugly assumes same directory (just like import above). better to import and run main()?
    generate_bcl2fastq = os.path.join(
        os.path.dirname(sys.argv[0]), "generate_bcl2fastq_cfg.py")
    assert os.path.exists(generate_bcl2fastq)
    cmd = [generate_bcl2fastq, '-r', rundir, '-o', outdir]
    if args.testing:
        cmd.append("-t")
    logger.debug("Executing {}".format(' '.join(cmd)))
    try:
        res = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logger.fatal("The following command failed with return code {}: {}".format(
            e.returncode, ' '.join(cmd)))
        logger.fatal("Output: {}".format(e.output.decode()))
        logger.fatal("Exiting")
        sys.exit(1)
    # generate_bcl2fastq is normally quiet. if there's output, make caller aware of it.
    # use sys instead of logger to avoid double logging
    if res:
        sys.stderr.write(res.decode())

    # just created files
    muxinfo_cfg = os.path.join(outdir, MUXINFO_CFG)
    samplesheet_csv = os.path.join(outdir, SAMPLESHEET_CSV)
    usebases_cfg = os.path.join(outdir, USEBASES_CFG)

    # NOTE: signal for failed runs is exit 0 from generate_bcl2fastq and missing output files
    #
    if any([not os.path.exists(x) for x in [muxinfo_cfg, samplesheet_csv, usebases_cfg]]):
        # one missing means all should be missing
        assert all([not os.path.exists(x) for x in [muxinfo_cfg, samplesheet_csv, usebases_cfg]])
        seqrunfailed(mongo_status_script, run_num, outdir, args.testing)
        sys.exit(0)

    # turn arguments into user_data that gets merged into pipeline config
    user_data = {'rundir': rundir,
                 'lanes_arg': lane_info,
                 'samplesheet_csv': samplesheet_csv,
                 'no_archive': args.no_archive,
                 'mail_on_completion': not args.no_mail,
                 'run_num': run_num}

    usebases_arg = ''
    with open(usebases_cfg, 'r') as stream:
        try:
            d = yaml.load(stream)
            assert 'usebases' in d
            assert len(d) == 1  # make sure usebases is the only key
            for ub in d['usebases']:
                #print(ub)
                usebases_arg += '--use-bases-mask {} '.format(ub)
            #user_data = {'usebases_arg' : usebases_arg}
        except yaml.YAMLError as exc:
            logger.fatal(exc)
            raise
    user_data['usebases_arg'] = usebases_arg
    os.unlink(usebases_cfg)

    mux_units = get_mux_units_from_cfgfile(muxinfo_cfg, lane_nos)
    if args.mismatches is not None:
        mux_units = [mu._replace(barcode_mismatches=args.mismatches)
                     for mu in mux_units]
    os.unlink(muxinfo_cfg)

    user_data['units'] = dict()
    for mu in mux_units:
        # special case: mux split across multiple lanes. make lanes a list
        # and add in extra lanes if needed.
        k = mu.mux_dir
        mu_dict = dict(mu._asdict())
        user_data['units'][k] = mu_dict

    # create mongodb update command, used later, after queueing
    mongo_update_cmd = "{} -r {} -s STARTED".format(mongo_status_script, user_data['run_num'])
    mongo_update_cmd += " -a $ANALYSIS_ID -o {}".format(outdir)  # set in run.sh
    if args.testing:
        mongo_update_cmd += " -t"

    # NOTE: bcl2fastq has a special run template, so we need to
    # interfere with the default pipeline_handler. plenty of
    # opportunity to shoot yourself in the foot
    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR, outdir, user_data,
        site=site, master_q=args.master_q, slave_q=args.slave_q)
    # use local run template
    pipeline_handler.run_template = os.path.join(
        PIPELINE_BASEDIR, "run.template.{}.sh".format(pipeline_handler.site))
    assert os.path.exists(pipeline_handler.run_template)
    pipeline_handler.setup_env()

    # final mongo update line in run_out
    tmp_run_out = pipeline_handler.run_out + ".tmp"
    with open(pipeline_handler.run_out) as fh_in, \
         open(tmp_run_out, 'w') as fh_out:
        for line in fh_in:
            line = line.replace("@MONGO_UPDATE_CMD@", mongo_update_cmd)
            fh_out.write(line)
    shutil.move(tmp_run_out, pipeline_handler.run_out)

    pipeline_handler.submit(args.no_run)
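# Illustration only: the usebases YAML shape expected by the parsing code above
# and the bcl2fastq argument string built from it. The mask values are made up.
def _example_usebases_arg():
    d = {'usebases': ['Y151,I8,Y151', 'Y151,I8,I8,Y151']}
    usebases_arg = ''
    for ub in d['usebases']:
        usebases_arg += '--use-bases-mask {} '.format(ub)
    # -> "--use-bases-mask Y151,I8,Y151 --use-bases-mask Y151,I8,I8,Y151 "
    return usebases_arg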
def main():
    """main function"""
    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()))

    # generic args
    parser.add_argument('-o', "--outdir", required=True,
                        help="Output directory (must not exist)")
    parser.add_argument('--name',
                        help="Give this analysis run a name (used in email and report)")
    parser.add_argument('--no-mail', action='store_true',
                        help="Don't send mail on completion")
    site = get_site()
    default = DEFAULT_SLAVE_Q.get(site, None)
    parser.add_argument('-w', '--slave-q', default=default,
                        help="Queue to use for slave jobs (default: {})".format(default))
    default = DEFAULT_MASTER_Q.get(site, None)
    parser.add_argument('-m', '--master-q', default=default,
                        help="Queue to use for master job (default: {})".format(default))
    parser.add_argument('-n', '--no-run', action='store_true')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")
    cfg_group = parser.add_argument_group('Configuration files (advanced)')
    cfg_group.add_argument('--prev-cfg',
                           help="Previously used config. Also used to infer path to precalculated BAM files")
    for name, descr in [("references", "reference sequences"),
                        ("params", "parameters"),
                        ("modules", "modules")]:
        default = os.path.abspath(os.path.join(CFG_DIR, "{}.yaml".format(name)))
        cfg_group.add_argument('--{}-cfg'.format(name), default=default,
                               help="Config-file (yaml) for {}. (default: {})".format(descr, default))

    # pipeline specific args
    #parser.add_argument('-1', "--fq1", nargs="+",
    #                    help="FastQ file/s (gzip only)."
    #                    " Multiple input files supported (auto-sorted)."
    #                    " Note: each file (or pair) gets a unique read-group id."
    #                    " Collides with --sample-cfg.")
    #parser.add_argument('-2', "--fq2", nargs="+",
    #                    help="FastQ file/s (if paired) (gzip only). See also --fq1")
    #parser.add_argument('-s', "--sample",
    #                    help="Sample name. Collides with --sample-cfg.")
    #parser.add_argument('-t', "--seqtype", required=True,
    #                    choices=['WGS', 'WES', 'targeted'],
    #                    help="Sequencing type")
    #parser.add_argument('-l', "--intervals",
    #                    help="Intervals file (e.g. bed file) listing regions of interest."
    #                    " Required for WES and targeted sequencing.")
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    #if args.sample_cfg:
    #    if any([args.fq1, args.fq2, args.sample]):
    #        logger.fatal("Config file overrides fastq and sample input arguments."
    #                     " Use one or the other")
    #        sys.exit(1)
    #    if not os.path.exists(args.sample_cfg):
    #        logger.fatal("Config file %s does not exist", args.sample_cfg)
    #        sys.exit(1)
    #    samples, readunits = get_samples_and_readunits_from_cfgfile(args.sample_cfg)
    #else:
    #    if not all([args.fq1, args.sample]):
    #        logger.fatal("Need at least fq1 and sample without config file")
    #        sys.exit(1)
    #
    #    readunits = get_readunits_from_args(args.fq1, args.fq2)
    #    # all readunits go into this one sample specified on the command-line
    #    samples = dict()
    #    samples[args.sample] = list(readunits.keys())
    #
    #if args.seqtype in ['WES', 'targeted']:
    #    if not args.intervals:
    #        logger.fatal("Analysis of exome and targeted sequence runs requires a bed file")
    #        sys.exit(1)
    #    else:
    #        if not os.path.exists(args.intervals):
    #            logger.fatal("Intervals file %s does not exist", args.sample_cfg)
    #            sys.exit(1)
    #        logger.warning("Compatibility between interval file and"
    #                       " reference not checked")# FIXME

    with open(args.prev_cfg, 'r') as stream:
        try:
            prev_cfg = yaml.load(stream)
        except yaml.YAMLError as exc:
            logger.fatal("Error loading %s", args.prev_cfg)
            raise
    #import pdb; pdb.set_trace()
    #sys.stderr.write("TMP DEBUG {}\n".format(prev_cfg))

    # turn arguments into user_data that gets merged into pipeline config
    #
    # generic data first
    user_data = dict()
    user_data['mail_on_completion'] = not args.no_mail
    #user_data['readunits'] = prev_cfg['readunits']
    user_data['readunits'] = dict()  # None won't work
    #user_data['samples'] = samples
    user_data['samples'] = prev_cfg['samples']
    if args.name:
        user_data['analysis_name'] = args.name
    #user_data['seqtype'] = args.seqtype
    user_data['seqtype'] = 'WGS'  # SG10K
    #user_data['intervals'] = args.intervals# always safe, might be used for WGS as well
    user_data['intervals'] = None  # SG10K
    user_data['mark_dups'] = None  # SG10K doesn't matter

    user_data['precalc_bam_dir'] = os.path.join(
        os.path.abspath(os.path.dirname(args.prev_cfg)), "out")

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR,
        args.outdir, user_data,
        site=site,
        master_q=args.master_q,
        slave_q=args.slave_q,
        params_cfgfile=args.params_cfg,
        modules_cfgfile=args.modules_cfg,
        refs_cfgfile=args.references_cfg,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
def main(toaddr):
    subject = "Test email from {} version {}".format(get_site(), get_pipeline_version())
    body = "Email wursts.\n\nSincerely,\nRPD"
    send_mail(subject, body, toaddr=toaddr, ccaddr=None, pass_exception=False)