def pipeline(dataset):
    """
    Decide which pipeline to run on a dataset, and run luigi.build.
    :param analysis_driver.dataset.Dataset dataset:
    """
    luigi.interface.setup_interface_logging.has_run = True  # turn off Luigi's default logging setup
    log_cfg.get_logger('luigi-interface', 20)  # calling log_cfg.get_logger is enough to register the luigi-interface logger

    dataset.resolve_pipeline_and_toolset()
    dataset.start()
    final_stage = dataset.pipeline.build_pipeline(dataset)

    luigi_params = {
        'tasks': [final_stage],
        'local_scheduler': cfg.query('luigi', 'local_scheduler'),
        'workers': cfg.query('luigi', 'max_parallel_jobs', ret_default=4)
    }
    if luigi_params['local_scheduler'] is not True:
        luigi_params['scheduler_url'] = cfg['luigi']['scheduler_url']

    success = luigi.build(**luigi_params)

    # if any exception occurred during the pipeline, re-raise it here
    dataset.raise_exceptions()

    return 0 if success is True else 9
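# A minimal sketch of the cfg.query behaviour that pipeline() (and the rest of this
# section) relies on. MiniConfig is a stand-in for illustration, not the project's
# real config object: query() walks nested keys and returns ret_default when any
# level is missing.
class MiniConfig:
    def __init__(self, content):
        self.content = content

    def query(self, *parts, ret_default=None):
        level = self.content
        for p in parts:
            if not isinstance(level, dict) or p not in level:
                return ret_default
            level = level[p]
        return level

    def __getitem__(self, item):
        return self.content[item]


_cfg = MiniConfig({'luigi': {'local_scheduler': True}})
assert _cfg.query('luigi', 'local_scheduler') is True
assert _cfg.query('luigi', 'max_parallel_jobs', ret_default=4) == 4  # missing key -> default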
def __init__(self, dataset, window_size=60, tile_quality_threshold=None, cycle_quality_threshold=None):
    self.dataset = dataset
    self.run_dir = dataset.input_dir
    self.tile_ids = dataset.run_info.tiles
    self.ncycles = sum(Reads.num_cycles(r) for r in dataset.run_info.reads.reads)
    self.window_size = window_size
    self.all_lanes = None
    # explicit arguments win; otherwise fall back to config, then to hard defaults
    self.tile_quality_threshold = tile_quality_threshold or cfg.query(
        'fastq_filterer', 'tile_quality_threshold', ret_default=20
    )
    self.cycle_quality_threshold = cycle_quality_threshold or cfg.query(
        'fastq_filterer', 'cycle_quality_threshold', ret_default=18
    )
    self.read_interop_metrics()
def get_genome_version(sample_id, species=None):
    s = get_sample(sample_id)
    if not s:
        return None

    genome_version = s.udf.get('Genome Version', None)
    if not genome_version and species:
        return cfg.query('species', species, 'default')
    return genome_version
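# Behavioural sketch of the fallback order above, with get_sample and cfg stubbed
# out. The genome names and the species default here are assumptions for
# illustration only.
def _genome_version_sketch(udf_value, species=None, species_defaults=None):
    # udf_value stands in for s.udf.get('Genome Version'); species_defaults for
    # cfg.query('species', <species>, 'default')
    species_defaults = species_defaults or {'Homo sapiens': 'hg38'}
    if not udf_value and species:
        return species_defaults.get(species)
    return udf_value


assert _genome_version_sketch('GRCh37') == 'GRCh37'            # LIMS UDF wins
assert _genome_version_sketch(None, 'Homo sapiens') == 'hg38'  # config fallback
assert _genome_version_sketch(None) is None                    # nothing known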
def __init__(self, *cmds, prelim_cmds=None, **cluster_config):
    """
    :param cmds: Shell commands to be written into the job submission script
    :param prelim_cmds: Any commands to run before the main job array
    :param cluster_config: Keyword arguments passed through to the script writer
    """
    self.interval = cfg.query('executor', 'join_interval', ret_default=30)
    self.job_id = None
    self.job_name = cluster_config['job_name']
    self.cmds = cmds
    self.prelim_cmds = prelim_cmds
    self.writer = self.script_writer(**cluster_config)
def execute(*cmds, env=None, prelim_cmds=None, **cluster_config):
    if env is None:
        env = cfg.query('executor', 'job_execution')

    if env == 'local':
        return local_execute(*cmds)
    else:
        return cluster_execute(*cmds, env=env, prelim_cmds=prelim_cmds, **cluster_config)
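# Dispatch sketch mirroring execute(): with no env argument it falls back to the
# configured executor.job_execution and then branches on it. The configured default
# of 'slurm' here is an assumption for illustration, and the usage example below
# uses a made-up command, mirroring the kwargs seen in the _run() caller later in
# this section, e.g.:
#     execute('gzip -d sample.fastq.gz', job_name='unzip', working_dir='/tmp/jobs', cpus=1, mem=2).join()
def _dispatch_sketch(env=None, configured_env='slurm'):
    env = env or configured_env
    return 'local' if env == 'local' else 'cluster'


assert _dispatch_sketch('local') == 'local'
assert _dispatch_sketch() == 'cluster'  # no env given -> use the configured 'slurm'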
def write_script(self):
    if self.prelim_cmds:
        self.writer.register_cmds(*self.prelim_cmds, parallel=False)

    pre_job_source = cfg.query('executor', 'pre_job_source')
    if pre_job_source:
        self.writer.register_cmd('source ' + pre_job_source)

    self.writer.line_break()
    self.writer.register_cmds(*self.cmds, parallel=True)
    self.writer.add_header()
    self.writer.save()
def cluster_execute(*cmds, env=None, prelim_cmds=None, **cluster_config):
    """
    Execute commands on a compute cluster.
    :param cmds: Shell commands to run
    :param env: The kind of resource manager being run
    :param prelim_cmds: Any commands to execute before starting a job array
    :param cluster_config: Keyword arguments passed through to the executor
    :return: ClusterExecutor
    """
    env = env or cfg.query('executor', 'job_execution')
    if env == 'slurm':
        cls = SlurmExecutor
    else:
        raise EGCGError('Unknown execution environment: %s' % env)

    e = cls(*cmds, prelim_cmds=prelim_cmds, **cluster_config)
    e.start()
    return e
def _run(self):
    # Assess which lanes need tile/cycle filtering, based on their %Q30
    q30_threshold = float(cfg.query('fastq_filterer', 'q30_threshold', ret_default=74))
    self.info('Q30 threshold: %s', q30_threshold)

    filter_lanes = {lane: False for lane in range(1, 9)}
    for lane in self.dataset.lane_metrics:
        if q30_threshold > float(util.query_dict(lane, 'aggregated.pc_q30', ret_default=0)) > 0:
            self.warning(
                'Will apply cycle and tile filtering to lane %s: %%Q30=%s < %s',
                lane['lane_number'], lane['aggregated']['pc_q30'], q30_threshold
            )
            filter_lanes[int(lane['lane_number'])] = True

    try:
        detector = BadTileCycleDetector(self.dataset)
        bad_tiles = detector.detect_bad_tiles()
        bad_cycles = detector.detect_bad_cycles()
    except Exception as e:
        self.error(e)
        bad_tiles = {}
        bad_cycles = {}

    cmds = []
    for lane in filter_lanes:
        fq_pairs = find_all_fastq_pairs_for_lane(self.fastq_dir, lane)

        kwargs = {}
        if filter_lanes[lane]:
            # trim_r1 is computed but not used: only R2 is trimmed
            trim_r1, trim_r2 = get_trim_values_for_bad_cycles(bad_cycles.get(lane), self.dataset.run_info)
            kwargs = {'tiles_to_filter': bad_tiles.get(lane), 'trim_r2': trim_r2}

        for fqs in fq_pairs:
            # derive the phiX read-name list path from the R1 fastq path
            read_name_list = fqs[0][:-len('_R1_001.fastq.gz')] + '_phix_read_name.list'
            cmds.append(bash_commands.fastq_filterer(fqs, read_name_list, **kwargs))

    return executor.execute(
        *cmds,
        prelim_cmds=[bash_commands.fq_filt_prelim_cmd()],
        job_name='fastq_filterer',
        working_dir=self.job_dir,
        cpus=18,
        mem=10
    ).join()
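# Filename sketch for the read-name list derivation above: the '_R1_001.fastq.gz'
# suffix of the R1 fastq is swapped for '_phix_read_name.list'. The run and sample
# names here are made up.
_r1 = '/ifs/run1/Sample_X/X_S1_L001_R1_001.fastq.gz'
assert _r1[:-len('_R1_001.fastq.gz')] + '_phix_read_name.list' == \
    '/ifs/run1/Sample_X/X_S1_L001_phix_read_name.list'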
def _req(self, method, url, quiet=False, retries=5, **kwargs):
    # Can't upload json and files at the same time, so we need to move the json
    # parameter to data. However, data can't carry complex structures that would
    # require json encoding, which means we can't upload data with nested
    # lists/dicts at the same time as files.
    if kwargs.get('files') and kwargs.get('json'):
        if check_if_nested(kwargs.get('json')):
            raise RestCommunicationError('Cannot upload files and nested json in one query')
        kwargs['data'] = kwargs.pop('json')

    try:
        with self.lock:
            r = self.session.request(method, url, **kwargs)
    except Exception as e:
        if retries > 0:
            self.warning('Encountered a %s exception. %s retries remaining', str(e), retries)
            sleep(cfg.query('rest_api', 'retry_interval', ret_default=1))
            return self._req(method, url, quiet, retries - 1, **kwargs)
        else:
            raise

    kwargs.pop('files', None)
    # e.g: 'POST <url> ({"some": "args"}) -> {"some": "content"}. Status code 201. Reason: CREATED'
    report = '%s %s (%s) -> %s. Status code %s. Reason: %s' % (
        r.request.method, r.request.path_url, kwargs, r.content.decode('utf-8'), r.status_code, r.reason
    )

    if r.status_code in self.successful_statuses:
        if not quiet:
            self.debug(report)
        return r
    else:
        self.error(report)
        raise RestCommunicationError('Encountered a %s status code: %s' % (r.status_code, r.reason))
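# Retry sketch mirroring the recursion above: on any exception, sleep for the
# configured interval and recurse with retries - 1, re-raising once retries is
# exhausted. _flaky() is a made-up stand-in for self.session.request.
from time import sleep

def _req_sketch(request_func, retries=5, retry_interval=0):
    try:
        return request_func()
    except Exception:
        if retries > 0:
            sleep(retry_interval)
            return _req_sketch(request_func, retries - 1, retry_interval)
        raise


_attempts = []

def _flaky():
    _attempts.append(1)
    if len(_attempts) < 3:
        raise ConnectionError('transient failure')
    return 'ok'


assert _req_sketch(_flaky) == 'ok' and len(_attempts) == 3  # succeeded on the third try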
def __init__(self):
    self.input_dir = cfg.query(self.type, 'input_dir')
    self.__triggerignore = None