def find_projects_from_samples(sample_list):
    """Given a list of samples, attempts to determine which projects they
    belong to using Charon records.

    :param list sample_list: A list of the samples for which to find projects

    :returns: a dict of {project_id: set(samples)}
    :rtype: dict of sets

    :raises ValueError: If you fail to pass in a list. Nice work!
    """
    STHLM_SAMPLE_RE = re.compile(r'(P\d{4})_')
    projects_dict = collections.defaultdict(set)
    no_owners_found = set()
    multiple_owners_found = set()
    charon_session = CharonSession()
    # isinstance is the idiomatic type check (was: "not type(x) is list")
    if not isinstance(sample_list, list):
        raise ValueError("Input should be list.")
    for sample_name in sample_list:
        # First see if we can just parse out the project id from the sample name
        m = STHLM_SAMPLE_RE.match(sample_name)
        if m:
            project_id = m.group(1)
            try:
                # Ensure that we guessed right
                charon_session.sample_get(project_id, sample_name)
            except CharonError as e:
                LOG.debug('Project for sample "{}" appears to be "{}" but is not '
                          'present in Charon ({})'.format(sample_name, project_id, e))
                no_owners_found.add(sample_name)
            else:
                projects_dict[project_id].add(sample_name)
        else:
            # Otherwise check all the projects for matching samples (returns list or None)
            owner_projects_list = charon_session.sample_get_projects(sample_name)
            if not owner_projects_list:
                no_owners_found.add(sample_name)
            elif len(owner_projects_list) > 1:
                multiple_owners_found.add(sample_name)
            else:
                projects_dict[owner_projects_list[0]].add(sample_name)
    if no_owners_found:
        LOG.warn("No projects found for the following samples: {}".format(", ".join(no_owners_found)))
    if multiple_owners_found:
        # FIX: corrected typo "unamibugously" in the log message
        LOG.warn('Multiple projects found with the following samples (owner '
                 'could not be unambiguously determined): {}'.format(", ".join(multiple_owners_found)))
    return dict(projects_dict)
def add_supr_name_delivery_in_charon(self, supr_name_of_delivery):
    '''Updates delivery_projects in Charon at sample level.

    :param str supr_name_of_delivery: SUPR delivery project name to append to
        this sample's ``delivery_projects`` list in Charon (skipped if the
        value is already present).
    '''
    charon_session = CharonSession()
    try:
        # Fetch the sample record
        sample_charon = charon_session.sample_get(self.projectid, self.sampleid)
        delivery_projects = sample_charon['delivery_projects']
        # BUG FIX: membership must be tested against the delivery_projects
        # list; the old code checked the whole sample record (whose members
        # are field names), so duplicates were always appended.
        if supr_name_of_delivery not in delivery_projects:
            delivery_projects.append(supr_name_of_delivery)
            charon_session.sample_update(
                self.projectid,
                self.sampleid,
                delivery_projects=delivery_projects)
            logger.info(
                'Charon delivery_projects for sample {} updated with value {}'
                .format(self.sampleid, supr_name_of_delivery))
        else:
            logger.warn(
                'Charon delivery_projects for sample {} not updated with value {} because the value was already present'
                .format(self.sampleid, supr_name_of_delivery))
    # FIX: "except Exception, e" is Python-2-only syntax; "as e" works on 2.6+
    except Exception as e:
        # Best-effort: log and swallow so the delivery run continues
        logger.error(
            'Failed to update delivery_projects in charon while delivering {}. Error says: {}'
            .format(self.sampleid, e))
        logger.exception(e)
def analyze(analysis_object, config=None, config_file_path=None):
    """Launch the RNA-seq (rna_ngi) pipeline for the given analysis object.

    Reads the sequencing facility and reference genome from Charon, collects
    fastq paths for all analyzable seqruns (index files excluded), then
    preprocesses and submits a batch job.

    :param analysis_object: object wrapping the NGIProject and launch flags
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)

    :raises RuntimeError: If the Charon project has an unknown sequencing facility
    """
    charon_session = CharonSession()
    charon_pj = charon_session.project_get(analysis_object.project.project_id)
    reference_genome = charon_pj.get('reference')
    if charon_pj.get("sequencing_facility") == "NGI-S":
        analysis_object.sequencing_facility = "sthlm"
    elif charon_pj.get("sequencing_facility") == "NGI-U":
        analysis_object.sequencing_facility = "upps"
    else:
        LOG.error("charon project not registered with stockholm or uppsala. "
                  "Which config file should we use for the RNA pipeline ?")
        raise RuntimeError
    fastq_files = []
    if reference_genome and reference_genome != 'other':
        for sample in analysis_object.project:
            try:
                charon_reported_status = charon_session.sample_get(
                    analysis_object.project.project_id, sample).get('analysis_status')
                # Check Charon to ensure this hasn't already been processed
                do_analyze = handle_sample_status(analysis_object, sample, charon_reported_status)
                if not do_analyze:
                    continue
            except CharonError as e:
                LOG.error(e)
            for libprep in sample:
                charon_lp_status = charon_session.libprep_get(
                    analysis_object.project.project_id, sample.name,
                    libprep.name).get('qc')
                do_analyze = handle_libprep_status(analysis_object, libprep, charon_lp_status)
                if not do_analyze:
                    continue
                for seqrun in libprep:
                    charon_sr_status = charon_session.seqrun_get(
                        analysis_object.project.project_id, sample.name,
                        libprep.name, seqrun.name).get('alignment_status')
                    do_analyze = handle_seqrun_status(analysis_object, seqrun, charon_sr_status)
                    if not do_analyze:
                        continue
                    seqrun.being_analyzed = True
                    sample.being_analyzed = sample.being_analyzed or True
                    # filter out index files from analysis
                    for fastq_file in filter(lambda f: not is_index_file(f), seqrun.fastq_files):
                        fastq_path = os.path.join(analysis_object.project.base_path,
                                                  "DATA",
                                                  analysis_object.project.project_id,
                                                  sample.name, libprep.name,
                                                  seqrun.name, fastq_file)
                        fastq_files.append(fastq_path)
    if not fastq_files:
        # FIX: corrected typo "fo" -> "of" in the error message
        LOG.error("No fastq files obtained for the analysis of project {}, "
                  "please check the Charon status.".format(analysis_object.project.name))
    else:
        if analysis_object.restart_running_jobs:
            stop_ongoing_analysis(analysis_object)
        fastq_dir = preprocess_analysis(analysis_object, fastq_files)
        sbatch_path = write_batch_job(analysis_object, reference_genome, fastq_dir)
        job_id = start_analysis(sbatch_path)
        analysis_path = os.path.join(analysis_object.project.base_path,
                                     "ANALYSIS",
                                     analysis_object.project.project_id,
                                     'rna_ngi')
        record_project_job(analysis_object.project, job_id, analysis_path)
def analyze_sample(project, sample, config=None, config_file_path=None):
    """Analyze data at the sample level.

    :param NGIProject project: the project to analyze
    :param NGISample sample: the sample to analyzed
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)
    """
    modules_to_load = ["java/sun_jdk1.7.0_25", "R/2.15.0"]
    load_modules(modules_to_load)
    charon_session = CharonSession()
    # Determine if we can begin sample-level processing yet.
    # Condition is that the total autosomal coverage exceeds 28.4X
    # (NOTE(review): an earlier comment said 28.9X -- confirm intended threshold).
    # If these conditions become more complex we can create a function for this
    sample_total_autosomal_coverage = charon_session.sample_get(
        project.project_id, sample.name).get('total_autosomal_coverage')
    if sample_total_autosomal_coverage > 28.4:
        LOG.info('Sample "{}" in project "{}" is ready for processing.'.format(sample, project))
        for workflow_subtask in get_subtasks_for_level(level="sample"):
            if not is_sample_analysis_running_local(workflow_subtask=workflow_subtask,
                                                    project_id=project.project_id,
                                                    sample_id=sample.name):
                try:
                    ## Temporarily logging to a file until we get ELK set up
                    log_file_path = create_log_file_path(workflow_subtask=workflow_subtask,
                                                         project_base_path=project.base_path,
                                                         project_name=project.name,
                                                         sample_id=sample.name)
                    rotate_log(log_file_path)
                    # Store the exit code of detached processes
                    exit_code_path = create_exit_code_file_path(workflow_subtask=workflow_subtask,
                                                                project_base_path=project.base_path,
                                                                project_name=project.name,
                                                                sample_id=sample.name)
                    build_setup_xml(project, config, sample)
                    command_line = build_piper_cl(project, workflow_subtask, exit_code_path, config)
                    p_handle = launch_piper_job(command_line, project, log_file_path)
                    try:
                        record_process_sample(project=project, sample=sample,
                                              workflow_subtask=workflow_subtask,
                                              analysis_module_name="piper_ngi",
                                              analysis_dir=project.analysis_dir,
                                              pid=p_handle.pid)
                    except RuntimeError as e:
                        LOG.error(e)
                        continue
                except (NotImplementedError, RuntimeError) as e:
                    # FIX: repr(e) is the idiomatic spelling of e.__repr__()
                    error_msg = ('Processing project "{}" / sample "{}" failed: '
                                 '{}'.format(project, sample, repr(e)))
                    LOG.error(error_msg)
    else:
        LOG.info('Sample "{}" in project "{}" is not yet ready for '
                 'processing.'.format(sample, project))
def update_gt_status_in_charon(sample_id, status, concordance=None):
    """Record a sample's genotype status (and optional concordance) in Charon.

    Only issues a Charon update when the stored value(s) differ from the
    requested ones. On Charon failure, returns the error message as a string;
    otherwise returns None.
    """
    project_id = sample_id.split('_')[0]
    try:
        session = CharonSession()
        sample_record = session.sample_get(project_id, sample_id)
        if concordance is None:
            # Only the status field is being managed
            if sample_record.get('genotype_status') != status:
                session.sample_update(projectid=project_id,
                                      sampleid=sample_id,
                                      genotype_status=status)
        else:
            status_differs = sample_record.get('genotype_status') != status
            concordance_differs = sample_record.get('genotype_concordance') != concordance
            if status_differs or concordance_differs:
                session.sample_update(projectid=project_id,
                                      sampleid=sample_id,
                                      genotype_status=status,
                                      genotype_concordance=concordance)
    except CharonError as e:
        return str(e)
def main(inbox=None, num_days=14, genotype_files=None, config=None, config_file_path=None):
    """Mark samples found in recent genotype VCF files as genotype AVAILABLE in Charon.

    :param str inbox: Delivery inbox to scan (optional; falls back to config)
    :param int num_days: Only consider files modified within this many days
    :param list genotype_files: Explicit genotype files to process (bypasses scanning)
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)

    :raises ValueError: If no inbox is given by argument or configuration
    """
    if genotype_files:
        gt_files_valid = [os.path.abspath(gt_file) for gt_file in genotype_files]
    else:
        if inbox:
            # BUG FIX: a caller-supplied inbox previously left "inboxes"
            # undefined, raising NameError in the loop below.
            inboxes = [inbox]
        else:
            try:
                inboxes = config["environment"]["flowcell_inbox"]
            except (KeyError, TypeError):
                raise ValueError("No path to delivery inbox specified by argument "
                                 "or in configuration file ({}). Exiting.".format(config_file_path))
        # Convert to seconds
        cutoff_age = time.time() - (int(num_days) * 24 * 60 * 60)
        gt_files_valid = []
        for inbox in inboxes:
            inbox = os.path.abspath(inbox)
            LOG.info("Searching for genotype files under {} modified after "
                     "{}".format(inbox, time.ctime(cutoff_age)))
            for gt_file in filter(GENOTYPE_FILE_RE.match, glob.glob(os.path.join(inbox, "*"))):
                # BUG FIX: compare the mtime to the cutoff timestamp itself;
                # "time.time() - cutoff_age" is a duration (~num_days of
                # seconds past the epoch), so the old test matched every file.
                if os.stat(gt_file).st_mtime > cutoff_age:
                    gt_files_valid.append(os.path.abspath(gt_file))
    if not gt_files_valid:
        LOG.info("No genotype files found under {} newer than "
                 "{}".format(inbox, time.ctime(cutoff_age)))
    else:
        charon_session = CharonSession()
        for gt_file_path in gt_files_valid:
            project_samples_dict = \
                find_projects_from_samples(parse_samples_from_vcf(gt_file_path))
            for project_id, samples in project_samples_dict.iteritems():
                LOG.info("Updating project {}...".format(project_id))
                for sample in samples:
                    try:
                        genotype_status = \
                            charon_session.sample_get(projectid=project_id,
                                                      sampleid=sample).get("genotype_status")
                        if genotype_status in (None, "NOT_AVAILABLE"):
                            LOG.info('Updating sample {} genotype_status '
                                     'to "AVAILABLE"...'.format(sample))
                            charon_session.sample_update(projectid=project_id,
                                                         sampleid=sample,
                                                         genotype_status="AVAILABLE")
                        else:
                            LOG.info('Not updating sample {} genotype_status '
                                     '(already "{}")'.format(sample, genotype_status))
                    except CharonError as e:
                        LOG.error('Could not update genotype status to "AVAILABLE" '
                                  'for project/sample "{}/{}": {}'.format(project_id, sample, e))
def add_supr_name_delivery_in_charon(self, supr_name_of_delivery):
    '''Updates delivery_projects in Charon at sample level.

    Appends ``supr_name_of_delivery`` to this sample's delivery_projects list
    in Charon unless it is already present.
    '''
    charon_session = CharonSession()
    try:
        # Fetch the sample record
        sample_charon = charon_session.sample_get(self.projectid, self.sampleid)
        delivery_projects = sample_charon['delivery_projects']
        # BUG FIX: check membership in the delivery_projects list, not in the
        # sample record dict (whose members are field names) -- the old test
        # meant the duplicate-suppression branch never triggered.
        if supr_name_of_delivery not in delivery_projects:
            delivery_projects.append(supr_name_of_delivery)
            charon_session.sample_update(self.projectid, self.sampleid, delivery_projects=delivery_projects)
            logger.info('Charon delivery_projects for sample {} updated with value {}'.format(self.sampleid, supr_name_of_delivery))
        else:
            logger.warn('Charon delivery_projects for sample {} not updated with value {} because the value was already present'.format(self.sampleid, supr_name_of_delivery))
    # FIX: "except Exception, e" is Python-2-only syntax; "as e" works on 2.6+
    except Exception as e:
        logger.error('Failed to update delivery_projects in charon while delivering {}. Error says: {}'.format(self.sampleid, e))
        logger.exception(e)
def add_dds_name_delivery_in_charon(self, name_of_delivery):
    """Updates delivery_projects in Charon at sample level.

    :param str name_of_delivery: DDS delivery project name to append to this
        sample's ``delivery_projects`` list in Charon (skipped if already
        present).
    """
    charon_session = CharonSession()
    try:
        # Fetch the sample record
        sample_charon = charon_session.sample_get(self.projectid, self.sampleid)
        delivery_projects = sample_charon['delivery_projects']
        # BUG FIX: membership must be tested against the delivery_projects
        # list; testing against the sample record dict (whose members are
        # field names) meant duplicates were always appended.
        if name_of_delivery not in delivery_projects:
            delivery_projects.append(name_of_delivery)
            charon_session.sample_update(self.projectid, self.sampleid,
                                         delivery_projects=delivery_projects)
            logger.info('Charon delivery_projects for sample {} updated '
                        'with value {}'.format(self.sampleid, name_of_delivery))
        else:
            logger.warn('Charon delivery_projects for sample {} not updated '
                        'with value {} because the value was already present'.format(self.sampleid, name_of_delivery))
    except Exception:
        # Best-effort: log (with traceback) and swallow so delivery continues
        logger.exception('Failed to update delivery_projects in charon while delivering {}.'.format(self.sampleid))
def update_gt_status_in_charon(sample_id, status, concordance=None):
    """Push a sample's genotyping status (and optional concordance) to Charon.

    The project id is derived from the sample id prefix. An update call is
    issued only when the stored values differ from the requested ones.
    Returns the Charon error message string on failure, otherwise None.
    """
    project_id = sample_id.split('_')[0]
    try:
        charon_session = CharonSession()
        sample = charon_session.sample_get(project_id, sample_id)
        update_kwargs = {'genotype_status': status}
        needs_update = sample.get('genotype_status') != status
        if concordance is not None:
            update_kwargs['genotype_concordance'] = concordance
            needs_update = needs_update or sample.get('genotype_concordance') != concordance
        if needs_update:
            charon_session.sample_update(projectid=project_id,
                                         sampleid=sample_id,
                                         **update_kwargs)
    except CharonError as e:
        return str(e)
def launch_analysis(projects_to_analyze, restart_failed_jobs=False,
                    restart_finished_jobs=False, restart_running_jobs=False,
                    keep_existing_data=False, no_qc=False, exec_mode="sbatch",
                    quiet=False, manual=False, config=None,
                    config_file_path=None, generate_bqsr_bam=False):
    """Launch the appropriate analysis for each fastq file in the project.

    :param list projects_to_analyze: The list of projects (Project objects) to analyze
    :param dict config: The parsed NGI configuration file; optional/has default.
    :param str config_file_path: The path to the NGI configuration file; optional/has default.
    """
    # First refresh local job status in Charon for every project's engine
    for project in projects_to_analyze:
        # Get information from Charon regarding which best practice analyses to run
        try:
            engine = get_engine_for_bp(project, config, config_file_path)
        except (RuntimeError, CharonError) as e:
            LOG.error('Project {} could not be processed: {}'.format(project, e))
            continue
        engine.local_process_tracking.update_charon_with_local_jobs_status(config=config)
    charon_session = CharonSession()
    for project in projects_to_analyze:
        try:
            project_status = charon_session.project_get(project.project_id)['status']
        except CharonError as e:
            LOG.error('Project {} could not be processed: {}'.format(project, e))
            continue
        if not project_status == "OPEN":
            error_text = ('Data found on filesystem for project "{}" but Charon '
                          'reports its status is not OPEN ("{}"). Not launching '
                          'analysis for this project.'.format(project, project_status))
            LOG.error(error_text)
            if not config.get('quiet'):
                mail_analysis(project_name=project.name, level="ERROR",
                              info_text=error_text)
            continue
        try:
            analysis_module = get_engine_for_bp(project)
        except (RuntimeError, CharonError) as e:  # BPA missing from Charon?
            LOG.error('Skipping project "{}" because of error: {}'.format(project, e))
            continue
        if not no_qc:
            try:
                qc_analysis_module = load_engine_module("qc", config)
            except RuntimeError as e:
                LOG.error("Could not launch qc analysis: {}".format(e))
        for sample in project:
            # Launch QC analysis
            if not no_qc:
                try:
                    LOG.info('Attempting to launch sample QC analysis '
                             'for project "{}" / sample "{}" / engine '
                             '"{}"'.format(project, sample, qc_analysis_module.__name__))
                    qc_analysis_module.analyze(project=project,
                                               sample=sample,
                                               config=config)
                except Exception as e:
                    error_text = ('Cannot process project "{}" / sample "{}" / '
                                  'engine "{}" : {}'.format(project, sample,
                                                            analysis_module.__name__, e))
                    LOG.error(error_text)
                    if not config.get("quiet"):
                        mail_analysis(project_name=project.name,
                                      sample_name=sample.name,
                                      engine_name=analysis_module.__name__,
                                      level="ERROR", info_text=e)
            # Launch actual best-practice analysis
            try:
                charon_reported_status = charon_session.sample_get(
                    project.project_id, sample).get('analysis_status')
                # Check Charon to ensure this hasn't already been processed
                if charon_reported_status == "UNDER_ANALYSIS":
                    if not restart_running_jobs:
                        error_text = ('Charon reports seqrun analysis for project "{}" '
                                      '/ sample "{}" does not need processing (already '
                                      '"{}")'.format(project, sample, charon_reported_status))
                        LOG.error(error_text)
                        if not config.get('quiet'):
                            mail_analysis(project_name=project.name,
                                          sample_name=sample.name,
                                          engine_name=analysis_module.__name__,
                                          level="ERROR", info_text=error_text)
                        continue
                elif charon_reported_status == "ANALYZED":
                    if not restart_finished_jobs:
                        error_text = ('Charon reports seqrun analysis for project "{}" '
                                      '/ sample "{}" does not need processing (already '
                                      '"{}")'.format(project, sample, charon_reported_status))
                        LOG.error(error_text)
                        if not config.get('quiet') and not config.get('manual'):
                            mail_analysis(project_name=project.name,
                                          sample_name=sample.name,
                                          engine_name=analysis_module.__name__,
                                          level="ERROR", info_text=error_text)
                        continue
                elif charon_reported_status == "FAILED":
                    if not restart_failed_jobs:
                        error_text = ('FAILED: Project "{}" / sample "{}" Charon reports '
                                      'FAILURE, manual investigation needed!'.format(project, sample))
                        LOG.error(error_text)
                        if not config.get('quiet'):
                            mail_analysis(project_name=project.name,
                                          sample_name=sample.name,
                                          engine_name=analysis_module.__name__,
                                          level="ERROR", info_text=error_text)
                        continue
            except CharonError as e:
                LOG.error(e)
                continue
            try:
                LOG.info('Attempting to launch sample analysis for '
                         'project "{}" / sample "{}" / engine'
                         '"{}"'.format(project, sample, analysis_module.__name__))
                # actual analysis launch
                analysis_module.analyze(project=project,
                                        sample=sample,
                                        restart_finished_jobs=restart_finished_jobs,
                                        restart_running_jobs=restart_running_jobs,
                                        keep_existing_data=keep_existing_data,
                                        exec_mode=exec_mode,
                                        config=config,
                                        generate_bqsr_bam=generate_bqsr_bam)
            except Exception as e:
                error_text = ('Cannot process project "{}" / sample "{}" / '
                              'engine "{}" : {}'.format(project, sample,
                                                        analysis_module.__name__, e))
                LOG.error(error_text)
                if not config.get("quiet"):
                    mail_analysis(project_name=project.name,
                                  sample_name=sample.name,
                                  engine_name=analysis_module.__name__,
                                  level="ERROR", info_text=e)
                continue
def update_charon_with_local_jobs_status():
    """Check the status of all locally-tracked jobs and update Charon accordingly.
    """
    LOG.info("Updating Charon with the status of all locally-tracked jobs...")
    with get_db_session() as session:
        charon_session = CharonSession()
        # Sequencing Run Analyses
        for seqrun_entry in session.query(SeqrunAnalysis).all():
            # Local names
            workflow = seqrun_entry.workflow
            project_name = seqrun_entry.project_name
            project_id = seqrun_entry.project_id
            project_base_path = seqrun_entry.project_base_path
            sample_id = seqrun_entry.sample_id
            libprep_id = seqrun_entry.libprep_id
            seqrun_id = seqrun_entry.seqrun_id
            pid = seqrun_entry.process_id
            exit_code = get_exit_code(workflow_name=workflow,
                                      project_base_path=project_base_path,
                                      project_name=project_name,
                                      sample_id=sample_id,
                                      libprep_id=libprep_id,
                                      seqrun_id=seqrun_id)
            label = "project/sample/libprep/seqrun {}/{}/{}/{}".format(
                project_name, sample_id, libprep_id, seqrun_id)
            try:
                if exit_code == 0:
                    # 0 -> Job finished successfully
                    LOG.info('Workflow "{}" for {} finished succesfully. '
                             'Recording status "DONE" in Charon'.format(workflow, label))
                    set_alignment_status = "DONE"
                    try:
                        write_to_charon_alignment_results(base_path=project_base_path,
                                                          project_name=project_name,
                                                          project_id=project_id,
                                                          sample_id=sample_id,
                                                          libprep_id=libprep_id,
                                                          seqrun_id=seqrun_id)
                    except (RuntimeError, ValueError) as e:
                        LOG.error(e)
                        set_alignment_status = "FAILED"
                    charon_session.seqrun_update(projectid=project_id,
                                                 sampleid=sample_id,
                                                 libprepid=libprep_id,
                                                 seqrunid=seqrun_id,
                                                 alignment_status=set_alignment_status)
                    # Job is only deleted if the Charon update succeeds
                    session.delete(seqrun_entry)
                elif exit_code == 1 or (not psutil.pid_exists(pid) and not exit_code):
                    if exit_code == 1:
                        # 1 -> Job failed (DATA_FAILURE / COMPUTATION_FAILURE ?)
                        LOG.info('Workflow "{}" for {} failed. Recording status '
                                 '"FAILED" in Charon.'.format(workflow, label))
                    else:
                        # Job failed without writing an exit code (process no longer running)
                        LOG.error('ERROR: No exit code found for process {} '
                                  'but it does not appear to be running '
                                  '(pid {} does not exist). Setting status to '
                                  '"FAILED", inspect manually'.format(label, pid))
                    charon_session.seqrun_update(projectid=project_id,
                                                 sampleid=sample_id,
                                                 libprepid=libprep_id,
                                                 seqrunid=seqrun_id,
                                                 alignment_status="FAILED")
                    # Job is only deleted if the Charon update succeeds
                    LOG.debug("Deleting local entry {}".format(seqrun_entry))
                    session.delete(seqrun_entry)
                else:
                    # None -> Job still running
                    charon_status = charon_session.seqrun_get(projectid=project_id,
                                                              sampleid=sample_id,
                                                              libprepid=libprep_id,
                                                              seqrunid=seqrun_id)['alignment_status']
                    if not charon_status == "RUNNING":
                        LOG.warn('Tracking inconsistency for {}: Charon status is "{}" but '
                                 'local process tracking database indicates it is running. '
                                 'Setting value in Charon to RUNNING.'.format(label, charon_status))
                        charon_session.seqrun_update(projectid=project_id,
                                                     sampleid=sample_id,
                                                     libprepid=libprep_id,
                                                     seqrunid=seqrun_id,
                                                     alignment_status="RUNNING")
            except CharonError as e:
                LOG.error('Unable to update Charon status for "{}": {}'.format(label, e))
        # Sample Analyses
        for sample_entry in session.query(SampleAnalysis).all():
            # Local names
            workflow = sample_entry.workflow
            project_name = sample_entry.project_name
            project_id = sample_entry.project_id
            project_base_path = sample_entry.project_base_path
            sample_id = sample_entry.sample_id
            pid = sample_entry.process_id
            exit_code = get_exit_code(workflow_name=workflow,
                                      project_base_path=project_base_path,
                                      project_name=project_name,
                                      sample_id=sample_id)
            label = "project/sample/libprep/seqrun {}/{}".format(project_name, sample_id)
            try:
                if exit_code == 0:
                    # 0 -> Job finished successfully
                    LOG.info('Workflow "{}" for {} finished succesfully. '
                             'Recording status "DONE" in Charon'.format(workflow, label))
                    set_status = "DONE"
                    ## TODO implement sample-level analysis results parsing / reporting to Charon?
                    charon_session.sample_update(projectid=project_id,
                                                 sampleid=sample_id,
                                                 status=set_status)
                    # Job is only deleted if the Charon update succeeds
                    session.delete(sample_entry)
                elif exit_code == 1 or (not psutil.pid_exists(pid) and not exit_code):
                    if exit_code == 1:
                        # 1 -> Job failed (DATA_FAILURE / COMPUTATION_FAILURE ?)
                        LOG.info('Workflow "{}" for {} failed. Recording status '
                                 '"COMPUTATION_FAILED" in Charon.'.format(workflow, label))
                    else:
                        # Job failed without writing an exit code
                        LOG.error('ERROR: No exit code found for process {} '
                                  'but it does not appear to be running '
                                  '(pid {} does not exist). Setting status to '
                                  '"COMPUTATION_FAILED", inspect manually'.format(label, pid))
                    charon_session.sample_update(projectid=project_id,
                                                 sampleid=sample_id,
                                                 status="COMPUTATION_FAILED")
                    # Job is only deleted if the Charon update succeeds
                    session.delete(sample_entry)
                else:
                    # None -> Job still running
                    try:
                        charon_status = charon_session.sample_get(projectid=project_id,
                                                                  sampleid=sample_id)['status']
                    except (CharonError, KeyError) as e:
                        LOG.warn('Unable to get required information from Charon for '
                                 'sample "{}" / project "{}" -- forcing it to '
                                 'RUNNING: {}'.format(sample_id, project_id, e))
                        charon_status = "NEW"
                    if not charon_status == "RUNNING":
                        LOG.warn('Tracking inconsistency for {}: Charon status is "{}" but '
                                 'local process tracking database indicates it is running. '
                                 'Setting value in Charon to RUNNING.'.format(label, charon_status))
                        charon_session.sample_update(projectid=project_id,
                                                     sampleid=sample_id,
                                                     status="RUNNING")
            except CharonError as e:
                LOG.error('Unable to update Charon status for "{}": {}'.format(label, e))
        session.commit()
def analyze(project, sample, exec_mode="sbatch", restart_finished_jobs=False,
            restart_running_jobs=False, keep_existing_data=False,
            level="sample", genotype_file=None, config=None,
            config_file_path=None, generate_bqsr_bam=False):
    """Analyze data at the sample level.

    :param NGIProject project: the project to analyze
    :param NGISample sample: the sample to analyzed
    :param str exec_mode: "sbatch" or "local" (local not implemented)
    :param bool restart_finished_jobs: Restart jobs that are already done (have a .done file)
    :param bool restart_running_jobs: Kill and restart currently-running jobs
    :param bool keep_existing_data: Keep pre-existing analysis data on disk
    :param str level: The level on which to perform the analysis ("sample" or "genotype")
    :param str genotype_file: The path to the genotype file (only relevant for genotype analysis)
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)
    :param bool generate_bqsr_bam: Also produce a BQSR-recalibrated bam

    :raises ValueError: If exec_mode is an unsupported value
    """
    if level == "sample":
        status_field = "alignment_status"
    elif level == "genotype":
        status_field = "genotype_status"
    else:
        LOG.warn('Unknown workflow level: "{}"'.format(level))
        status_field = "alignment_status"  # Or should we abort?
    try:
        check_for_preexisting_sample_runs(project, sample, restart_running_jobs,
                                          restart_finished_jobs, status_field)
    except RuntimeError as e:
        raise RuntimeError('Aborting processing of project/sample "{}/{}": '
                           '{}'.format(project, sample, e))
    if exec_mode.lower() not in ("sbatch", "local"):
        raise ValueError('"exec_mode" param must be one of "sbatch" or "local" '
                         'value was "{}"'.format(exec_mode))
    if exec_mode == "local":
        modules_to_load = config.get("piper", {}).get("load_modules", [])
        load_modules(modules_to_load)
    for workflow_subtask in workflows.get_subtasks_for_level(level=level):
        if level == "genotype":
            genotype_status = None  # Some records in Charon lack this field, I'm guessing
            try:
                charon_session = CharonSession()
                genotype_status = charon_session.sample_get(
                    projectid=project.project_id,
                    sampleid=sample.name).get("genotype_status")
            except CharonError as e:
                LOG.error('Couldn\'t determine genotyping status for project/'
                          'sample "{}/{}"; skipping analysis.'.format(project, sample))
                continue
            if find_previous_genotype_analyses(project, sample) or genotype_status == "DONE":
                if not restart_finished_jobs:
                    LOG.info('Project/sample "{}/{}" has completed genotype '
                             'analysis previously; skipping (use flag to force '
                             'analysis)'.format(project, sample))
                    continue
        if restart_running_jobs:
            # Kill currently-running jobs if they exist
            kill_running_sample_analysis(workflow_subtask=workflow_subtask,
                                         project_id=project.project_id,
                                         sample_id=sample.name)
        # This checks the local jobs database
        if not is_sample_analysis_running_local(workflow_subtask=workflow_subtask,
                                                project_id=project.project_id,
                                                sample_id=sample.name):
            LOG.info('Launching "{}" analysis for sample "{}" in project '
                     '"{}"'.format(workflow_subtask, sample, project))
            try:
                log_file_path = create_log_file_path(workflow_subtask=workflow_subtask,
                                                     project_base_path=project.base_path,
                                                     project_name=project.dirname,
                                                     project_id=project.project_id,
                                                     sample_id=sample.name)
                rotate_file(log_file_path)
                exit_code_path = create_exit_code_file_path(workflow_subtask=workflow_subtask,
                                                            project_base_path=project.base_path,
                                                            project_name=project.dirname,
                                                            project_id=project.project_id,
                                                            sample_id=sample.name)
                if level == "sample":
                    if not keep_existing_data:
                        remove_previous_sample_analyses(project, sample)
                        default_files_to_copy = None
                elif level == "genotype":
                    if not keep_existing_data:
                        remove_previous_genotype_analyses(project)
                        default_files_to_copy = None
                # Update the project to keep only valid fastq files for setup.xml creation
                if level == "genotype":
                    updated_project, default_files_to_copy = \
                        collect_files_for_sample_analysis(project, sample,
                                                          restart_finished_jobs=True,
                                                          status_field="genotype_status")
                else:
                    updated_project, default_files_to_copy = \
                        collect_files_for_sample_analysis(project, sample,
                                                          restart_finished_jobs,
                                                          status_field="alignment_status")
                setup_xml_cl, setup_xml_path = build_setup_xml(
                    project=updated_project,
                    sample=sample,
                    workflow=workflow_subtask,
                    local_scratch_mode=(exec_mode == "sbatch"),
                    config=config)
                piper_cl = build_piper_cl(project=project,
                                          workflow_name=workflow_subtask,
                                          setup_xml_path=setup_xml_path,
                                          exit_code_path=exit_code_path,
                                          config=config,
                                          exec_mode=exec_mode,
                                          generate_bqsr_bam=generate_bqsr_bam)
                if exec_mode == "sbatch":
                    process_id = None
                    slurm_job_id = sbatch_piper_sample(
                        [setup_xml_cl, piper_cl], workflow_subtask, project, sample,
                        restart_finished_jobs=restart_finished_jobs,
                        files_to_copy=default_files_to_copy)
                    for x in xrange(10):
                        # Time delay to let sbatch get its act together
                        # (takes a few seconds to be visible with sacct)
                        try:
                            get_slurm_job_status(slurm_job_id)
                            break
                        except ValueError:
                            time.sleep(2)
                    else:
                        LOG.error('sbatch file for sample {}/{} did not '
                                  'queue properly! Job ID {} cannot be '
                                  'found.'.format(project, sample, slurm_job_id))
                else:  # "local"
                    raise NotImplementedError('Local execution not currently implemented. '
                                              'I\'m sure Denis can help you with this.')
                try:
                    record_process_sample(project=project, sample=sample,
                                          analysis_module_name="piper_ngi",
                                          slurm_job_id=slurm_job_id,
                                          process_id=process_id,
                                          workflow_subtask=workflow_subtask)
                except RuntimeError as e:
                    LOG.error(e)
                    ## Question: should we just kill the run in this case or let it go?
                    continue
            except (NotImplementedError, RuntimeError, ValueError) as e:
                error_msg = ('Processing project "{}" / sample "{}" / workflow "{}" '
                             'failed: {}'.format(project, sample, workflow_subtask, e))
                LOG.error(error_msg)
def update_charon_with_local_jobs_status(config=None, config_file_path=None):
    """Check the status of all locally-tracked jobs and update Charon accordingly.

    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)
    """
    LOG.info("Updating Charon with the status of all locally-tracked jobs...")
    with get_db_session() as session:
        charon_session = CharonSession()
        for sample_entry in session.query(SampleAnalysis).all():
            # Local names
            workflow = sample_entry.workflow
            project_name = sample_entry.project_name
            project_id = sample_entry.project_id
            project_base_path = sample_entry.project_base_path
            sample_id = sample_entry.sample_id
            engine = sample_entry.engine
            # Only one of these will have a value
            slurm_job_id = sample_entry.slurm_job_id
            process_id = sample_entry.process_id
            piper_exit_code = get_exit_code(workflow_name=workflow,
                                            project_base_path=project_base_path,
                                            project_name=project_name,
                                            project_id=project_id,
                                            sample_id=sample_id)
            label = "project/sample {}/{}".format(project_name, sample_id)
            try:
                project_obj = create_project_obj_from_analysis_log(project_name,
                                                                   project_id,
                                                                   project_base_path,
                                                                   sample_id,
                                                                   workflow)
            except IOError as e:  # analysis log file is missing!
                error_text = ('Could not find analysis log file! Cannot update '
                              'Charon for sample run {}/{}: {}'.format(project_id, sample_id, e))
                LOG.error(error_text)
                if not config.get('quiet'):
                    mail_analysis(project_name=project_name, sample_name=sample_id,
                                  engine_name=engine, level="ERROR",
                                  info_text=error_text)
                continue
            try:
                # BUG FIX: the old test "piper_exit_code and piper_exit_code == 0"
                # could never be true (0 is falsy), so successful jobs were
                # never marked ANALYZED. A plain equality check is correct and
                # still excludes the None (still-running) case.
                if piper_exit_code == 0:
                    # 0 -> Job finished successfully
                    set_status = "ANALYZED"
                    info_text = ('Workflow "{}" for {} finished succesfully. '
                                 'Recording status {} in Charon'.format(workflow, label, set_status))
                    LOG.info(info_text)
                    if not config.get('quiet'):
                        mail_analysis(project_name=project_name, sample_name=sample_id,
                                      engine_name=engine, level="INFO",
                                      info_text=info_text)
                    charon_session.sample_update(projectid=project_id,
                                                 sampleid=sample_id,
                                                 analysis_status=set_status)
                    recurse_status = "DONE"
                    recurse_status_for_sample(project_obj, recurse_status)
                    # Job is only deleted if the Charon status update succeeds
                    session.delete(sample_entry)
                    # Parse seqrun output results / update Charon
                    # This is a semi-optional step -- failure here will send an
                    # email but not more than once. The record is still removed
                    # from the local jobs database, so this will have to be done
                    # manually if you want it done at all.
                    piper_qc_dir = os.path.join(project_base_path, "ANALYSIS",
                                                project_id, "piper_ngi",
                                                "02_preliminary_alignment_qc")
                    update_coverage_for_sample_seqruns(project_id, sample_id, piper_qc_dir)
                elif piper_exit_code and piper_exit_code > 0:
                    # 1 -> Job failed
                    set_status = "FAILED"
                    error_text = ('Workflow "{}" for {} failed. Recording status '
                                  '{} in Charon.'.format(workflow, label, set_status))
                    LOG.error(error_text)
                    if not config.get('quiet'):
                        mail_analysis(project_name=project_name, sample_name=sample_id,
                                      engine_name=engine, level="ERROR",
                                      info_text=error_text)
                    charon_session.sample_update(projectid=project_id,
                                                 sampleid=sample_id,
                                                 analysis_status=set_status)
                    recurse_status_for_sample(project_obj, set_status)
                    # Job is only deleted if the Charon update succeeds
                    session.delete(sample_entry)
                else:
                    # None -> Job still running OR exit code was never written (failure)
                    JOB_FAILED = None
                    if slurm_job_id:
                        try:
                            slurm_exit_code = get_slurm_job_status(slurm_job_id)
                        except ValueError as e:
                            slurm_exit_code = 1
                        if slurm_exit_code is not None:  # "None" indicates job is still running
                            JOB_FAILED = True
                    else:
                        if not psutil.pid_exists(process_id):
                            # Job did not write an exit code and is also not running
                            JOB_FAILED = True
                    if JOB_FAILED:
                        set_status = "FAILED"
                        error_text = ('No exit code found but job not running for '
                                      '{}: setting status to {} in Charon'.format(label, set_status))
                        LOG.error(error_text)
                        if not config.get('quiet'):
                            mail_analysis(project_name=project_name, sample_name=sample_id,
                                          engine_name=engine, level="ERROR",
                                          info_text=error_text)
                        charon_session.sample_update(projectid=project_id,
                                                     sampleid=sample_id,
                                                     analysis_status=set_status)
                        recurse_status_for_sample(project_obj, set_status)
                        # Job is only deleted if the Charon update succeeds
                        LOG.debug("Deleting local entry {}".format(sample_entry))
                        session.delete(sample_entry)
                    else:  # Job still running
                        charon_status = charon_session.sample_get(
                            projectid=project_id, sampleid=sample_id)['analysis_status']
                        if not charon_status == "UNDER_ANALYSIS":
                            set_status = "UNDER_ANALYSIS"
                            LOG.warn('Tracking inconsistency for {}: Charon status is "{}" but '
                                     'local process tracking database indicates it is running. '
                                     'Setting value in Charon to {}.'.format(label, charon_status, set_status))
                            charon_session.sample_update(projectid=project_id,
                                                         sampleid=sample_id,
                                                         analysis_status=set_status)
                            recurse_status_for_sample(project_obj, "RUNNING")
            except CharonError as e:
                error_text = ('Unable to update Charon status for "{}": {}'.format(label, e))
                LOG.error(error_text)
                if not config.get('quiet'):
                    mail_analysis(project_name=project_name, sample_name=sample_id,
                                  engine_name=engine, level="ERROR",
                                  info_text=error_text)
            except OSError as e:
                error_text = ('Permissions error when trying to update Charon '
                              'status for "{}": {}'.format(label, e))
                LOG.error(error_text)
                if not config.get('quiet'):
                    mail_analysis(project_name=project_name, sample_name=sample_id,
                                  engine_name=engine, level="ERROR",
                                  info_text=error_text)
        session.commit()
def analyze(analysis_object, config=None, config_file_path=None):
    """Launch the RNA analysis for a project.

    Determines the sequencing facility and reference genome from Charon,
    collects the fastq files for every sample/libprep/seqrun that Charon says
    needs processing, and submits one batch job for the whole project.

    :param NGIAnalysis analysis_object: holds all the parameters for the analysis
    :param dict config: The parsed NGI configuration file; optional
    :param str config_file_path: The path to the NGI configuration file; optional

    :raises RuntimeError: If the Charon project is registered with neither facility
    """
    charon_session = CharonSession()
    charon_pj = charon_session.project_get(analysis_object.project.project_id)
    reference_genome = charon_pj.get('reference')
    if charon_pj.get("sequencing_facility") == "NGI-S":
        analysis_object.sequencing_facility = "sthlm"
    elif charon_pj.get("sequencing_facility") == "NGI-U":
        analysis_object.sequencing_facility = "upps"
    else:
        # Bugfix: raise with the message instead of a bare RuntimeError class
        error_msg = ("charon project not registered with stockholm or uppsala. "
                     "Which config file should we use for the RNA pipeline ?")
        LOG.error(error_msg)
        raise RuntimeError(error_msg)
    fastq_files = []
    if reference_genome and reference_genome != 'other':
        for sample in analysis_object.project:
            try:
                charon_reported_status = charon_session.sample_get(
                    analysis_object.project.project_id, sample).get('analysis_status')
                # Check Charon to ensure this hasn't already been processed
                do_analyze = handle_sample_status(analysis_object, sample,
                                                  charon_reported_status)
                if not do_analyze:
                    continue
            except CharonError as e:
                LOG.error(e)
                # Bugfix/consistency: skip the sample when its Charon status cannot
                # be determined, as the other launchers in this module do.
                continue
            for libprep in sample:
                charon_lp_status = charon_session.libprep_get(
                    analysis_object.project.project_id, sample.name,
                    libprep.name).get('qc')
                do_analyze = handle_libprep_status(analysis_object, libprep,
                                                   charon_lp_status)
                if not do_analyze:
                    continue
                for seqrun in libprep:
                    charon_sr_status = charon_session.seqrun_get(
                        analysis_object.project.project_id, sample.name,
                        libprep.name, seqrun.name).get('alignment_status')
                    do_analyze = handle_seqrun_status(analysis_object, seqrun,
                                                      charon_sr_status)
                    if not do_analyze:
                        continue
                    seqrun.being_analyzed = True
                    sample.being_analyzed = sample.being_analyzed or True
                    # Collect the on-disk paths of this seqrun's fastq files
                    for fastq_file in seqrun.fastq_files:
                        fastq_path = os.path.join(analysis_object.project.base_path,
                                                  "DATA",
                                                  analysis_object.project.project_id,
                                                  sample.name,
                                                  libprep.name,
                                                  seqrun.name,
                                                  fastq_file)
                        fastq_files.append(fastq_path)
    if not fastq_files:
        # Typo fix: "fo" -> "of"
        LOG.error("No fastq files obtained for the analysis of project {}, "
                  "please check the Charon status.".format(analysis_object.project.name))
    else:
        if analysis_object.restart_running_jobs:
            stop_ongoing_analysis(analysis_object)
        fastq_dir = preprocess_analysis(analysis_object, fastq_files)
        sbatch_path = write_batch_job(analysis_object, reference_genome, fastq_dir)
        job_id = start_analysis(sbatch_path)
        analysis_path = os.path.join(analysis_object.project.base_path,
                                     "ANALYSIS",
                                     analysis_object.project.project_id,
                                     'rna_ngi')
        record_project_job(analysis_object.project, job_id, analysis_path)
def main(inbox=None, num_days=14, genotype_files=None, config=None, config_file_path=None): if genotype_files: gt_files_valid = [ os.path.abspath(gt_file) for gt_file in genotype_files ] else: if not inbox: try: inboxes = config["environment"]["flowcell_inbox"] except (KeyError, TypeError): raise ValueError( "No path to delivery inbox specified by argument " "or in configuration file ({}). Exiting.".format( config_file_path)) for inbox in inboxes: inbox = os.path.abspath(inbox) # Convert to seconds cutoff_age = time.time() - (int(num_days) * 24 * 60 * 60) LOG.info("Searching for genotype files under {} modified after " "{}".format(inbox, time.ctime(cutoff_age))) gt_files_valid = [] for gt_file in filter(GENOTYPE_FILE_RE.match, glob.glob(os.path.join(inbox, "*"))): if os.stat(gt_file).st_mtime > time.time() - cutoff_age: gt_files_valid.append(os.path.abspath(gt_file)) if not gt_files_valid: LOG.info("No genotype files found under {} newer than " "{}".format(inbox, time.ctime(cutoff_age))) else: charon_session = CharonSession() for gt_file_path in gt_files_valid: project_samples_dict = \ find_projects_from_samples(parse_samples_from_vcf(gt_file_path)) for project_id, samples in project_samples_dict.iteritems(): LOG.info("Updating project {}...".format(project_id)) for sample in samples: try: genotype_status = \ charon_session.sample_get(projectid=project_id, sampleid=sample).get("genotype_status") if genotype_status in (None, "NOT_AVAILABLE"): LOG.info('Updating sample {} genotype_status ' 'to "AVAILABLE"...'.format(sample)) charon_session.sample_update( projectid=project_id, sampleid=sample, genotype_status="AVAILABLE") else: LOG.info('Not updating sample {} genotype_status ' '(already "{}")'.format( sample, genotype_status)) except CharonError as e: LOG.error( 'Could not update genotype status to "AVAILABLE" ' 'for project/sample "{}/{}": {}'.format( project_id, sample, e))
def launch_analysis(level, projects_to_analyze, restart_failed_jobs=False,
                    config=None, config_file_path=None):
    """Launch the appropriate seqrun (flowcell-level) analysis for each fastq
    file in the project.

    :param str level: "sample" or "seqrun" -- granularity of the launched jobs
    :param list projects_to_analyze: The list of projects (Project objects) to analyze
    :param bool restart_failed_jobs: Relaunch jobs Charon marks as FAILED
    :param dict config: The parsed NGI configuration file; optional/has default.
    :param str config_file_path: The path to the NGI configuration file; optional/has default.

    :raises RuntimeError: If a workflow has no analysis engine configured
    """
    # Update Charon with the local state of all the jobs we're running
    update_charon_with_local_jobs_status()
    charon_session = CharonSession()
    for project in projects_to_analyze:
        # Get information from Charon regarding which workflows to run
        try:
            # E.g. "NGI" for NGI DNA Samples
            workflow = charon_session.project_get(project.project_id)["pipeline"]
        except (KeyError, CharonError) as e:
            # Workflow missing from Charon?
            LOG.error('Skipping project "{}" because of error: {}'.format(project, e))
            continue
        try:
            analysis_engine_module_name = \
                config["analysis"]["workflows"][workflow]["analysis_engine"]
        except KeyError:
            error_msg = ("No analysis engine for workflow \"{}\" specified "
                         "in configuration file. Skipping this workflow "
                         "for project {}".format(workflow, project))
            LOG.error(error_msg)
            raise RuntimeError(error_msg)
        # Import the adapter module specified in the config file (e.g. piper_ngi)
        try:
            analysis_module = importlib.import_module(analysis_engine_module_name)
        except ImportError as e:
            error_msg = ('Skipping project "{}" workflow "{}": couldn\'t import '
                         'module "{}": {}'.format(project, workflow,
                                                  analysis_engine_module_name, e))
            LOG.error(error_msg)
            # Next project
            continue
        # Flatten the hierarchy into a list of work units at the requested level
        objects_to_process = []
        if level == "sample":
            for sample in project:
                objects_to_process.append({"project": project, "sample": sample})
        elif level == "seqrun":
            for sample in project:
                for libprep in sample:
                    for seqrun in libprep:
                        objects_to_process.append({"project": project,
                                                   "sample": sample,
                                                   "libprep": libprep,
                                                   "seqrun": seqrun})
        for obj_dict in objects_to_process:
            project = obj_dict.get("project")
            sample = obj_dict.get("sample")
            libprep = obj_dict.get("libprep")
            seqrun = obj_dict.get("seqrun")
            try:
                if level == "seqrun":
                    charon_reported_status = charon_session.seqrun_get(
                        project.project_id, sample, libprep, seqrun)['alignment_status']
                else:  # sample-level
                    charon_reported_status = charon_session.sample_get(
                        project.project_id, sample)['status']
            except (CharonError, KeyError) as e:
                LOG.warn('Unable to get required information from Charon for '
                         'sample "{}" / project "{}" -- forcing it to new: {}'.format(
                             sample, project, e))
                if level == "seqrun":
                    charon_session.seqrun_update(project.project_id, sample.name,
                                                 libprep.name, seqrun.name,
                                                 alignment_status="NEW")
                    charon_reported_status = charon_session.seqrun_get(
                        project.project_id, sample, libprep, seqrun)['alignment_status']
                else:
                    charon_session.sample_update(project.project_id, sample.name,
                                                 status="NEW")
                    charon_reported_status = charon_session.sample_get(
                        project.project_id, sample)['status']
            # Check Charon to ensure this hasn't already been processed
            if charon_reported_status in ("RUNNING", "DONE"):
                if level == "seqrun":
                    LOG.info('Charon reports seqrun analysis for project "{}" / sample "{}" '
                             '/ libprep "{}" / seqrun "{}" does not need processing '
                             ' (already "{}")'.format(project, sample, libprep, seqrun,
                                                      charon_reported_status))
                else:  # Sample
                    LOG.info('Charon reports seqrun analysis for project "{}" / sample "{}" '
                             'does not need processing '
                             ' (already "{}")'.format(project, sample,
                                                      charon_reported_status))
                continue
            elif charon_reported_status == "FAILED":
                if not restart_failed_jobs:
                    if level == "seqrun":
                        LOG.error('FAILED: Project "{}" / sample "{}" / library "{}" '
                                  '/ flowcell "{}": Charon reports FAILURE, manual '
                                  'investigation needed!'.format(project, sample,
                                                                 libprep, seqrun))
                    else:  # Sample
                        LOG.error('FAILED: Project "{}" / sample "{}" Charon reports FAILURE, manual '
                                  'investigation needed!'.format(project, sample,
                                                                 libprep, seqrun))
                    continue
            try:
                # The engines themselves know which sub-workflows
                # they need to execute for a given level. For example,
                # with DNA Variant Calling on the sequencing run
                # level, we need to execute basic alignment and QC.
                if level == "seqrun":
                    LOG.info('Attempting to launch seqrun analysis for '
                             'project "{}" / sample "{}" / libprep "{}" '
                             '/ seqrun "{}", workflow "{}"'.format(project, sample,
                                                                   libprep, seqrun,
                                                                   workflow))
                    analysis_module.analyze_seqrun(project=project,
                                                   sample=sample,
                                                   libprep=libprep,
                                                   seqrun=seqrun)
                else:  # sample level
                    LOG.info('Attempting to launch sample analysis for '
                             'project "{}" / sample "{}" / workflow '
                             '"{}"'.format(project, sample, workflow))
                    analysis_module.analyze_sample(project=project, sample=sample)
            except Exception as e:
                # Bugfix: the original did "raise LOG.error(...)", which raises a
                # TypeError (LOG.error returns None) and made the log call and the
                # continue below unreachable; log the failure and move on instead.
                LOG.error('Cannot process project "{}" / sample "{}" / '
                          'libprep "{}" / seqrun "{}" / workflow '
                          '"{}" : {}'.format(project, sample, libprep, seqrun,
                                             workflow, e))
                continue
parser.add_argument("-s", "--sample", required=True) parser.add_argument("-c", "--coverage", type=int, required=True, dest="required_coverage") args = parser.parse_args() project = args.project sample = args.sample required_coverage = args.required_coverage charon_session = CharonSession() try: reported_coverage = charon_session.sample_get( project, sample).get("total_autosomal_coverage") except CharonError as e: try: project = get_project_id_from_name(project) except (CharonError, RuntimeError, ValueError) as e: print( ('ERROR: Could not determine coverage for project {} / sample ' '{}: {}'.format(project, sample, e)), file=sys.stderr) reported_coverage = 0 else: reported_coverage = charon_session.sample_get( project, sample).get("total_autosomal_coverage") if int(reported_coverage) >= int(required_coverage): sys.exit(0) else:
def launch_analysis(projects_to_analyze, restart_failed_jobs=False,
                    restart_finished_jobs=False, restart_running_jobs=False,
                    keep_existing_data=False, no_qc=False, exec_mode="sbatch",
                    quiet=False, manual=False, config=None, config_file_path=None,
                    generate_bqsr_bam=False):
    """Launch the appropriate analysis for each fastq file in the project.

    :param list projects_to_analyze: The list of projects (Project objects) to analyze
    :param dict config: The parsed NGI configuration file; optional/has default.
    :param str config_file_path: The path to the NGI configuration file; optional/has default.
    """
    # First pass: sync Charon with the local state of every project's jobs
    for project in projects_to_analyze:
        # Get information from Charon regarding which best practice analyses to run
        try:
            engine = get_engine_for_bp(project, config, config_file_path)
        except (RuntimeError, CharonError) as e:
            LOG.error('Project {} could not be processed: {}'.format(project, e))
            continue
        engine.local_process_tracking.update_charon_with_local_jobs_status(config=config)
    charon_session = CharonSession()
    for project in projects_to_analyze:
        try:
            project_status = charon_session.project_get(project.project_id)['status']
        except CharonError as e:
            LOG.error('Project {} could not be processed: {}'.format(project, e))
            continue
        if not project_status == "OPEN":
            error_text = ('Data found on filesystem for project "{}" but Charon '
                          'reports its status is not OPEN ("{}"). Not launching '
                          'analysis for this project.'.format(project, project_status))
            LOG.error(error_text)
            if not config.get('quiet'):
                mail_analysis(project_name=project.name, level="ERROR",
                              info_text=error_text)
            continue
        try:
            analysis_module = get_engine_for_bp(project)
        except (RuntimeError, CharonError) as e:
            # BPA missing from Charon?
            LOG.error('Skipping project "{}" because of error: {}'.format(project, e))
            continue
        if not no_qc:
            try:
                qc_analysis_module = load_engine_module("qc", config)
            except RuntimeError as e:
                LOG.error("Could not launch qc analysis: {}".format(e))
                # Bugfix: previously qc_analysis_module stayed unbound after this
                # failure and the per-sample QC launch below raised NameError.
                no_qc = True
        for sample in project:
            # Launch QC analysis
            if not no_qc:
                try:
                    LOG.info('Attempting to launch sample QC analysis '
                             'for project "{}" / sample "{}" / engine '
                             '"{}"'.format(project, sample,
                                           qc_analysis_module.__name__))
                    qc_analysis_module.analyze(project=project,
                                               sample=sample,
                                               config=config)
                except Exception as e:
                    error_text = ('Cannot process project "{}" / sample "{}" / '
                                  'engine "{}" : {}'.format(project, sample,
                                                            analysis_module.__name__, e))
                    LOG.error(error_text)
                    if not config.get("quiet"):
                        # Consistency fix: pass the full error_text (was info_text=e)
                        # like every other mail_analysis call in this function.
                        mail_analysis(project_name=project.name,
                                      sample_name=sample.name,
                                      engine_name=analysis_module.__name__,
                                      level="ERROR", info_text=error_text)
            # Launch actual best-practice analysis
            try:
                charon_reported_status = charon_session.sample_get(
                    project.project_id, sample).get('analysis_status')
                # Check Charon to ensure this hasn't already been processed
                if charon_reported_status == "UNDER_ANALYSIS":
                    if not restart_running_jobs:
                        error_text = ('Charon reports seqrun analysis for project "{}" '
                                      '/ sample "{}" does not need processing (already '
                                      '"{}")'.format(project, sample,
                                                     charon_reported_status))
                        LOG.error(error_text)
                        if not config.get('quiet'):
                            mail_analysis(project_name=project.name,
                                          sample_name=sample.name,
                                          engine_name=analysis_module.__name__,
                                          level="ERROR", info_text=error_text)
                        continue
                elif charon_reported_status == "ANALYZED":
                    if not restart_finished_jobs:
                        error_text = ('Charon reports seqrun analysis for project "{}" '
                                      '/ sample "{}" does not need processing (already '
                                      '"{}")'.format(project, sample,
                                                     charon_reported_status))
                        LOG.error(error_text)
                        if not config.get('quiet') and not config.get('manual'):
                            mail_analysis(project_name=project.name,
                                          sample_name=sample.name,
                                          engine_name=analysis_module.__name__,
                                          level="ERROR", info_text=error_text)
                        continue
                elif charon_reported_status == "FAILED":
                    if not restart_failed_jobs:
                        error_text = ('FAILED: Project "{}" / sample "{}" Charon reports '
                                      'FAILURE, manual investigation needed!'.format(
                                          project, sample))
                        LOG.error(error_text)
                        if not config.get('quiet'):
                            mail_analysis(project_name=project.name,
                                          sample_name=sample.name,
                                          engine_name=analysis_module.__name__,
                                          level="ERROR", info_text=error_text)
                        continue
            except CharonError as e:
                LOG.error(e)
                continue
            try:
                LOG.info('Attempting to launch sample analysis for '
                         'project "{}" / sample "{}" / engine'
                         '"{}"'.format(project, sample, analysis_module.__name__))
                # actual analysis launch
                analysis_module.analyze(project=project,
                                        sample=sample,
                                        restart_finished_jobs=restart_finished_jobs,
                                        restart_running_jobs=restart_running_jobs,
                                        keep_existing_data=keep_existing_data,
                                        exec_mode=exec_mode,
                                        config=config,
                                        generate_bqsr_bam=generate_bqsr_bam)
            except Exception as e:
                error_text = ('Cannot process project "{}" / sample "{}" / '
                              'engine "{}" : {}'.format(project, sample,
                                                        analysis_module.__name__, e))
                LOG.error(error_text)
                if not config.get("quiet"):
                    # Consistency fix: was info_text=e; use the composed error_text
                    mail_analysis(project_name=project.name,
                                  sample_name=sample.name,
                                  engine_name=analysis_module.__name__,
                                  level="ERROR", info_text=error_text)
                continue
def analyze(analysis_object, level='sample', config=None, config_file_path=None):
    """Analyze data at the sample level.

    For each sample in the project: consult Charon for its current status,
    clean up any previous analyses (unless told to keep them), build the
    setup.xml and piper command lines, and submit them via sbatch.

    :param NGIAnalysis analysis_object: holds all the parameters for the analysis

    :raises ValueError: If exec_mode is an unsupported value
    """
    charon_session = CharonSession()
    for sample in analysis_object.project:
        try:
            charon_reported_status = charon_session.sample_get(
                analysis_object.project.project_id, sample).get('analysis_status')
            # Check Charon to ensure this hasn't already been processed
            do_analyze = handle_sample_status(analysis_object, sample,
                                              charon_reported_status)
            if not do_analyze:
                continue
        except CharonError as e:
            # Cannot determine the sample's status -> skip it entirely
            LOG.error(e)
            continue
        # Which Charon field gates re-analysis depends on the workflow level
        if level == "sample":
            status_field = "alignment_status"
        elif level == "genotype":
            status_field = "genotype_status"
        else:
            LOG.warn('Unknown workflow level: "{}"'.format(level))
            status_field = "alignment_status"  # Or should we abort?
        try:
            check_for_preexisting_sample_runs(analysis_object.project, sample,
                                              analysis_object.restart_running_jobs,
                                              analysis_object.restart_finished_jobs,
                                              status_field)
        except RuntimeError as e:
            raise RuntimeError('Aborting processing of project/sample "{}/{}": '
                               '{}'.format(analysis_object.project, sample, e))
        if analysis_object.exec_mode.lower() not in ("sbatch", "local"):
            raise ValueError('"exec_mode" param must be one of "sbatch" or "local" '
                             'value was "{}"'.format(analysis_object.exec_mode))
        if analysis_object.exec_mode == "local":
            # Local runs need the piper environment modules loaded in-process
            modules_to_load = analysis_object.config.get("piper", {}).get(
                "load_modules", [])
            load_modules(modules_to_load)
        for workflow_subtask in workflows.get_subtasks_for_level(level=level):
            if level == "genotype":
                genotype_status = None  # Some records in Charon lack this field, I'm guessing
                try:
                    charon_session = CharonSession()
                    genotype_status = charon_session.sample_get(
                        projectid=analysis_object.project.project_id,
                        sampleid=sample.name).get("genotype_status")
                except CharonError as e:
                    LOG.error('Couldn\'t determine genotyping status for project/'
                              'sample "{}/{}"; skipping analysis.'.format(
                                  analysis_object.project, sample))
                    continue
                # Skip samples already genotyped (on disk or per Charon) unless forced
                if find_previous_genotype_analyses(analysis_object.project,
                                                   sample) or genotype_status == "DONE":
                    if not analysis_object.restart_finished_jobs:
                        LOG.info('Project/sample "{}/{}" has completed genotype '
                                 'analysis previously; skipping (use flag to force '
                                 'analysis)'.format(analysis_object.project, sample))
                        continue
            if analysis_object.restart_running_jobs:
                # Kill currently-running jobs if they exist
                kill_running_sample_analysis(
                    workflow_subtask=workflow_subtask,
                    project_id=analysis_object.project.project_id,
                    sample_id=sample.name)
            # This checks the local jobs database
            if not is_sample_analysis_running_local(
                    workflow_subtask=workflow_subtask,
                    project_id=analysis_object.project.project_id,
                    sample_id=sample.name):
                LOG.info('Launching "{}" analysis for sample "{}" in project '
                         '"{}"'.format(workflow_subtask, sample,
                                       analysis_object.project))
                try:
                    log_file_path = create_log_file_path(
                        workflow_subtask=workflow_subtask,
                        project_base_path=analysis_object.project.base_path,
                        project_name=analysis_object.project.dirname,
                        project_id=analysis_object.project.project_id,
                        sample_id=sample.name)
                    rotate_file(log_file_path)
                    exit_code_path = create_exit_code_file_path(
                        workflow_subtask=workflow_subtask,
                        project_base_path=analysis_object.project.base_path,
                        project_name=analysis_object.project.dirname,
                        project_id=analysis_object.project.project_id,
                        sample_id=sample.name)
                    # Remove stale output from earlier runs unless asked to keep it
                    if level == "sample":
                        if not analysis_object.keep_existing_data:
                            remove_previous_sample_analyses(analysis_object.project,
                                                            sample)
                            default_files_to_copy = None
                    elif level == "genotype":
                        if not analysis_object.keep_existing_data:
                            remove_previous_genotype_analyses(analysis_object.project)
                            default_files_to_copy = None
                    # Update the project to keep only valid fastq files for setup.xml creation
                    if level == "genotype":
                        updated_project, default_files_to_copy = \
                            collect_files_for_sample_analysis(
                                analysis_object.project, sample,
                                restart_finished_jobs=True,
                                status_field="genotype_status")
                    else:
                        updated_project, default_files_to_copy = \
                            collect_files_for_sample_analysis(
                                analysis_object.project, sample,
                                analysis_object.restart_finished_jobs,
                                status_field="alignment_status")
                    setup_xml_cl, setup_xml_path = build_setup_xml(
                        project=updated_project,
                        sample=sample,
                        workflow=workflow_subtask,
                        local_scratch_mode=(analysis_object.exec_mode == "sbatch"),
                        config=analysis_object.config)
                    piper_cl = build_piper_cl(
                        project=analysis_object.project,
                        workflow_name=workflow_subtask,
                        setup_xml_path=setup_xml_path,
                        exit_code_path=exit_code_path,
                        config=analysis_object.config,
                        exec_mode=analysis_object.exec_mode,
                        generate_bqsr_bam=analysis_object.generate_bqsr_bam)
                    if analysis_object.exec_mode == "sbatch":
                        process_id = None
                        slurm_job_id = sbatch_piper_sample(
                            [setup_xml_cl, piper_cl],
                            workflow_subtask,
                            analysis_object.project, sample,
                            restart_finished_jobs=analysis_object.restart_finished_jobs,
                            files_to_copy=default_files_to_copy)
                        for x in xrange(10):
                            # Time delay to let sbatch get its act together
                            # (takes a few seconds to be visible with sacct)
                            try:
                                get_slurm_job_status(slurm_job_id)
                                break
                            except ValueError:
                                time.sleep(2)
                        else:
                            # for-else: ran out of retries without finding the job
                            LOG.error('sbatch file for sample {}/{} did not '
                                      'queue properly! Job ID {} cannot be '
                                      'found.'.format(analysis_object.project,
                                                      sample, slurm_job_id))
                    else:  # "local"
                        raise NotImplementedError('Local execution not currently implemented. '
                                                  'I\'m sure Denis can help you with this.')
                        #slurm_job_id = None
                        #launch_piper_job(setup_xml_cl, project)
                        #process_handle = launch_piper_job(piper_cl, project)
                        #process_id = process_handle.pid
                    try:
                        # Track the submitted job in the local jobs database
                        record_process_sample(
                            project=analysis_object.project,
                            sample=sample,
                            analysis_module_name="piper_ngi",
                            slurm_job_id=slurm_job_id,
                            process_id=process_id,
                            workflow_subtask=workflow_subtask)
                    except RuntimeError as e:
                        LOG.error(e)
                        ## Question: should we just kill the run in this case or let it go?
                        continue
                except (NotImplementedError, RuntimeError, ValueError) as e:
                    error_msg = ('Processing project "{}" / sample "{}" / workflow "{}" '
                                 'failed: {}'.format(analysis_object.project,
                                                     sample, workflow_subtask, e))
                    LOG.error(error_msg)
def update_charon_with_local_jobs_status(quiet=False, config=None,
                                         config_file_path=None):
    """Check the status of all locally-tracked jobs and update Charon accordingly.

    Walks every SampleAnalysis row in the local jobs database, determines
    whether the corresponding job finished, failed, or is still running, and
    mirrors that state into Charon (sample- and seqrun-level status fields).
    Successfully-finished projects are collected and run through MultiQC at
    the end.
    """
    if quiet and not config.get("quiet"):
        config['quiet'] = True
    LOG.info("Updating Charon with the status of all locally-tracked jobs...")
    # Projects whose jobs finished OK; MultiQC runs once per project at the end
    multiqc_projects = set()
    with get_db_session() as session:
        charon_session = CharonSession()
        for sample_entry in session.query(SampleAnalysis).all():
            # Local names
            workflow = sample_entry.workflow
            project_name = sample_entry.project_name
            project_id = sample_entry.project_id
            project_base_path = sample_entry.project_base_path
            sample_id = sample_entry.sample_id
            engine = sample_entry.engine
            # Only one of these id fields (slurm, pid) will have a value
            slurm_job_id = sample_entry.slurm_job_id
            process_id = sample_entry.process_id
            # None here means "no exit code written (yet)" -- see branches below
            piper_exit_code = get_exit_code(workflow_name=workflow,
                                            project_base_path=project_base_path,
                                            project_name=project_name,
                                            project_id=project_id,
                                            sample_id=sample_id)
            label = "project/sample {}/{}".format(project_name, sample_id)
            if workflow not in ("merge_process_variantcall",
                                "genotype_concordance",):
                LOG.error('Unknown workflow "{}" for {}; cannot update '
                          'Charon. Skipping sample.'.format(workflow, label))
                continue
            try:
                project_obj = create_project_obj_from_analysis_log(project_name,
                                                                   project_id,
                                                                   project_base_path,
                                                                   sample_id,
                                                                   workflow)
            except IOError as e:  # analysis log file is missing!
                error_text = ('Could not find analysis log file! Cannot update '
                              'Charon for {} run {}/{}: {}'.format(workflow,
                                                                   project_id,
                                                                   sample_id, e))
                LOG.error(error_text)
                if not config.get('quiet'):
                    mail_analysis(project_name=project_name,
                                  sample_name=sample_id,
                                  engine_name=engine, level="ERROR",
                                  info_text=error_text, workflow=workflow)
                continue
            try:
                if piper_exit_code == 0:
                    # 0 -> Job finished successfully
                    if workflow == "merge_process_variantcall":
                        sample_status_field = "analysis_status"
                        seqrun_status_field = "alignment_status"
                        set_status = "ANALYZED"  # sample level
                    elif workflow == "genotype_concordance":
                        sample_status_field = seqrun_status_field = "genotype_status"
                        set_status = "DONE"  # sample level
                    recurse_status = "DONE"  # For the seqrun level
                    info_text = ('Workflow "{}" for {} finished succesfully. '
                                 'Recording status {} in Charon'.format(workflow,
                                                                        label,
                                                                        set_status))
                    LOG.info(info_text)
                    if not config.get('quiet'):
                        mail_analysis(project_name=project_name,
                                      sample_name=sample_id,
                                      engine_name=engine, level="INFO",
                                      info_text=info_text, workflow=workflow)
                    charon_session.sample_update(projectid=project_id,
                                                 sampleid=sample_id,
                                                 **{sample_status_field: set_status})
                    recurse_status_for_sample(project_obj,
                                              status_field=seqrun_status_field,
                                              status_value=recurse_status,
                                              config=config)
                    # Job is only deleted if the Charon status update succeeds
                    session.delete(sample_entry)
                    #add project to MultiQC
                    multiqc_projects.add((project_base_path, project_id,
                                          project_name))
                    if workflow == "merge_process_variantcall":
                        # Parse seqrun output results / update Charon
                        # This is a semi-optional step -- failure here will send an
                        # email but not more than once. The record is still removed
                        # from the local jobs database, so this will have to be done
                        # manually if you want it done at all.
                        piper_qc_dir = os.path.join(project_base_path, "ANALYSIS",
                                                    project_id, "piper_ngi",
                                                    "02_preliminary_alignment_qc")
                        update_coverage_for_sample_seqruns(project_id, sample_id,
                                                           piper_qc_dir)
                        update_sample_duplication_and_coverage(project_id,
                                                               sample_id,
                                                               project_base_path)
                    elif workflow == "genotype_concordance":
                        piper_gt_dir = os.path.join(project_base_path, "ANALYSIS",
                                                    project_id, "piper_ngi",
                                                    "03_genotype_concordance")
                        try:
                            update_gtc_for_sample(project_id, sample_id,
                                                  piper_gt_dir)
                        except (CharonError, IOError, ValueError) as e:
                            LOG.error(e)
                elif type(piper_exit_code) is int and piper_exit_code > 0:
                    # 1 -> Job failed
                    set_status = "FAILED"
                    error_text = ('Workflow "{}" for {} failed. Recording status '
                                  '{} in Charon.'.format(workflow, label,
                                                         set_status))
                    LOG.error(error_text)
                    if not config.get('quiet'):
                        mail_analysis(project_name=project_name,
                                      sample_name=sample_id,
                                      engine_name=engine, level="ERROR",
                                      info_text=error_text, workflow=workflow)
                    if workflow == "merge_process_variantcall":
                        sample_status_field = "analysis_status"
                        seqrun_status_field = "alignment_status"
                    elif workflow == "genotype_concordance":
                        sample_status_field = seqrun_status_field = "genotype_status"
                    charon_session.sample_update(projectid=project_id,
                                                 sampleid=sample_id,
                                                 **{sample_status_field: set_status})
                    recurse_status_for_sample(project_obj,
                                              status_field=seqrun_status_field,
                                              status_value=set_status,
                                              config=config)
                    # Job is only deleted if the Charon update succeeds
                    session.delete(sample_entry)
                else:
                    # None -> Job still running OR exit code was never written (failure)
                    JOB_FAILED = None
                    if slurm_job_id:
                        try:
                            slurm_exit_code = get_slurm_job_status(slurm_job_id)
                        except ValueError as e:
                            # Job unknown to slurm -> treat as failed
                            slurm_exit_code = 1
                        if slurm_exit_code is not None:  # "None" indicates job is still running
                            JOB_FAILED = True
                    else:
                        if not psutil.pid_exists(process_id):
                            # Job did not write an exit code and is also not running
                            JOB_FAILED = True
                    if JOB_FAILED:
                        set_status = "FAILED"
                        error_text = ('No exit code found but job not running '
                                      'for {} / {}: setting status to {} in '
                                      'Charon'.format(label, workflow, set_status))
                        if slurm_job_id:
                            exit_code_file_path = \
                                create_exit_code_file_path(
                                    workflow_subtask=workflow,
                                    project_base_path=project_base_path,
                                    project_name=project_name,
                                    project_id=project_id,
                                    sample_id=sample_id)
                            error_text += (' (slurm job id "{}", exit code file path '
                                           '"{}")'.format(slurm_job_id,
                                                          exit_code_file_path))
                        LOG.error(error_text)
                        if not config.get('quiet'):
                            mail_analysis(project_name=project_name,
                                          sample_name=sample_id,
                                          engine_name=engine, level="ERROR",
                                          info_text=error_text,
                                          workflow=workflow)
                        if workflow == "merge_process_variantcall":
                            sample_status_field = "analysis_status"
                            seqrun_status_field = "alignment_status"
                        elif workflow == "genotype_concordance":
                            sample_status_field = seqrun_status_field = "genotype_status"
                        charon_session.sample_update(projectid=project_id,
                                                     sampleid=sample_id,
                                                     **{sample_status_field: set_status})
                        recurse_status_for_sample(project_obj,
                                                  status_field=seqrun_status_field,
                                                  status_value=set_status,
                                                  config=config)
                        # Job is only deleted if the Charon update succeeds
                        LOG.debug("Deleting local entry {}".format(sample_entry))
                        session.delete(sample_entry)
                    else:  # Job still running
                        set_status = "UNDER_ANALYSIS"
                        if workflow == "merge_process_variantcall":
                            sample_status_field = "analysis_status"
                            seqrun_status_field = "alignment_status"
                            recurse_status = "RUNNING"
                        elif workflow == "genotype_concordance":
                            sample_status_field = seqrun_status_field = "genotype_status"
                            recurse_status = "UNDER_ANALYSIS"
                        try:
                            # Reconcile Charon with the locally-known "running" state
                            remote_sample = charon_session.sample_get(
                                projectid=project_id, sampleid=sample_id)
                            charon_status = remote_sample.get(sample_status_field)
                            if charon_status and not charon_status == set_status:
                                LOG.warning('Tracking inconsistency for {}: Charon status '
                                            'for field "{}" is "{}" but local process tracking '
                                            'database indicates it is running. Setting value '
                                            'in Charon to {}.'.format(label,
                                                                      sample_status_field,
                                                                      charon_status,
                                                                      set_status))
                                charon_session.sample_update(projectid=project_id,
                                                             sampleid=sample_id,
                                                             **{sample_status_field: set_status})
                                recurse_status_for_sample(project_obj,
                                                          status_field=seqrun_status_field,
                                                          status_value=recurse_status,
                                                          config=config)
                        except CharonError as e:
                            error_text = ('Unable to update/verify Charon '
                                          'for {}: {}'.format(label, e))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_id,
                                              engine_name=engine,
                                              level="ERROR",
                                              workflow=workflow,
                                              info_text=error_text)
            except CharonError as e:
                error_text = ('Unable to update Charon for {}: '
                              '{}'.format(label, e))
                LOG.error(error_text)
                if not config.get('quiet'):
                    mail_analysis(project_name=project_name,
                                  sample_name=sample_id,
                                  engine_name=engine, level="ERROR",
                                  workflow=workflow, info_text=error_text)
            except OSError as e:
                error_text = ('Permissions error when trying to update Charon '
                              '"{}" status for "{}": {}'.format(workflow, label, e))
                LOG.error(error_text)
                if not config.get('quiet'):
                    mail_analysis(project_name=project_name,
                                  sample_name=sample_id,
                                  engine_name=engine, level="ERROR",
                                  workflow=workflow, info_text=error_text)
        session.commit()
    #Run Multiqc
    for pj_tuple in multiqc_projects:
        LOG.info("Running MultiQC on project {}".format(pj_tuple[1]))
        run_multiqc(pj_tuple[0], pj_tuple[1], pj_tuple[2])
def update_charon_with_local_jobs_status(quiet=False, config=None, config_file_path=None):
    """Check the status of all locally-tracked jobs and update Charon accordingly.

    Iterates over every SampleAnalysis record in the local jobs database and,
    per record, determines the job outcome from the piper exit code (and, as a
    fallback, slurm job status or the local process id). It then pushes the
    corresponding status to Charon at the sample level and recurses the status
    down to seqruns. Local records are deleted only after the Charon update
    succeeds, so failed updates are retried on the next invocation.

    :param bool quiet: If True, suppress notification e-mails (sets config['quiet']).
    :param dict config: Configuration dict; only the 'quiet' key is read here.
        NOTE(review): defaults to None but is dereferenced unconditionally below
        (`config.get(...)`) — callers appear expected to always supply it; confirm.
    :param str config_file_path: Accepted for CLI symmetry; not used directly
        in this body — TODO confirm it is consumed by a decorator/caller.
    """
    if quiet and not config.get("quiet"):
        config['quiet'] = True
    LOG.info("Updating Charon with the status of all locally-tracked jobs...")
    with get_db_session() as session:
        charon_session = CharonSession()
        for sample_entry in session.query(SampleAnalysis).all():
            # Local names
            workflow = sample_entry.workflow
            project_name = sample_entry.project_name
            project_id = sample_entry.project_id
            project_base_path = sample_entry.project_base_path
            sample_id = sample_entry.sample_id
            engine = sample_entry.engine
            # Only one of these id fields (slurm, pid) will have a value
            slurm_job_id = sample_entry.slurm_job_id
            process_id = sample_entry.process_id
            # Exit code written by piper on completion; None while still running
            # (or if it was never written -- distinguished further below)
            piper_exit_code = get_exit_code(workflow_name=workflow,
                                            project_base_path=project_base_path,
                                            project_name=project_name,
                                            project_id=project_id,
                                            sample_id=sample_id)
            label = "project/sample {}/{}".format(project_name, sample_id)
            if workflow not in ("merge_process_variantcall", "genotype_concordance",):
                LOG.error('Unknown workflow "{}" for {}; cannot update '
                          'Charon. Skipping sample.'.format(workflow, label))
                continue
            try:
                project_obj = create_project_obj_from_analysis_log(project_name,
                                                                   project_id,
                                                                   project_base_path,
                                                                   sample_id,
                                                                   workflow)
            except IOError as e:  # analysis log file is missing!
                error_text = ('Could not find analysis log file! Cannot update '
                              'Charon for {} run {}/{}: {}'.format(workflow,
                                                                   project_id,
                                                                   sample_id,
                                                                   e))
                LOG.error(error_text)
                if not config.get('quiet'):
                    mail_analysis(project_name=project_name, sample_name=sample_id,
                                  engine_name=engine, level="ERROR",
                                  info_text=error_text, workflow=workflow)
                continue
            try:
                if piper_exit_code == 0:
                    # 0 -> Job finished successfully
                    # Each workflow writes to different Charon status fields
                    if workflow == "merge_process_variantcall":
                        sample_status_field = "analysis_status"
                        seqrun_status_field = "alignment_status"
                        set_status = "ANALYZED"  # sample level
                    elif workflow == "genotype_concordance":
                        sample_status_field = seqrun_status_field = "genotype_status"
                        set_status = "DONE"  # sample level
                    recurse_status = "DONE"  # For the seqrun level
                    info_text = ('Workflow "{}" for {} finished succesfully. '
                                 'Recording status {} in Charon'.format(workflow,
                                                                        label,
                                                                        set_status))
                    LOG.info(info_text)
                    if not config.get('quiet'):
                        mail_analysis(project_name=project_name, sample_name=sample_id,
                                      engine_name=engine, level="INFO",
                                      info_text=info_text, workflow=workflow)
                    charon_session.sample_update(projectid=project_id,
                                                 sampleid=sample_id,
                                                 **{sample_status_field: set_status})
                    recurse_status_for_sample(project_obj,
                                              status_field=seqrun_status_field,
                                              status_value=recurse_status,
                                              config=config)
                    # Job is only deleted if the Charon status update succeeds
                    session.delete(sample_entry)
                    # run MultiQC (best-effort: a MultiQC failure must not
                    # prevent the remaining post-processing below)
                    LOG.info("Running MultiQC on project {}".format(project_name))
                    try:
                        run_multiqc(project_base_path, project_id, project_name)
                    except Exception as e:
                        LOG.error(e)
                    if workflow == "merge_process_variantcall":
                        # Parse seqrun output results / update Charon
                        # This is a semi-optional step -- failure here will send an
                        # email but not more than once. The record is still removed
                        # from the local jobs database, so this will have to be done
                        # manually if you want it done at all.
                        piper_qc_dir = os.path.join(project_base_path, "ANALYSIS",
                                                    project_id, "piper_ngi",
                                                    "02_preliminary_alignment_qc")
                        update_coverage_for_sample_seqruns(project_id, sample_id,
                                                           piper_qc_dir)
                        update_sample_duplication_and_coverage(project_id, sample_id,
                                                               project_base_path)
                    elif workflow == "genotype_concordance":
                        piper_gt_dir = os.path.join(project_base_path, "ANALYSIS",
                                                    project_id, "piper_ngi",
                                                    "03_genotype_concordance")
                        try:
                            update_gtc_for_sample(project_id, sample_id, piper_gt_dir)
                        except (CharonError, IOError, ValueError) as e:
                            LOG.error(e)
                elif type(piper_exit_code) is int and piper_exit_code > 0:
                    # 1 -> Job failed
                    set_status = "FAILED"
                    error_text = ('Workflow "{}" for {} failed. Recording status '
                                  '{} in Charon.'.format(workflow, label, set_status))
                    LOG.error(error_text)
                    if not config.get('quiet'):
                        mail_analysis(project_name=project_name, sample_name=sample_id,
                                      engine_name=engine, level="ERROR",
                                      info_text=error_text, workflow=workflow)
                    if workflow == "merge_process_variantcall":
                        sample_status_field = "analysis_status"
                        seqrun_status_field = "alignment_status"
                    elif workflow == "genotype_concordance":
                        sample_status_field = seqrun_status_field = "genotype_status"
                    charon_session.sample_update(projectid=project_id,
                                                 sampleid=sample_id,
                                                 **{sample_status_field: set_status})
                    recurse_status_for_sample(project_obj,
                                              status_field=seqrun_status_field,
                                              status_value=set_status, config=config)
                    # Job is only deleted if the Charon update succeeds
                    session.delete(sample_entry)
                else:
                    # None -> Job still running OR exit code was never written (failure)
                    JOB_FAILED = None
                    if slurm_job_id:
                        try:
                            slurm_exit_code = get_slurm_job_status(slurm_job_id)
                        except ValueError as e:
                            # Unknown to slurm -> treat as failed (non-None code)
                            slurm_exit_code = 1
                        if slurm_exit_code is not None:  # "None" indicates job is still running
                            JOB_FAILED = True
                    else:
                        # No slurm id -> job was a local process; check its pid
                        if not psutil.pid_exists(process_id):
                            # Job did not write an exit code and is also not running
                            JOB_FAILED = True
                    if JOB_FAILED:
                        set_status = "FAILED"
                        error_text = ('No exit code found but job not running '
                                      'for {} / {}: setting status to {} in '
                                      'Charon'.format(label, workflow, set_status))
                        if slurm_job_id:
                            exit_code_file_path = \
                                create_exit_code_file_path(workflow_subtask=workflow,
                                                           project_base_path=project_base_path,
                                                           project_name=project_name,
                                                           project_id=project_id,
                                                           sample_id=sample_id)
                            error_text += (' (slurm job id "{}", exit code file path '
                                           '"{}")'.format(slurm_job_id,
                                                          exit_code_file_path))
                        LOG.error(error_text)
                        if not config.get('quiet'):
                            mail_analysis(project_name=project_name, sample_name=sample_id,
                                          engine_name=engine, level="ERROR",
                                          info_text=error_text, workflow=workflow)
                        if workflow == "merge_process_variantcall":
                            sample_status_field = "analysis_status"
                            seqrun_status_field = "alignment_status"
                        elif workflow == "genotype_concordance":
                            sample_status_field = seqrun_status_field = "genotype_status"
                        charon_session.sample_update(projectid=project_id,
                                                     sampleid=sample_id,
                                                     **{sample_status_field: set_status})
                        recurse_status_for_sample(project_obj,
                                                  status_field=seqrun_status_field,
                                                  status_value=set_status, config=config)
                        # Job is only deleted if the Charon update succeeds
                        LOG.debug("Deleting local entry {}".format(sample_entry))
                        session.delete(sample_entry)
                    else:  # Job still running
                        set_status = "UNDER_ANALYSIS"
                        if workflow == "merge_process_variantcall":
                            sample_status_field = "analysis_status"
                            seqrun_status_field = "alignment_status"
                            recurse_status = "RUNNING"
                        elif workflow == "genotype_concordance":
                            sample_status_field = seqrun_status_field = "genotype_status"
                            recurse_status = "UNDER_ANALYSIS"
                        try:
                            # Cross-check: Charon should also say the sample is
                            # under analysis; if not, overwrite with local truth
                            charon_status = \
                                charon_session.sample_get(projectid=project_id,
                                                          sampleid=sample_id).get(sample_status_field)
                            if charon_status and not charon_status == set_status:
                                # NOTE(review): LOG.warn is deprecated in favor of
                                # LOG.warning (stdlib logging)
                                LOG.warn('Tracking inconsistency for {}: Charon status '
                                         'for field "{}" is "{}" but local process tracking '
                                         'database indicates it is running. Setting value '
                                         'in Charon to {}.'.format(label,
                                                                   sample_status_field,
                                                                   charon_status,
                                                                   set_status))
                                charon_session.sample_update(projectid=project_id,
                                                             sampleid=sample_id,
                                                             **{sample_status_field: set_status})
                                recurse_status_for_sample(project_obj,
                                                          status_field=seqrun_status_field,
                                                          status_value=recurse_status,
                                                          config=config)
                        except CharonError as e:
                            error_text = ('Unable to update/verify Charon '
                                          'for {}: {}'.format(label, e))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name, sample_name=sample_id,
                                              engine_name=engine, level="ERROR",
                                              workflow=workflow, info_text=error_text)
            except CharonError as e:
                error_text = ('Unable to update Charon for {}: '
                              '{}'.format(label, e))
                LOG.error(error_text)
                if not config.get('quiet'):
                    mail_analysis(project_name=project_name, sample_name=sample_id,
                                  engine_name=engine, level="ERROR",
                                  workflow=workflow, info_text=error_text)
            except OSError as e:
                error_text = ('Permissions error when trying to update Charon '
                              '"{}" status for "{}": {}'.format(workflow, label, e))
                LOG.error(error_text)
                if not config.get('quiet'):
                    mail_analysis(project_name=project_name, sample_name=sample_id,
                                  engine_name=engine, level="ERROR",
                                  workflow=workflow, info_text=error_text)
        # Single commit so all deletions land atomically after the loop
        session.commit()
from ngi_pipeline.database.communicate import get_project_id_from_name

if __name__ == "__main__":
    # CLI helper: exit 0 if the total autosomal coverage recorded in Charon for
    # the given project/sample meets the required threshold, exit 1 otherwise.
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--project", required=True)
    parser.add_argument("-s", "--sample", required=True)
    parser.add_argument("-c", "--coverage", type=int, required=True,
                        dest="required_coverage")
    args = parser.parse_args()
    project = args.project
    sample = args.sample
    required_coverage = args.required_coverage
    charon_session = CharonSession()
    try:
        # "or 0" guards against the field being absent or null in the Charon
        # record, which would otherwise make int(reported_coverage) raise
        # TypeError instead of exiting with status 1
        reported_coverage = charon_session.sample_get(project, sample).get(
            "total_autosomal_coverage") or 0
    except CharonError as e:
        # The lookup may have failed because the user passed a project *name*
        # rather than a project id; resolve the id and retry once.
        try:
            project = get_project_id_from_name(project)
        except (CharonError, RuntimeError, ValueError) as e:
            print(('ERROR: Could not determine coverage for project {} / sample '
                   '{}: {}'.format(project, sample, e)), file=sys.stderr)
            reported_coverage = 0
        else:
            reported_coverage = charon_session.sample_get(project, sample).get(
                "total_autosomal_coverage") or 0
    if int(reported_coverage) >= int(required_coverage):
        sys.exit(0)
    else:
        sys.exit(1)