def get_finished_seqruns_for_sample(project_id, sample_id,
                                    include_failed_libpreps=False):
    """Find all the finished seqruns for a particular sample.

    A seqrun counts as finished when the "alignment_status" of its Charon
    record is "DONE". Libpreps that contribute no finished seqrun do not
    appear in the result.

    :param str project_id: The id of the project
    :param str sample_id: The id of the sample
    :param bool include_failed_libpreps: Also consider libpreps whose "qc"
                                         field is "FAILED" (default False)

    :returns: A dict of {libprep_01: [seqrun_01, ..., seqrun_nn], ...}
    :rtype: dict
    """
    charon_session = CharonSession()
    sample_libpreps = charon_session.sample_get_libpreps(projectid=project_id,
                                                         sampleid=sample_id)
    libpreps = collections.defaultdict(list)
    for libprep in sample_libpreps['libpreps']:
        if libprep.get('qc') == "FAILED" and not include_failed_libpreps:
            # Report the libprep id, not the whole Charon record
            LOG.info('Skipping libprep "{}" due to qc status '
                     '"{}"'.format(libprep.get('libprepid'), libprep.get("qc")))
            continue
        libprep_id = libprep['libprepid']
        seqruns = charon_session.libprep_get_seqruns(projectid=project_id,
                                                     sampleid=sample_id,
                                                     libprepid=libprep_id)['seqruns']
        for seqrun in seqruns:
            seqrun_id = seqrun['seqrunid']
            # The status is read from the individual seqrun record
            aln_status = charon_session.seqrun_get(projectid=project_id,
                                                   sampleid=sample_id,
                                                   libprepid=libprep_id,
                                                   seqrunid=seqrun_id).get('alignment_status')
            if aln_status == "DONE":
                libpreps[libprep_id].append(seqrun_id)
            else:
                LOG.debug('Skipping seqrun "{}" due to alignment_status '
                          '"{}"'.format(seqrun_id, aln_status))
    # Hand back a plain dict so that lookups of unknown libpreps raise KeyError
    return dict(libpreps)
def update_analysis(project_id, status):
    """Record the outcome of an rna_ngi analysis run in Charon.

    Sends a notification through mail_analysis, then marks every sample
    that is currently "UNDER_ANALYSIS" as finished or failed. For those
    samples, every seqrun that is "RUNNING" under a libprep whose qc is
    not "FAILED" is updated as well.

    :param str project_id: The id of the project
    :param status: Truthy if the analysis succeeded, falsy if it failed
    """
    session = CharonSession()
    if status:
        mail_level = 'INFO'
        sample_outcome = 'ANALYZED'
        seqrun_outcome = 'DONE'
    else:
        mail_level = 'ERROR'
        sample_outcome = 'FAILED'
        seqrun_outcome = 'FAILED'
    mail_analysis(project_id, engine_name='rna_ngi', level=mail_level)

    project_samples = session.project_get_samples(project_id).get("samples", {})
    for sample_record in project_samples:
        # Only samples that were being analysed are touched
        if sample_record.get('analysis_status') != "UNDER_ANALYSIS":
            continue
        sample_id = sample_record.get('sampleid')
        LOG.info("Marking analysis of sample {}/{} as {}".format(
            project_id, sample_id, sample_outcome))
        session.sample_update(project_id, sample_id,
                              analysis_status=sample_outcome)

        sample_libpreps = session.sample_get_libpreps(
            project_id, sample_id).get('libpreps', {})
        for libprep_record in sample_libpreps:
            if libprep_record.get('qc') == 'FAILED':
                continue
            libprep_id = libprep_record.get('libprepid')
            libprep_seqruns = session.libprep_get_seqruns(
                project_id, sample_id, libprep_id).get('seqruns', {})
            for seqrun_record in libprep_seqruns:
                if seqrun_record.get('alignment_status') != "RUNNING":
                    continue
                seqrun_id = seqrun_record.get('seqrunid')
                LOG.info(
                    "Marking analysis of seqrun {}/{}/{}/{} as {}".
                    format(project_id, sample_id, libprep_id, seqrun_id,
                           seqrun_outcome))
                session.seqrun_update(project_id, sample_id, libprep_id,
                                      seqrun_id,
                                      alignment_status=seqrun_outcome)
def recreate_project_from_db(analysis_top_dir, project_name, project_id):
    """Rebuild an NGIProject object tree from the records stored in Charon.

    Every sample, libprep and seqrun listed in Charon for the project is
    added to the project object. Each created object gets a ``status``
    attribute copied from the "status" key of its Charon record
    ("unknown" when the key is absent). The filesystem is not touched.

    :param str analysis_top_dir: Used as base_path of the project object
    :param str project_name: Used as name and dirname of the project object
    :param str project_id: The id of the project in Charon

    :returns: The populated project object
    :rtype: NGIProject
    :raises RuntimeError: If Charon cannot be queried for samples,
                          libpreps or seqruns
    """
    project_obj = NGIProject(name=project_name, dirname=project_name,
                             project_id=project_id,
                             base_path=analysis_top_dir)
    charon_session = CharonSession()
    try:
        samples = charon_session.project_get_samples(project_id)["samples"]
    except CharonError as e:
        raise RuntimeError("Could not access samples for project {}: {}".format(project_id, e))
    for sample in samples:
        sample_id = sample.get("sampleid")
        sample_obj = project_obj.add_sample(name=sample_id, dirname=sample_id)
        sample_obj.status = sample.get("status", "unknown")
        try:
            libpreps = charon_session.sample_get_libpreps(project_id, sample_id)["libpreps"]
        except CharonError as e:
            raise RuntimeError("Could not access libpreps for project {} / sample {}: {}".format(project_id,sample_id, e))
        for libprep in libpreps:
            libprep_id = libprep.get("libprepid")
            libprep_obj = sample_obj.add_libprep(name=libprep_id, dirname=libprep_id)
            libprep_obj.status = libprep.get("status", "unknown")
            try:
                seqruns = charon_session.libprep_get_seqruns(project_id, sample_id, libprep_id)["seqruns"]
            except CharonError as e:
                raise RuntimeError("Could not access seqruns for project {} / sample {} / "
                                   "libprep {}: {}".format(project_id, sample_id, libprep_id, e))
            for seqrun in seqruns:
                # e.g. 140528_D00415_0049_BC423WACXX
                seqrun_id = seqrun.get("seqrunid")
                seqrun_obj = libprep_obj.add_seqrun(name=seqrun_id, dirname=seqrun_id)
                seqrun_obj.status = seqrun.get("status", "unknown")
    return project_obj
def check_for_preexisting_sample_runs(project_obj, sample_obj,
                                      restart_running_jobs,
                                      restart_finished_jobs):
    """Refuse to proceed when a seqrun of the sample is already running or done.

    The alignment_status of every seqrun of every libprep of the sample is
    looked up in Charon. A "RUNNING" seqrun is only tolerated when
    restart_running_jobs is set; a "DONE" seqrun is only tolerated when
    restart_finished_jobs is set.

    :param NGIProject project_obj: The project object
    :param NGISample sample_obj: The sample object
    :param boolean restart_running_jobs: command line parameter
    :param boolean restart_finished_jobs: command line parameter

    :raises RuntimeError: If a seqrun is RUNNING or DONE and the matching
                          flag does not allow the analysis to continue
    """
    project_id = project_obj.project_id
    sample_id = sample_obj.name
    session = CharonSession()
    libprep_records = session.sample_get_libpreps(projectid=project_id,
                                                  sampleid=sample_id)['libpreps']
    for libprep_record in libprep_records:
        libprep_id = libprep_record['libprepid']
        seqrun_records = session.libprep_get_seqruns(projectid=project_id,
                                                     sampleid=sample_id,
                                                     libprepid=libprep_id)['seqruns']
        for seqrun_record in seqrun_records:
            seqrun_id = seqrun_record['seqrunid']
            seqrun_details = session.seqrun_get(projectid=project_id,
                                                sampleid=sample_id,
                                                libprepid=libprep_id,
                                                seqrunid=seqrun_id)
            aln_status = seqrun_details.get('alignment_status')
            # Decide per status whether the matching restart flag permits a rerun
            if aln_status == "RUNNING":
                blocked = not restart_running_jobs
            elif aln_status == "DONE":
                blocked = not restart_finished_jobs
            else:
                blocked = False
            if blocked:
                raise RuntimeError('Project/Sample "{}/{}" has a preexisting '
                                   'seqrun "{}" with status "{}"'.format(project_obj,
                                                                         sample_obj,
                                                                         seqrun_id,
                                                                         aln_status))
def get_valid_seqruns_for_sample(project_id, sample_id,
                                 include_failed_libpreps=False,
                                 include_done_seqruns=False,
                                 status_field="alignment_status"):
    """Find all the valid seqruns for a particular sample.

    A seqrun is valid when the value of ``status_field`` in its Charon
    record is anything other than "DONE", or when include_done_seqruns is
    set. A seqrun whose record lacks the field is logged and treated as
    valid. Libpreps that contribute no valid seqrun do not appear in the
    result.

    :param str project_id: The id of the project
    :param str sample_id: The id of the sample
    :param bool include_failed_libpreps: Include seqruns for libreps that have failed QC
    :param bool include_done_seqruns: Include seqruns that are already marked DONE
    :param str status_field: The seqrun field to inspect; one of
                             "alignment_status" or "genotype_status"

    :returns: A dict of {libprep_01: [seqrun_01, ..., seqrun_nn], ...}
    :rtype: dict

    :raises ValueError: If status_field is not a valid value
    """
    valid_status_values = (
        "alignment_status",
        "genotype_status",
    )
    # Validate before any Charon access so a bad argument fails fast
    if status_field not in valid_status_values:
        raise ValueError('"status_field" argument must be one of {} '
                         '(value passed was "{}")'.format(
                             ", ".join(valid_status_values), status_field))
    charon_session = CharonSession()
    sample_libpreps = charon_session.sample_get_libpreps(projectid=project_id,
                                                         sampleid=sample_id)
    libpreps = collections.defaultdict(list)
    for libprep in sample_libpreps['libpreps']:
        if libprep.get('qc') == "FAILED" and not include_failed_libpreps:
            # Report the libprep id, not the whole Charon record
            LOG.info('Skipping libprep "{}" due to qc status '
                     '"{}"'.format(libprep.get('libprepid'), libprep.get("qc")))
            continue
        libprep_id = libprep['libprepid']
        for seqrun in charon_session.libprep_get_seqruns(
                projectid=project_id, sampleid=sample_id,
                libprepid=libprep_id)['seqruns']:
            seqrun_id = seqrun['seqrunid']
            try:
                seqrun_status = charon_session.seqrun_get(
                    projectid=project_id, sampleid=sample_id,
                    libprepid=libprep_id, seqrunid=seqrun_id)[status_field]
            except KeyError:
                LOG.error(
                    'Field "{}" not available for seqrun "{}" in Charon '
                    'for project "{}" / sample "{}". Including as '
                    'valid.'.format(status_field, seqrun_id, project_id,
                                    sample_id))
                seqrun_status = None
            if seqrun_status != "DONE" or include_done_seqruns:
                libpreps[libprep_id].append(seqrun_id)
            else:
                # Space added between the field name and the quoted value
                LOG.info('Skipping seqrun "{}" due to {} '
                         '"{}"'.format(seqrun_id, status_field, seqrun_status))
    return dict(libpreps)
def reset_charon_records_by_name(project_id, restrict_to_samples=None,
                                 restrict_to_libpreps=None, restrict_to_seqruns=None):
    """Reset the Charon records of a project and of the entities below it.

    The project record itself is always reset. Samples, libpreps and
    seqruns are then reset level by level; an entity that is filtered out
    by one of the ``restrict_to_*`` lists is skipped together with
    everything below it.

    :param str project_id: The id of the project
    :param list restrict_to_samples: Only reset these sample ids
                                     (empty/None means all samples)
    :param list restrict_to_libpreps: Only reset these libprep ids
                                      (empty/None means all libpreps)
    :param list restrict_to_seqruns: Only reset these seqrun ids
                                     (empty/None means all seqruns)
    """
    restrict_to_samples = restrict_to_samples or []
    restrict_to_libpreps = restrict_to_libpreps or []
    restrict_to_seqruns = restrict_to_seqruns or []
    charon_session = CharonSession()
    LOG.info("Resetting Charon record for project {}".format(project_id))
    charon_session.project_reset(projectid=project_id)
    LOG.info("Charon record for project {} reset".format(project_id))
    for sample in charon_session.project_get_samples(projectid=project_id).get('samples', []):
        sample_id = sample['sampleid']
        if restrict_to_samples and sample_id not in restrict_to_samples:
            LOG.info("Skipping project/sample {}/{}: not in list of samples to use "
                     "({})".format(project_id, sample_id, ", ".join(restrict_to_samples)))
            continue
        LOG.info("Resetting Charon record for project/sample {}/{}".format(project_id,
                                                                           sample_id))
        charon_session.sample_reset(projectid=project_id, sampleid=sample_id)
        LOG.info("Charon record for project/sample {}/{} reset".format(project_id,
                                                                       sample_id))
        for libprep in charon_session.sample_get_libpreps(projectid=project_id,
                                                          sampleid=sample_id).get('libpreps', []):
            libprep_id = libprep['libprepid']
            if restrict_to_libpreps and libprep_id not in restrict_to_libpreps:
                LOG.info("Skipping project/sample/libprep {}/{}/{}: not in list "
                         "of libpreps to use ({})".format(project_id, sample_id,
                                                          libprep_id,
                                                          ", ".join(restrict_to_libpreps)))
                continue
            # The two literals used to join into "project/samplelibprep"
            LOG.info("Resetting Charon record for project/sample/"
                     "libprep {}/{}/{}".format(project_id, sample_id, libprep_id))
            charon_session.libprep_reset(projectid=project_id, sampleid=sample_id,
                                         libprepid=libprep_id)
            LOG.info("Charon record for project/sample/libprep {}/{}/{} "
                     "reset".format(project_id, sample_id, libprep_id))
            for seqrun in charon_session.libprep_get_seqruns(projectid=project_id,
                                                             sampleid=sample_id,
                                                             libprepid=libprep_id).get('seqruns', []):
                seqrun_id = seqrun['seqrunid']
                if restrict_to_seqruns and seqrun_id not in restrict_to_seqruns:
                    LOG.info("Skipping project/sample/libprep/seqrun {}/{}/{}/{}: "
                             "not in list of seqruns to use ({})".format(project_id,
                                                                         sample_id,
                                                                         libprep_id,
                                                                         seqrun_id,
                                                                         ", ".join(restrict_to_seqruns)))
                    continue
                LOG.info("Resetting Charon record for project/sample/libprep/"
                         "seqrun {}/{}/{}/{}".format(project_id, sample_id,
                                                     libprep_id, seqrun_id))
                charon_session.seqrun_reset(projectid=project_id, sampleid=sample_id,
                                            libprepid=libprep_id, seqrunid=seqrun_id)
                LOG.info("Charon record for project/sample/libprep/seqrun "
                         "{}/{}/{}/{} reset".format(project_id, sample_id,
                                                    libprep_id, seqrun_id))
def get_valid_seqruns_for_sample(project_id, sample_id,
                                 include_failed_libpreps=False,
                                 include_done_seqruns=False,
                                 status_field="alignment_status"):
    """Find all the valid seqruns for a particular sample.

    A seqrun is valid when the value of ``status_field`` in its Charon
    record is anything other than "DONE", or when include_done_seqruns is
    set. A seqrun whose record lacks the field is logged and treated as
    valid. Libpreps that contribute no valid seqrun do not appear in the
    result.

    :param str project_id: The id of the project
    :param str sample_id: The id of the sample
    :param bool include_failed_libpreps: Include seqruns for libreps that have failed QC
    :param bool include_done_seqruns: Include seqruns that are already marked DONE
    :param str status_field: The seqrun field to inspect; one of
                             "alignment_status" or "genotype_status"

    :returns: A dict of {libprep_01: [seqrun_01, ..., seqrun_nn], ...}
    :rtype: dict

    :raises ValueError: If status_field is not a valid value
    """
    valid_status_values = ("alignment_status", "genotype_status",)
    # Validate before any Charon access so a bad argument fails fast
    if status_field not in valid_status_values:
        raise ValueError('"status_field" argument must be one of {} '
                         '(value passed was "{}")'.format(", ".join(valid_status_values),
                                                          status_field))
    charon_session = CharonSession()
    sample_libpreps = charon_session.sample_get_libpreps(projectid=project_id,
                                                         sampleid=sample_id)
    libpreps = collections.defaultdict(list)
    for libprep in sample_libpreps['libpreps']:
        if libprep.get('qc') == "FAILED" and not include_failed_libpreps:
            # Report the libprep id, not the whole Charon record
            LOG.info('Skipping libprep "{}" due to qc status '
                     '"{}"'.format(libprep.get('libprepid'), libprep.get("qc")))
            continue
        libprep_id = libprep['libprepid']
        for seqrun in charon_session.libprep_get_seqruns(projectid=project_id,
                                                         sampleid=sample_id,
                                                         libprepid=libprep_id)['seqruns']:
            seqrun_id = seqrun['seqrunid']
            try:
                seqrun_status = charon_session.seqrun_get(projectid=project_id,
                                                          sampleid=sample_id,
                                                          libprepid=libprep_id,
                                                          seqrunid=seqrun_id)[status_field]
            except KeyError:
                LOG.error('Field "{}" not available for seqrun "{}" in Charon '
                          'for project "{}" / sample "{}". Including as '
                          'valid.'.format(status_field, seqrun_id,
                                          project_id, sample_id))
                seqrun_status = None
            if seqrun_status != "DONE" or include_done_seqruns:
                libpreps[libprep_id].append(seqrun_id)
            else:
                # Space added between the field name and the quoted value
                LOG.info('Skipping seqrun "{}" due to {} '
                         '"{}"'.format(seqrun_id, status_field, seqrun_status))
    return dict(libpreps)
def main(project):
    """Put every sample and seqrun of a project back into a not-yet-analysed state.

    Each sample is updated to analysis_status "TO_ANALYZE" with its
    genotype status cleared and its coverage/read totals set to "0"; each
    seqrun below it is updated to alignment_status "NOT_RUNNING" with a
    mean autosomal coverage of "0".

    :param project: The project identifier handed to the Charon session calls
    """
    session = CharonSession()
    project_samples = session.project_get_samples(project)
    for sample_record in project_samples["samples"]:
        sample_id = sample_record["sampleid"]
        session.sample_update(project, sample_id,
                              analysis_status="TO_ANALYZE",
                              genotype_status=None,
                              total_autosomal_coverage="0",
                              total_sequenced_reads="0")
        libprep_records = session.sample_get_libpreps(project, sample_id)['libpreps']
        for libprep_record in libprep_records:
            libprep_id = libprep_record["libprepid"]
            seqrun_records = session.libprep_get_seqruns(project, sample_id,
                                                         libprep_id)['seqruns']
            for seqrun_record in seqrun_records:
                session.seqrun_update(project, sample_id, libprep_id,
                                      seqrun_record["seqrunid"],
                                      mean_autosomal_coverage="0",
                                      alignment_status="NOT_RUNNING")
def determine_library_prep_from_fcid(project_id, sample_name, fcid):
    """Use the information in the database to get the library prep id
    from the project name, sample name, and flowcell id.

    Every libprep of the sample is inspected; the first libprep that has a
    seqrun whose "seqrunid" equals ``fcid`` is returned. Libpreps without
    seqruns are skipped rather than ending the search.

    :param str project_id: The ID of the project
    :param str sample_name: The name of the sample
    :param str fcid: The flowcell ID

    :returns: The library prep (e.g. "A")
    :rtype str
    :raises ValueError: If no match was found, or if Charon could not be
                        queried.
    """
    charon_session = CharonSession()
    try:
        libpreps = charon_session.sample_get_libpreps(project_id, sample_name)['libpreps']
        for libprep in libpreps or []:
            # Get the sequencing runs and see if they match the FCID we have
            seqruns = charon_session.libprep_get_seqruns(project_id,
                                                         sample_name,
                                                         libprep['libprepid'])['seqruns']
            for seqrun in seqruns or []:
                if seqrun["seqrunid"] == fcid:
                    ## BUG if we have one sample with two libpreps on the same flowcell,
                    ##     this just picks the first one it encounters; instead,
                    ##     it should raise an Exception. Requires restructuring.
                    return libprep['libprepid']
    except CharonError as e:
        # A 404 from Charon is reported the same way as "nothing matched"
        if e.status_code != 404:
            raise ValueError('Could not determine library prep for project "{}" '
                             '/ sample "{}" / fcid "{}": {}'.format(project_id,
                                                                    sample_name,
                                                                    fcid, e))
    raise ValueError('No library prep found for project "{}" / sample "{}" '
                     '/ fcid "{}"'.format(project_id, sample_name, fcid))
def determine_library_prep_from_fcid(project_id, sample_name, fcid):
    """Use the information in the database to get the library prep id
    from the project name, sample name, and flowcell id.

    Every libprep of the sample is inspected; the first libprep that has a
    seqrun whose "seqrunid" equals ``fcid`` is returned. Libpreps without
    seqruns are skipped rather than ending the search.

    :param str project_id: The ID of the project
    :param str sample_name: The name of the sample
    :param str fcid: The flowcell ID

    :returns: The library prep (e.g. "A")
    :rtype str
    :raises ValueError: If no match was found, or if Charon could not be
                        queried.
    """
    charon_session = CharonSession()
    try:
        libpreps = charon_session.sample_get_libpreps(project_id, sample_name)["libpreps"]
        for libprep in libpreps or []:
            # Get the sequencing runs and see if they match the FCID we have
            seqruns = charon_session.libprep_get_seqruns(
                project_id, sample_name, libprep["libprepid"]
            )["seqruns"]
            for seqrun in seqruns or []:
                if seqrun["seqrunid"] == fcid:
                    ## BUG if we have one sample with two libpreps on the same flowcell,
                    ##     this just picks the first one it encounters; instead,
                    ##     it should raise an Exception. Requires restructuring.
                    return libprep["libprepid"]
    except CharonError as e:
        # A 404 from Charon is reported the same way as "nothing matched"
        if e.status_code != 404:
            raise ValueError(
                'Could not determine library prep for project "{}" '
                '/ sample "{}" / fcid "{}": {}'.format(project_id, sample_name, fcid, e)
            )
    raise ValueError(
        'No library prep found for project "{}" / sample "{}" '
        '/ fcid "{}"'.format(project_id, sample_name, fcid)
    )
def determine_library_prep_from_fcid(project_id, sample_name, fcid):
    """Use the information in the database to get the library prep id
    from the project name, sample name, and flowcell id.

    Every libprep of the sample is inspected; the first libprep that has a
    seqrun whose "seqrunid" equals ``fcid`` is returned. A libprep whose
    seqruns do not match (or that has none) no longer ends the search, so
    later libpreps are checked too.

    :param str project_id: The ID of the project
    :param str sample_name: The name of the sample
    :param str fcid: The flowcell ID

    :returns: The library prep (e.g. "A")
    :rtype str
    :raises ValueError: If no match was found, or if Charon could not be
                        queried.
    """
    charon_session = CharonSession()
    try:
        libpreps = charon_session.sample_get_libpreps(project_id, sample_name)['libpreps']
        for libprep in libpreps or []:
            # Get the sequencing runs and see if they match the FCID we have
            seqruns = charon_session.libprep_get_seqruns(project_id,
                                                         sample_name,
                                                         libprep['libprepid'])['seqruns']
            for seqrun in seqruns or []:
                if seqrun["seqrunid"] == fcid:
                    # NOTE: if several libpreps of the sample were run on
                    # this flowcell, the first one encountered wins.
                    return libprep['libprepid']
    except CharonError as e:
        # A 404 from Charon is reported the same way as "nothing matched"
        if e.status_code != 404:
            raise ValueError('Could not determine library prep for project "{}" '
                             '/ sample "{}" / fcid "{}": {}'.format(project_id,
                                                                    sample_name,
                                                                    fcid, e))
    raise ValueError('No library prep found for project "{}" / sample "{}" '
                     '/ fcid "{}"'.format(project_id, sample_name, fcid))
def main(project):
    """Reset the analysis bookkeeping in Charon for a whole project.

    Samples are set to analysis_status "TO_ANALYZE" (genotype status
    cleared, coverage and read totals "0"); all of their seqruns are set
    to alignment_status "NOT_RUNNING" with mean autosomal coverage "0".

    :param project: The project identifier handed to the Charon session calls
    """
    # Field values written to every sample / seqrun record
    sample_fields = dict(analysis_status="TO_ANALYZE",
                         genotype_status=None,
                         total_autosomal_coverage="0",
                         total_sequenced_reads="0")
    seqrun_fields = dict(mean_autosomal_coverage="0",
                         alignment_status="NOT_RUNNING")

    charon = CharonSession()
    for entry in charon.project_get_samples(project)["samples"]:
        charon.sample_update(project, entry["sampleid"], **sample_fields)
        preps = charon.sample_get_libpreps(project, entry["sampleid"])['libpreps']
        for prep in preps:
            runs = charon.libprep_get_seqruns(project,
                                              entry["sampleid"],
                                              prep["libprepid"])['seqruns']
            for run in runs:
                charon.seqrun_update(project,
                                     entry["sampleid"],
                                     prep["libprepid"],
                                     run["seqrunid"],
                                     **seqrun_fields)
def setup_analysis_directory_structure(fc_dir, projects_to_analyze,
                                       restrict_to_projects=None, restrict_to_samples=None,
                                       create_files=True,
                                       fallback_libprep=None,
                                       quiet=False,
                                       config=None, config_file_path=None):
    """
    Copy and sort files from their CASAVA-demultiplexed flowcell structure
    into their respective project/sample/libPrep/FCIDs. This collects samples
    split across multiple flowcells.

    The library prep of each fastq file is looked up in the sample sheet
    first, then in Charon by flowcell id, then taken from Charon if the
    sample has exactly one libprep, and finally from ``fallback_libprep``.
    Files for which none of these works are skipped.

    :param str fc_dir: The directory created by CASAVA for this flowcell.
    :param dict projects_to_analyze: Project objects keyed by their DATA
                                     directory (may be empty); updated in place
    :param list restrict_to_projects: Specific projects within the flowcell to process exclusively
    :param list restrict_to_samples: Specific samples within the flowcell to process exclusively
    :param bool create_files: Alter the filesystem (as opposed to just parsing flowcells) (default True)
    :param str fallback_libprep: If libprep cannot be determined, use this value if supplied (default None)
    :param bool quiet: Stored in config["quiet"]; when set, no error mails are sent
    :param dict config: The parsed configuration file (written to, so it
                        must be a mutable mapping)
    :param str config_file_path: Not used by this function

    :returns: ``projects_to_analyze`` with the projects of this flowcell
              added, or an empty list if the flowcell directory is
              missing or cannot be parsed
    :rtype: dict (list on early exit)

    :raises KeyError: If a required configuration key is not available.
    :raises RuntimeError: If the flowcell root cannot be derived from fc_dir
    """
    LOG.info("Setting up analysis for demultiplexed data in source folder \"{}\"".format(fc_dir))
    if not restrict_to_projects:
        restrict_to_projects = []
    if not restrict_to_samples:
        restrict_to_samples = []
    config["quiet"] = quiet # Hack because I enter here from a script sometimes
    # Everything up to and including the configured root is the flowcell root
    root_pattern = r"(.+(?:{}|{}))\/.+".format(config["analysis"]["sthlm_root"],
                                               config["analysis"]["upps_root"])
    matches = re.match(root_pattern, fc_dir)
    if not matches:
        error_msg = "cannot guess which project the flowcell {} belongs to".format(fc_dir)
        LOG.error(error_msg)
        raise RuntimeError(error_msg)
    flowcell_root = matches.group(1)

    analysis_top_dir = os.path.abspath(os.path.join(flowcell_root, config["analysis"]["top_dir"]))
    try:
        safe_makedir(analysis_top_dir)
    except OSError:
        # Best effort: the failure is logged and processing continues
        LOG.error('Error: Analysis top directory {} does not exist and could not '
                  'be created.'.format(analysis_top_dir))
    fc_dir = fc_dir if os.path.isabs(fc_dir) else os.path.join(analysis_top_dir, fc_dir)
    if not os.path.exists(fc_dir):
        LOG.error("Error: Flowcell directory {} does not exist".format(fc_dir))
        return []
    # Map the directory structure for this flowcell
    try:
        fc_dir_structure = parse_flowcell(fc_dir)
    except (OSError, ValueError) as e:
        LOG.error("Error when processing flowcell dir \"{}\": {}".format(fc_dir, e))
        return []
    fc_full_id = fc_dir_structure['fc_full_id']
    if not fc_dir_structure.get('projects'):
        LOG.warning("No projects found in specified flowcell directory \"{}\"".format(fc_dir))

    # Compiled once; both patterns are the same for every project/sample
    fastq_pattern = re.compile(r".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")
    lane_pattern = re.compile(r'[\w-]+_L\d{2}(\d)_\w+')

    # Iterate over the projects in the flowcell directory
    for project in fc_dir_structure.get('projects', []):
        project_name = project['project_name']
        project_original_name = project['project_original_name']
        samplesheet_path = fc_dir_structure.get("samplesheet_path")
        try:
            # Maps e.g. "Y.Mom_14_01" to "P123"
            project_id = get_project_id_from_name(project_name)
        except (CharonError, RuntimeError, ValueError) as e:
            LOG.warning('Could not retrieve project id from Charon (record missing?). '
                        'Using project name ("{}") as project id '
                        '(error: {})'.format(project_name, e))
            project_id = project_name
        # If specific projects are specified, skip those that do not match
        if restrict_to_projects and project_name not in restrict_to_projects and \
           project_id not in restrict_to_projects:
            LOG.debug("Skipping project {} (not in restrict_to_projects)".format(project_name))
            continue
        LOG.info("Setting up project {}".format(project.get("project_name")))
        # Create a project directory if it doesn't already exist, including
        # intervening "DATA" directory
        project_dir = os.path.join(analysis_top_dir, "DATA", project_id)
        project_sl_dir = os.path.join(analysis_top_dir, "DATA", project_name)
        project_analysis_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_id)
        project_analysis_sl_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_name)
        if create_files:
            safe_makedir(project_dir, 0o2770)
            safe_makedir(project_analysis_dir, 0o2770)
            # Symlink <project name> -> <project id> unless they coincide
            if not project_dir == project_sl_dir and \
               not os.path.exists(project_sl_dir):
                os.symlink(project_dir, project_sl_dir)
            if not project_analysis_dir == project_analysis_sl_dir and \
               not os.path.exists(project_analysis_sl_dir):
                os.symlink(project_analysis_dir, project_analysis_sl_dir)
        try:
            project_obj = projects_to_analyze[project_dir]
        except KeyError:
            project_obj = NGIProject(name=project_name, dirname=project_id,
                                     project_id=project_id,
                                     base_path=analysis_top_dir)
            projects_to_analyze[project_dir] = project_obj
        # Iterate over the samples in the project
        for sample in project.get('samples', []):
            sample_name = sample['sample_name']
            # If specific samples are specified, skip those that do not match
            if restrict_to_samples and sample_name not in restrict_to_samples:
                LOG.debug("Skipping sample {}: not in specified samples "
                          "{}".format(sample_name, ", ".join(restrict_to_samples)))
                continue
            LOG.info("Setting up sample {}".format(sample_name))
            # Create a directory for the sample if it doesn't already exist
            sample_dir = os.path.join(project_dir, sample_name)
            if create_files:
                safe_makedir(sample_dir, 0o2770)
            # This will only create a new sample object if it doesn't already exist in the project
            sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)
            # Materialised as a list: it is iterated below and then tested
            # for emptiness, which an iterator from filter() cannot support
            fastq_files = [fastq_file for fastq_file in sample.get('files', [])
                           if fastq_pattern.match(fastq_file)]
            # For each fastq file, create the libprep and seqrun objects
            # and add the fastq file to the seqprep object
            # Note again that these objects only get created if they don't yet exist;
            # if they do exist, the existing object is returned
            for fq_file in fastq_files:
                # Try to parse from SampleSheet
                try:
                    if not samplesheet_path:
                        raise ValueError("no sample sheet available")
                    lane_match = lane_pattern.match(fq_file)
                    if not lane_match:
                        # Fall back to Charon instead of crashing on None
                        raise ValueError('could not parse lane number from '
                                         '"{}"'.format(fq_file))
                    lane_num = lane_match.group(1)
                    libprep_name = determine_library_prep_from_samplesheet(samplesheet_path,
                                                                           project_original_name,
                                                                           sample_name,
                                                                           lane_num)
                except (IndexError, ValueError) as e:
                    LOG.debug('Unable to determine library prep from sample sheet file '
                              '("{}"); try to determine from Charon'.format(e))
                    try:
                        # Requires Charon access
                        libprep_name = determine_library_prep_from_fcid(project_id,
                                                                        sample_name,
                                                                        fc_full_id)
                        LOG.debug('Found libprep name "{}" in Charon'.format(libprep_name))
                    except ValueError:
                        charon_session = CharonSession()
                        libpreps = charon_session.sample_get_libpreps(project_id,
                                                                      sample_name).get('libpreps')
                        if len(libpreps) == 1:
                            libprep_name = libpreps[0].get('libprepid')
                            LOG.warning('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                        'has no libprep information in Charon, but only one '
                                        'library prep is present in Charon ("{}"). Using '
                                        'this as the library prep.'.format(project_name,
                                                                           sample_name,
                                                                           fc_full_id,
                                                                           fq_file,
                                                                           libprep_name))
                        elif fallback_libprep:
                            libprep_name = fallback_libprep
                            LOG.warning('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                        'has no libprep information in Charon, but a fallback '
                                        'libprep value of "{}" was supplied -- using this '
                                        'value.'.format(project_name,
                                                        sample_name,
                                                        fc_full_id,
                                                        fq_file,
                                                        libprep_name))
                        else:
                            error_text = ('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                          'has no libprep information in Charon. Skipping '
                                          'analysis.'.format(project_name, sample_name,
                                                             fc_full_id, fq_file))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
                libprep_object = sample_obj.add_libprep(name=libprep_name,
                                                        dirname=libprep_name)
                libprep_dir = os.path.join(sample_dir, libprep_name)
                if create_files:
                    safe_makedir(libprep_dir, 0o2770)
                seqrun_object = libprep_object.add_seqrun(name=fc_full_id,
                                                          dirname=fc_full_id)
                seqrun_dir = os.path.join(libprep_dir, fc_full_id)
                if create_files:
                    safe_makedir(seqrun_dir, 0o2770)
                seqrun_object.add_fastq_files(fq_file)
            if fastq_files and create_files:
                src_sample_dir = os.path.join(fc_dir_structure['fc_dir'],
                                              project['data_dir'],
                                              project['project_dir'],
                                              sample['sample_dir'])
                for libprep_obj in sample_obj:
                    for seqrun_obj in libprep_obj:
                        src_fastq_files = [os.path.join(src_sample_dir, fastq_file)
                                           for fastq_file in seqrun_obj.fastq_files]
                        # Destination is derived from the objects being
                        # iterated, not from the last seqrun_dir computed in
                        # the fastq loop above. "DATA" matches project_dir.
                        seqrun_dst_dir = os.path.join(project_obj.base_path, "DATA",
                                                      project_obj.dirname,
                                                      sample_obj.dirname,
                                                      libprep_obj.dirname,
                                                      seqrun_obj.dirname)
                        LOG.info("Symlinking fastq files from {} to {}...".format(src_sample_dir,
                                                                                  seqrun_dst_dir))
                        try:
                            do_symlink(src_fastq_files, seqrun_dst_dir)
                        except OSError:
                            error_text = ('Could not symlink files for project/sample'
                                          'libprep/seqrun {}/{}/{}/{}'.format(project_obj,
                                                                              sample_obj,
                                                                              libprep_obj,
                                                                              seqrun_obj))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
    return projects_to_analyze
def setup_analysis_directory_structure(fc_dir,
                                       projects_to_analyze,
                                       restrict_to_projects=None,
                                       restrict_to_samples=None,
                                       create_files=True,
                                       fallback_libprep=None,
                                       quiet=False,
                                       config=None,
                                       config_file_path=None):
    """
    Copy and sort files from their CASAVA-demultiplexed flowcell structure
    into their respective project/sample/libPrep/FCIDs. This collects samples
    split across multiple flowcells.

    The library prep of each fastq file is taken from the sample sheet
    when available, otherwise looked up in Charon by flowcell id, then
    taken from Charon if the sample has exactly one libprep, and finally
    from ``fallback_libprep``. Files for which none of these works are
    skipped (and an error mail is sent unless ``quiet`` is set).

    :param str fc_dir: The directory created by CASAVA for this flowcell.
    :param dict projects_to_analyze: Project objects keyed by their DATA
                                     directory (may be empty); updated in place
    :param list restrict_to_projects: Specific projects within the flowcell to process exclusively
    :param list restrict_to_samples: Specific samples within the flowcell to process exclusively
    :param bool create_files: Alter the filesystem (as opposed to just parsing flowcells) (default True)
    :param str fallback_libprep: If libprep cannot be determined, use this value if supplied (default None)
    :param bool quiet: Stored in config["quiet"]; when set, no error mails are sent
    :param dict config: The parsed configuration file. It is written to,
                        so the default of None will not work here.
    :param str config_file_path: Not used by this function

    :returns: ``projects_to_analyze`` with the projects of this flowcell
              added, or an empty list if the flowcell directory is
              missing or cannot be parsed
    :rtype: dict (list on early exit)

    :raises KeyError: If a required configuration key is not available.
    :raises RuntimeError: If fc_dir does not contain either configured root
    """
    LOG.info(
        "Setting up analysis for demultiplexed data in source folder \"{}\"".
        format(fc_dir))
    if not restrict_to_projects:
        restrict_to_projects = []
    if not restrict_to_samples:
        restrict_to_samples = []
    config[
        "quiet"] = quiet  # Hack because I enter here from a script sometimes
    # Checks flowcell path to establish which group owns it: the path must
    # contain one of the two configured roots followed by "/"
    pattern = ".+({}|{})\/.+".format(config["analysis"]["sthlm_root"],
                                     config["analysis"]["upps_root"])
    matches = re.match(pattern, fc_dir)
    if matches:
        flowcell_uppnexid = matches.group(1)
    else:
        LOG.error(
            "cannot guess which project (sthlm/uppsala) the flowcell {} belongs to"
            .format(fc_dir))
        raise RuntimeError
    analysis_top_dir = os.path.abspath(
        os.path.join(config["analysis"]["base_root"], flowcell_uppnexid,
                     config["analysis"]["top_dir"]))
    try:
        safe_makedir(analysis_top_dir)
    except OSError as e:
        # The failure is only logged; processing continues regardless
        LOG.error(
            'Error: Analysis top directory {} does not exist and could not '
            'be created.'.format(analysis_top_dir))
    # A relative flowcell path is resolved against the analysis top directory
    fc_dir = fc_dir if os.path.isabs(fc_dir) else os.path.join(
        analysis_top_dir, fc_dir)
    if not os.path.exists(fc_dir):
        LOG.error("Error: Flowcell directory {} does not exist".format(fc_dir))
        return []
    # Map the directory structure for this flowcell
    try:
        fc_dir_structure = parse_flowcell(fc_dir)
    except (OSError, ValueError) as e:
        LOG.error("Error when processing flowcell dir \"{}\": {}".format(
            fc_dir, e))
        return []
    fc_full_id = fc_dir_structure['fc_full_id']
    if not fc_dir_structure.get('projects'):
        LOG.warning(
            "No projects found in specified flowcell directory \"{}\"".format(
                fc_dir))
    # Iterate over the projects in the flowcell directory
    for project in fc_dir_structure.get('projects', []):
        project_name = project['project_name']
        project_original_name = project['project_original_name']
        samplesheet_path = fc_dir_structure.get("samplesheet_path")
        # parse the samplesheet and get the expected sample numbers assigned by bcl2fastq
        samplesheet_sample_numbers = get_sample_numbers_from_samplesheet(
            samplesheet_path) if samplesheet_path else None
        try:
            # Maps e.g. "Y.Mom_14_01" to "P123"
            project_id = get_project_id_from_name(project_name)
        except (CharonError, RuntimeError, ValueError) as e:
            LOG.warning(
                'Could not retrieve project id from Charon (record missing?). '
                'Using project name ("{}") as project id '
                '(error: {})'.format(project_name, e))
            project_id = project_name
        # If specific projects are specified, skip those that do not match
        # (either the name or the id may be listed)
        if restrict_to_projects and project_name not in restrict_to_projects and \
           project_id not in restrict_to_projects:
            LOG.debug(
                "Skipping project {} (not in restrict_to_projects)".format(
                    project_name))
            continue
        LOG.info("Setting up project {}".format(project.get("project_name")))
        # Create a project directory if it doesn't already exist, including
        # intervening "DATA" directory
        project_dir = os.path.join(analysis_top_dir, "DATA", project_id)
        project_sl_dir = os.path.join(analysis_top_dir, "DATA", project_name)
        project_analysis_dir = os.path.join(analysis_top_dir, "ANALYSIS",
                                            project_id)
        project_analysis_sl_dir = os.path.join(analysis_top_dir, "ANALYSIS",
                                               project_name)
        if create_files:
            safe_makedir(project_dir, 0o2770)
            safe_makedir(project_analysis_dir, 0o2770)
            # Symlink <project name> -> <project id> directory, unless the
            # two paths coincide or the link target name already exists
            if not project_dir == project_sl_dir and \
               not os.path.exists(project_sl_dir):
                os.symlink(project_dir, project_sl_dir)
            if not project_analysis_dir == project_analysis_sl_dir and \
               not os.path.exists(project_analysis_sl_dir):
                os.symlink(project_analysis_dir, project_analysis_sl_dir)
        # Reuse the project object from an earlier call if there is one
        try:
            project_obj = projects_to_analyze[project_dir]
        except KeyError:
            project_obj = NGIProject(name=project_name,
                                     dirname=project_id,
                                     project_id=project_id,
                                     base_path=analysis_top_dir)
            projects_to_analyze[project_dir] = project_obj
        # Iterate over the samples in the project
        for sample in project.get('samples', []):
            sample_name = sample['sample_name']
            # If specific samples are specified, skip those that do not match
            if restrict_to_samples and sample_name not in restrict_to_samples:
                LOG.debug("Skipping sample {}: not in specified samples "
                          "{}".format(sample_name,
                                      ", ".join(restrict_to_samples)))
                continue
            LOG.info("Setting up sample {}".format(sample_name))
            # Create a directory for the sample if it doesn't already exist
            sample_dir = os.path.join(project_dir, sample_name)
            if create_files:
                safe_makedir(sample_dir, 0o2770)
            # This will only create a new sample object if it doesn't already exist in the project
            sample_obj = project_obj.add_sample(name=sample_name,
                                                dirname=sample_name)
            # Keep only the fastq files (optionally compressed) of the sample
            pattern = re.compile(".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")
            fastq_files = list(filter(pattern.match, sample.get('files', [])))
            # For each fastq file, create the libprep and seqrun objects
            # and add the fastq file to the seqprep object
            # Note again that these objects only get created if they don't yet exist;
            # if they do exist, the existing object is returned
            for fq_file in fastq_files:
                # Try to use assignment from SampleSheet
                samplesheet_sample = match_fastq_sample_number_to_samplesheet(
                    fq_file, samplesheet_sample_numbers, project_id)
                # NOTE(review): index 6 is used as the library prep name;
                # presumably that is its position in the sample sheet entry
                # -- confirm against match_fastq_sample_number_to_samplesheet
                if samplesheet_sample is not None and \
                        samplesheet_sample[6] is not None:
                    libprep_name = samplesheet_sample[6]
                else:
                    LOG.debug(
                        'Unable to determine library prep from sample sheet file; try to determine from Charon'
                    )
                    try:
                        # Requires Charon access
                        libprep_name = determine_library_prep_from_fcid(
                            project_id, sample_name, fc_full_id)
                        LOG.debug('Found libprep name "{}" in Charon'.format(
                            libprep_name))
                    except ValueError:
                        # No seqrun in Charon matches this flowcell: fall
                        # back to the only libprep, then to fallback_libprep
                        charon_session = CharonSession()
                        libpreps = charon_session.sample_get_libpreps(
                            project_id, sample_name).get('libpreps')
                        if len(libpreps) == 1:
                            libprep_name = libpreps[0].get('libprepid')
                            LOG.warning(
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon, but only one '
                                'library prep is present in Charon ("{}"). Using '
                                'this as the library prep.'.format(
                                    project_name, sample_name, fc_full_id,
                                    fq_file, libprep_name))
                        elif fallback_libprep:
                            libprep_name = fallback_libprep
                            LOG.warning(
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon, but a fallback '
                                'libprep value of "{}" was supplied -- using this '
                                'value.'.format(project_name, sample_name,
                                                fc_full_id, fq_file,
                                                libprep_name))
                        else:
                            error_text = (
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon. Skipping '
                                'analysis.'.format(project_name, sample_name,
                                                   fc_full_id, fq_file))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            # This fastq file is left out of the analysis
                            continue
                libprep_object = sample_obj.add_libprep(name=libprep_name,
                                                        dirname=libprep_name)
                libprep_dir = os.path.join(sample_dir, libprep_name)
                if create_files:
                    safe_makedir(libprep_dir, 0o2770)
                seqrun_object = libprep_object.add_seqrun(name=fc_full_id,
                                                          dirname=fc_full_id)
                seqrun_dir = os.path.join(libprep_dir, fc_full_id)
                if create_files:
                    safe_makedir(seqrun_dir, 0o2770)
                seqrun_object.add_fastq_files(fq_file)
            if fastq_files and create_files:
                src_sample_dir = os.path.join(fc_dir_structure['fc_dir'],
                                              project['data_dir'],
                                              project['project_dir'],
                                              sample['sample_dir'])
                # Link the files of every seqrun object of the sample into
                # its own DATA/<project>/<sample>/<libprep>/<seqrun> directory
                for libprep_obj in sample_obj:
                    for seqrun_obj in libprep_obj:
                        src_fastq_files = [
                            os.path.join(src_sample_dir, fastq_file)
                            for fastq_file in seqrun_obj.fastq_files
                        ]
                        seqrun_dst_dir = os.path.join(project_obj.base_path,
                                                      "DATA",
                                                      project_obj.dirname,
                                                      sample_obj.dirname,
                                                      libprep_obj.dirname,
                                                      seqrun_obj.dirname)
                        LOG.info(
                            "Symlinking fastq files from {} to {}...".format(
                                src_sample_dir, seqrun_dst_dir))
                        try:
                            do_symlink(src_fastq_files, seqrun_dst_dir)
                        except OSError:
                            error_text = (
                                'Could not symlink files for project/sample'
                                'libprep/seqrun {}/{}/{}/{}'.format(
                                    project_obj, sample_obj, libprep_obj,
                                    seqrun_obj))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
    return projects_to_analyze
def project_summarize(projects, verbosity=0):
    """Print a summary of the Charon status of one or more projects to stderr.

    For each project that can be located locally and fetched from Charon, the
    project's samples, libpreps and seqruns are read from Charon and reported.

    :param list projects: Project identifiers, each passed to locate_project()
    :param int verbosity: 0 prints per-status counts, 1 additionally lists the
                          ids under each status, 2 or more prints every
                          sample/libprep/seqrun record in full. Anything that
                          is not a non-negative int falls back to 0.
    """
    # bool is excluded explicitly so that True/False keep being rejected, as
    # they were by the previous exact-type check.
    if (not isinstance(verbosity, int) or isinstance(verbosity, bool)
            or verbosity < 0):
        print_stderr('Invalid verbosity level ("{}"); must be a positive '
                     'integer; falling back to 0'.format(verbosity))
        verbosity = 0
    update_charon_with_local_jobs_status(quiet=True)  # Don't send mails
    charon_session = CharonSession()
    projects_list = []
    for project in projects:
        try:
            project_name = os.path.basename(locate_project(project))
        except ValueError as e:
            print_stderr("Skipping project: {}".format(e))
            continue
        print_stderr(
            'Gathering information for project "{}"...'.format(project_name))
        # Only the project lookup itself is guarded; errors raised while
        # walking the samples below propagate to the caller.
        try:
            project_record = charon_session.project_get(project_name)
        except CharonError as e:
            print_stderr(
                'Project "{}" not found in Charon; skipping ({})'.format(
                    project_name, e))
            continue
        projects_list.append(
            _collect_project_info(charon_session, project_record))
    if verbosity in (0, 1):
        _print_status_summary(projects_list, list_items=(verbosity == 1))
    else:
        # Verbosity is 2+, maximum verbosity
        _print_full_details(projects_list)


def _collect_project_info(charon_session, project_record):
    """Build a nested dict of a project's samples, libpreps and seqruns.

    :param charon_session: Session object used to query Charon
    :param dict project_record: The project record returned by project_get()
    :returns: A dict with keys name, id, status and samples
    :rtype: dict
    """
    project_id = project_record['projectid']
    samples_list = []
    project_dict = {'name': project_record['name'],
                    'id': project_id,
                    'status': project_record['status'],
                    'samples': samples_list}
    for sample in charon_session.project_get_samples(project_id).get(
            'samples', []):
        sample_id = sample['sampleid']
        libpreps_list = []
        samples_list.append({
            'id': sample_id,
            'analysis_status': sample['analysis_status'],
            'coverage': sample['total_autosomal_coverage'],
            'libpreps': libpreps_list,
        })
        for libprep in charon_session.sample_get_libpreps(
                project_id, sample_id).get('libpreps', []):
            libprep_id = libprep['libprepid']
            seqruns_list = []
            libpreps_list.append({'id': libprep_id,
                                  'qc': libprep['qc'],
                                  'seqruns': seqruns_list})
            for seqrun in charon_session.libprep_get_seqruns(
                    project_id, sample_id, libprep_id).get('seqruns', []):
                seqrun_dict = {
                    'id': seqrun['seqrunid'],
                    'alignment_status': seqrun['alignment_status'],
                    'coverage': seqrun['mean_autosomal_coverage'],
                }
                # total_reads is only reported when present and truthy
                if seqrun.get('total_reads'):
                    seqrun_dict['total_reads'] = seqrun['total_reads']
                seqruns_list.append(seqrun_dict)
    return project_dict


def _print_status_summary(projects_list, list_items=False):
    """Print per-status counts of samples, libpreps and seqruns per project.

    :param list projects_list: Project dicts from _collect_project_info()
    :param bool list_items: If True, also list the ids under each status
    """
    projects_status_list = []
    for project_dict in projects_list:
        samples_by_status = collections.defaultdict(set)
        libpreps_by_status = collections.defaultdict(set)
        seqruns_by_status = collections.defaultdict(set)
        for sample_dict in project_dict.get('samples', []):
            sample_status = sample_dict['analysis_status']
            libpreps = sample_dict.get('libpreps')
            if not libpreps:
                sample_status = "NO_LIBPREPS"
            elif not any(libprep["seqruns"] for libprep in libpreps):
                sample_status = "NO_SEQRUNS"
            else:
                # Libpreps and seqruns are only tallied for samples that
                # have at least one seqrun.
                for libprep_dict in libpreps:
                    libpreps_by_status[libprep_dict['qc']].add(
                        libprep_dict['id'])
                    for seqrun_dict in libprep_dict.get('seqruns', []):
                        seqruns_by_status[
                            seqrun_dict['alignment_status']].add(
                                seqrun_dict['id'])
            samples_by_status[sample_status].add(sample_dict['id'])
        projects_status_list.append({
            'name': "{} ({})".format(project_dict['name'],
                                     project_dict['id']),
            'status': project_dict['status'],
            'samples_by_status': samples_by_status,
            'libpreps_by_status': libpreps_by_status,
            'seqruns_by_status': seqruns_by_status,
        })
    print_items = (("Samples", "samples_by_status"),
                   ("Libpreps", "libpreps_by_status"),
                   ("Seqruns", "seqruns_by_status"),)
    for project_status_dict in projects_status_list:
        print_stderr("\nProject\n-------")
        print_stderr(" Name: {:>40}".format(project_status_dict['name']))
        print_stderr(" Status: {:>40}".format(project_status_dict['status']))
        for name, dict_key in print_items:
            status_dict = project_status_dict[dict_key]
            print_stderr("{}\n{}".format(name, "-" * len(name)))
            total_items = sum(map(len, status_dict.values()))
            # Sort by status value. The loop body only runs when there is at
            # least one non-empty set, so total_items is never 0 here.
            for status, item_set in sorted(
                    status_dict.items(),
                    key=lambda key_value: key_value[0]):
                num_items = len(item_set)
                percent = (100.00 * num_items) / total_items
                print_stderr(
                    " Status: {:<20} ({:>3}/{:<3}) ({:>6.2f}%)".format(
                        status, num_items, total_items, percent))
                if list_items:
                    for item in sorted(item_set):
                        print_stderr(" {}".format(item))
            print_stderr("")


def _print_full_details(projects_list):
    """Print every project, sample, libprep and seqrun record in full.

    :param list projects_list: Project dicts from _collect_project_info()
    """
    output_template = "{}{:<30}{:>{rspace}}"

    def print_field(offset, label, value):
        # Each nesting level is indented by `offset` spaces; the value column
        # is narrowed by the same amount so lines stay the same total width.
        print_stderr(output_template.format(" " * offset, label, value,
                                            rspace=80 - offset))

    for project_dict in projects_list:
        print_field(0, "Project name:", project_dict['name'])
        print_field(0, "Project ID:", project_dict['id'])
        print_field(0, "Project status:", project_dict['status'])
        for sample_dict in project_dict['samples']:
            print_stderr("")
            print_field(4, "Sample ID:", sample_dict['id'])
            print_field(4, "Sample analysis status:",
                        sample_dict['analysis_status'])
            print_field(4, "Sample coverage:", sample_dict['coverage'])
            for libprep_dict in sample_dict['libpreps']:
                print_stderr("")
                print_field(8, "Libprep ID:", libprep_dict['id'])
                print_field(8, "Libprep qc status:", libprep_dict['qc'])
                for seqrun_dict in libprep_dict['seqruns']:
                    print_stderr("")
                    print_field(12, "Seqrun ID:", seqrun_dict['id'])
                    print_field(12, "Seqrun alignment status:",
                                seqrun_dict['alignment_status'])
                    print_field(12, "Seqrun mean auto. coverage:",
                                seqrun_dict['coverage'])
                    if "total_reads" in seqrun_dict:
                        print_field(12, "Seqrun total reads:",
                                    seqrun_dict['total_reads'])
        print_stderr("\n")