def write_to_charon_NGI_results(job_id, return_code, run_dir):
    """Update the status of a sample in Charon after a workflow has run.

    The project name and sample id are parsed out of ``job_id`` with
    ``STHLM_UUSNP_SAMPLE_RE``; the project name is then mapped to a project
    id with ``get_project_id_from_name``.

    :param str job_id: Job identifier from which project name and sample id are parsed
    :param int return_code: The return code of the workflow process
                            (None -> "RUNNING", 0 -> "DONE", anything else -> "COMPUTATION_FAILED")
    :param string run_dir: the directory where results are stored (not used by this function)

    :raises RuntimeError: If the job id cannot be parsed or the Charon database could not be updated
    """
    charon_session = CharonSession()
    # Consider moving this mapping to the CharonSession object or something
    if return_code is None:
        status = "RUNNING"
    elif return_code == 0:
        status = "DONE"
    else:
        ## TODO we need to differentiate between COMPUTATION_FAILED and DATA_FAILED
        ##      also there is IGNORE?
        status = "COMPUTATION_FAILED"
    try:
        # .match() returns None for a non-matching job id (-> AttributeError) and
        # raises TypeError for a non-string job id; both mean "cannot parse".
        m_dict = STHLM_UUSNP_SAMPLE_RE.match(job_id).groupdict()
        project_id = get_project_id_from_name(m_dict['project_name'])
        sample_id = m_dict['sample_id']
    except (TypeError, AttributeError):
        error_msg = "Could not parse project/sample ids from job id \"{}\"; cannot update Charon with results!".format(job_id)
        raise RuntimeError(error_msg)
    try:
        charon_session.sample_update(project_id, sample_id, status=status)
    except CharonError as e:
        # Three placeholders, three arguments: status, sample, underlying error.
        # (Previously project_id was printed as the sample and the error was dropped.)
        error_msg = ('Failed to update sample status to "{}" for sample "{}" '
                     'in Charon database: {}'.format(status, sample_id, e))
        raise RuntimeError(error_msg)
def write_to_charon_NGI_results(job_id, return_code, run_dir):
    """Update the status of a sample in Charon after a workflow has run.

    The project name and sample id are parsed out of ``job_id`` with
    ``STHLM_UUSNP_SAMPLE_RE``; the project name is then mapped to a project
    id with ``get_project_id_from_name``.

    :param str job_id: Job identifier from which project name and sample id are parsed
    :param int return_code: The return code of the workflow process
                            (None -> "RUNNING", 0 -> "DONE", anything else -> "COMPUTATION_FAILED")
    :param string run_dir: the directory where results are stored (not used by this function)

    :raises RuntimeError: If the job id cannot be parsed or the Charon database could not be updated
    """
    charon_session = CharonSession()
    # Consider moving this mapping to the CharonSession object or something
    if return_code is None:
        status = "RUNNING"
    elif return_code == 0:
        status = "DONE"
    else:
        ## TODO we need to differentiate between COMPUTATION_FAILED and DATA_FAILED
        ##      also there is IGNORE?
        status = "COMPUTATION_FAILED"
    try:
        # .match() returns None for a non-matching job id (-> AttributeError) and
        # raises TypeError for a non-string job id; both mean "cannot parse".
        m_dict = STHLM_UUSNP_SAMPLE_RE.match(job_id).groupdict()
        project_id = get_project_id_from_name(m_dict['project_name'])
        sample_id = m_dict['sample_id']
    except (TypeError, AttributeError):
        error_msg = "Could not parse project/sample ids from job id \"{}\"; cannot update Charon with results!".format(job_id)
        raise RuntimeError(error_msg)
    try:
        charon_session.sample_update(project_id, sample_id, status=status)
    except CharonError as e:
        # Three placeholders, three arguments: status, sample, underlying error.
        # (Previously project_id was printed as the sample and the error was dropped.)
        error_msg = ('Failed to update sample status to "{}" for sample "{}" '
                     'in Charon database: {}'.format(status, sample_id, e))
        raise RuntimeError(error_msg)
def setup_analysis_directory_structure(fc_dir, projects_to_analyze,
                                       restrict_to_projects=None, restrict_to_samples=None,
                                       create_files=True,
                                       fallback_libprep=None,
                                       quiet=False,
                                       config=None, config_file_path=None):
    """
    Sort the fastq files of a demultiplexed flowcell into a
    project/sample/libprep/seqrun directory tree (files are symlinked with
    do_symlink) and register them on NGIProject objects. Because
    projects_to_analyze is updated in place, repeated calls collect samples
    split across multiple flowcells.

    :param str fc_dir: The demultiplexed flowcell directory.
    :param dict projects_to_analyze: Mapping of project DATA directory path -> NGIProject
                                     (may be empty); updated in place
    :param list restrict_to_projects: Specific projects (name or id) within the flowcell to process exclusively
    :param list restrict_to_samples: Specific samples within the flowcell to process exclusively
    :param bool create_files: Alter the filesystem (as opposed to just parsing flowcells) (default True)
    :param str fallback_libprep: If libprep cannot be determined, use this value if supplied (default None)
    :param bool quiet: Stored in config["quiet"]; when true no error mails are sent
    :param dict config: The parsed configuration file. It is indexed directly, so it must
                        not be None here (NOTE(review): presumably injected by a decorator
                        that is not visible in this block -- confirm)
    :param str config_file_path: Not used directly in this function

    :returns: projects_to_analyze (the updated dict), or an empty list if the flowcell
              directory is missing or cannot be parsed
    :rtype: dict (list on early failure)

    :raises KeyError: If a required configuration key is not available.
    :raises RuntimeError: If fc_dir is not located below one of the configured roots.
    """
    LOG.info("Setting up analysis for demultiplexed data in source folder \"{}\"".format(fc_dir))
    if not restrict_to_projects:
        restrict_to_projects = []
    if not restrict_to_samples:
        restrict_to_samples = []
    config["quiet"] = quiet  # Hack because I enter here from a script sometimes
    # Everything up to and including the sthlm/upps root component is the flowcell root
    root_pattern = r"(.+(?:{}|{}))\/.+".format(config["analysis"]["sthlm_root"],
                                               config["analysis"]["upps_root"])
    matches = re.match(root_pattern, fc_dir)
    if matches:
        flowcell_root = matches.group(1)
    else:
        error_msg = "cannot guess which project the flowcell {} belongs to".format(fc_dir)
        LOG.error(error_msg)
        raise RuntimeError(error_msg)

    analysis_top_dir = os.path.abspath(os.path.join(flowcell_root, config["analysis"]["top_dir"]))
    try:
        safe_makedir(analysis_top_dir)
    except OSError:
        # Logged only; execution deliberately continues as in the original code.
        LOG.error('Error: Analysis top directory {} does not exist and could not '
                  'be created.'.format(analysis_top_dir))
    fc_dir = fc_dir if os.path.isabs(fc_dir) else os.path.join(analysis_top_dir, fc_dir)
    if not os.path.exists(fc_dir):
        LOG.error("Error: Flowcell directory {} does not exist".format(fc_dir))
        return []
    # Map the directory structure for this flowcell
    try:
        fc_dir_structure = parse_flowcell(fc_dir)
    except (OSError, ValueError) as e:
        LOG.error("Error when processing flowcell dir \"{}\": {}".format(fc_dir, e))
        return []
    fc_full_id = fc_dir_structure['fc_full_id']
    if not fc_dir_structure.get('projects'):
        LOG.warning("No projects found in specified flowcell directory \"{}\"".format(fc_dir))

    # Compiled once; selects fastq files (optionally compressed) among a sample's files
    fastq_pattern = re.compile(r".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")

    # Iterate over the projects in the flowcell directory
    for project in fc_dir_structure.get('projects', []):
        project_name = project['project_name']
        project_original_name = project['project_original_name']
        samplesheet_path = fc_dir_structure.get("samplesheet_path")
        try:
            # Maps e.g. "Y.Mom_14_01" to "P123"
            project_id = get_project_id_from_name(project_name)
        except (CharonError, RuntimeError, ValueError) as e:
            LOG.warning('Could not retrieve project id from Charon (record missing?). '
                        'Using project name ("{}") as project id '
                        '(error: {})'.format(project_name, e))
            project_id = project_name
        # If specific projects are specified, skip those that do not match
        if restrict_to_projects and project_name not in restrict_to_projects and \
           project_id not in restrict_to_projects:
            LOG.debug("Skipping project {} (not in restrict_to_projects)".format(project_name))
            continue
        LOG.info("Setting up project {}".format(project.get("project_name")))
        # Create a project directory if it doesn't already exist, including
        # intervening "DATA" directory
        project_dir = os.path.join(analysis_top_dir, "DATA", project_id)
        project_sl_dir = os.path.join(analysis_top_dir, "DATA", project_name)
        project_analysis_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_id)
        project_analysis_sl_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_name)
        if create_files:
            safe_makedir(project_dir, 0o2770)
            safe_makedir(project_analysis_dir, 0o2770)
            # Human-readable aliases: <project name> -> <project id>
            if not project_dir == project_sl_dir and \
               not os.path.exists(project_sl_dir):
                os.symlink(project_dir, project_sl_dir)
            if not project_analysis_dir == project_analysis_sl_dir and \
               not os.path.exists(project_analysis_sl_dir):
                os.symlink(project_analysis_dir, project_analysis_sl_dir)
        try:
            project_obj = projects_to_analyze[project_dir]
        except KeyError:
            project_obj = NGIProject(name=project_name, dirname=project_id,
                                     project_id=project_id,
                                     base_path=analysis_top_dir)
            projects_to_analyze[project_dir] = project_obj
        # Iterate over the samples in the project
        for sample in project.get('samples', []):
            sample_name = sample['sample_name']
            # If specific samples are specified, skip those that do not match
            if restrict_to_samples and sample_name not in restrict_to_samples:
                LOG.debug("Skipping sample {}: not in specified samples "
                          "{}".format(sample_name, ", ".join(restrict_to_samples)))
                continue
            LOG.info("Setting up sample {}".format(sample_name))
            # Create a directory for the sample if it doesn't already exist
            sample_dir = os.path.join(project_dir, sample_name)
            if create_files:
                safe_makedir(sample_dir, 0o2770)
            # This will only create a new sample object if it doesn't already exist in the project
            sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)
            # A real list (not a lazy filter object): it is iterated below and
            # afterwards tested for truthiness.
            fastq_files = [fq for fq in sample.get('files', []) if fastq_pattern.match(fq)]
            # For each fastq file, create the libprep and seqrun objects
            # and add the fastq file to the seqprep object
            # Note again that these objects only get created if they don't yet exist;
            # if they do exist, the existing object is returned
            for fq_file in fastq_files:
                # Try to parse from SampleSheet
                try:
                    if not samplesheet_path:
                        raise ValueError("no sample sheet available")
                    lane_match = re.match(r'[\w-]+_L\d{2}(\d)_\w+', fq_file)
                    if not lane_match:
                        # Previously an uncaught AttributeError; now falls back to Charon
                        raise ValueError('cannot parse lane number from file name '
                                         '"{}"'.format(fq_file))
                    lane_num = lane_match.group(1)
                    libprep_name = determine_library_prep_from_samplesheet(samplesheet_path,
                                                                           project_original_name,
                                                                           sample_name,
                                                                           lane_num)
                except (IndexError, ValueError) as e:
                    LOG.debug('Unable to determine library prep from sample sheet file '
                              '("{}"); try to determine from Charon'.format(e))
                    try:
                        # Requires Charon access
                        libprep_name = determine_library_prep_from_fcid(project_id, sample_name, fc_full_id)
                        LOG.debug('Found libprep name "{}" in Charon'.format(libprep_name))
                    except ValueError:
                        charon_session = CharonSession()
                        # "or []": a missing 'libpreps' key is treated as "no libpreps"
                        libpreps = charon_session.sample_get_libpreps(project_id, sample_name).get('libpreps') or []
                        if len(libpreps) == 1:
                            libprep_name = libpreps[0].get('libprepid')
                            LOG.warning('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                        'has no libprep information in Charon, but only one '
                                        'library prep is present in Charon ("{}"). Using '
                                        'this as the library prep.'.format(project_name,
                                                                           sample_name,
                                                                           fc_full_id,
                                                                           fq_file,
                                                                           libprep_name))
                        elif fallback_libprep:
                            libprep_name = fallback_libprep
                            LOG.warning('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                        'has no libprep information in Charon, but a fallback '
                                        'libprep value of "{}" was supplied -- using this '
                                        'value.'.format(project_name,
                                                        sample_name,
                                                        fc_full_id,
                                                        fq_file,
                                                        libprep_name))
                        else:
                            error_text = ('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                          'has no libprep information in Charon. Skipping '
                                          'analysis.'.format(project_name, sample_name,
                                                             fc_full_id, fq_file))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
                libprep_object = sample_obj.add_libprep(name=libprep_name,
                                                        dirname=libprep_name)
                libprep_dir = os.path.join(sample_dir, libprep_name)
                if create_files:
                    safe_makedir(libprep_dir, 0o2770)
                seqrun_object = libprep_object.add_seqrun(name=fc_full_id,
                                                          dirname=fc_full_id)
                seqrun_dir = os.path.join(libprep_dir, fc_full_id)
                if create_files:
                    safe_makedir(seqrun_dir, 0o2770)
                seqrun_object.add_fastq_files(fq_file)
            if fastq_files and create_files:
                src_sample_dir = os.path.join(fc_dir_structure['fc_dir'],
                                              project['data_dir'],
                                              project['project_dir'],
                                              sample['sample_dir'])
                for libprep_obj in sample_obj:
                    for seqrun_obj in libprep_obj:
                        src_fastq_files = [os.path.join(src_sample_dir, fastq_file) for
                                           fastq_file in seqrun_obj.fastq_files]
                        # Destination is derived per libprep/seqrun. The previous code
                        # linked everything into the seqrun_dir left over from the last
                        # fastq file processed above, and omitted the "DATA" component.
                        seqrun_dst_dir = os.path.join(project_obj.base_path,
                                                      "DATA",
                                                      project_obj.dirname,
                                                      sample_obj.dirname,
                                                      libprep_obj.dirname,
                                                      seqrun_obj.dirname)
                        LOG.info("Symlinking fastq files from {} to {}...".format(src_sample_dir, seqrun_dst_dir))
                        try:
                            do_symlink(src_fastq_files, seqrun_dst_dir)
                        except OSError:
                            error_text = ('Could not symlink files for project/sample'
                                          'libprep/seqrun {}/{}/{}/{}'.format(project_obj,
                                                                              sample_obj,
                                                                              libprep_obj,
                                                                              seqrun_obj))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
    return projects_to_analyze
def setup_analysis_directory_structure(fc_dir,
                                       projects_to_analyze,
                                       restrict_to_projects=None,
                                       restrict_to_samples=None,
                                       create_files=True,
                                       fallback_libprep=None,
                                       quiet=False,
                                       config=None,
                                       config_file_path=None):
    """
    Sort the fastq files of a demultiplexed flowcell into a
    project/sample/libprep/seqrun directory tree (files are symlinked with
    do_symlink) and register them on NGIProject objects. Because
    projects_to_analyze is updated in place, repeated calls collect samples
    split across multiple flowcells.

    :param str fc_dir: The demultiplexed flowcell directory.
    :param dict projects_to_analyze: Mapping of project DATA directory path -> NGIProject
                                     (may be empty); updated in place
    :param list restrict_to_projects: Specific projects (name or id) within the flowcell to process exclusively
    :param list restrict_to_samples: Specific samples within the flowcell to process exclusively
    :param bool create_files: Alter the filesystem (as opposed to just parsing flowcells) (default True)
    :param str fallback_libprep: If libprep cannot be determined, use this value if supplied (default None)
    :param bool quiet: Stored in config["quiet"]; when true no error mails are sent
    :param dict config: The parsed configuration file. It is indexed directly, so it must
                        not be None here (NOTE(review): presumably injected by a decorator
                        that is not visible in this block -- confirm)
    :param str config_file_path: Not used directly in this function

    :returns: projects_to_analyze (the updated dict), or an empty list if the flowcell
              directory is missing or cannot be parsed
    :rtype: dict (list on early failure)

    :raises KeyError: If a required configuration key is not available.
    :raises RuntimeError: If fc_dir does not contain one of the configured group roots.
    """
    LOG.info(
        "Setting up analysis for demultiplexed data in source folder \"{}\"".
        format(fc_dir))
    if not restrict_to_projects:
        restrict_to_projects = []
    if not restrict_to_samples:
        restrict_to_samples = []
    config["quiet"] = quiet  # Hack because I enter here from a script sometimes
    # Checks flowcell path to establish which group owns it
    owner_pattern = r".+({}|{})\/.+".format(config["analysis"]["sthlm_root"],
                                            config["analysis"]["upps_root"])
    matches = re.match(owner_pattern, fc_dir)
    if matches:
        flowcell_uppnexid = matches.group(1)
    else:
        error_msg = ("cannot guess which project (sthlm/uppsala) the flowcell {} belongs to"
                     .format(fc_dir))
        LOG.error(error_msg)
        raise RuntimeError(error_msg)

    analysis_top_dir = os.path.abspath(
        os.path.join(config["analysis"]["base_root"], flowcell_uppnexid,
                     config["analysis"]["top_dir"]))
    try:
        safe_makedir(analysis_top_dir)
    except OSError:
        # Logged only; execution deliberately continues as in the original code.
        LOG.error(
            'Error: Analysis top directory {} does not exist and could not '
            'be created.'.format(analysis_top_dir))
    fc_dir = fc_dir if os.path.isabs(fc_dir) else os.path.join(
        analysis_top_dir, fc_dir)
    if not os.path.exists(fc_dir):
        LOG.error("Error: Flowcell directory {} does not exist".format(fc_dir))
        return []
    # Map the directory structure for this flowcell
    try:
        fc_dir_structure = parse_flowcell(fc_dir)
    except (OSError, ValueError) as e:
        LOG.error("Error when processing flowcell dir \"{}\": {}".format(
            fc_dir, e))
        return []
    fc_full_id = fc_dir_structure['fc_full_id']
    if not fc_dir_structure.get('projects'):
        LOG.warning(
            "No projects found in specified flowcell directory \"{}\"".format(
                fc_dir))

    # Compiled once; selects fastq files (optionally compressed) among a sample's files
    fastq_pattern = re.compile(r".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")

    # Iterate over the projects in the flowcell directory
    for project in fc_dir_structure.get('projects', []):
        project_name = project['project_name']
        # Not used below; the lookup is kept so that a project entry lacking
        # this key still raises KeyError exactly as before.
        project_original_name = project['project_original_name']
        samplesheet_path = fc_dir_structure.get("samplesheet_path")

        # parse the samplesheet and get the expected sample numbers assigned by bcl2fastq
        samplesheet_sample_numbers = get_sample_numbers_from_samplesheet(
            samplesheet_path) if samplesheet_path else None
        try:
            # Maps e.g. "Y.Mom_14_01" to "P123"
            project_id = get_project_id_from_name(project_name)
        except (CharonError, RuntimeError, ValueError) as e:
            LOG.warning(
                'Could not retrieve project id from Charon (record missing?). '
                'Using project name ("{}") as project id '
                '(error: {})'.format(project_name, e))
            project_id = project_name
        # If specific projects are specified, skip those that do not match
        if restrict_to_projects and project_name not in restrict_to_projects and \
           project_id not in restrict_to_projects:
            LOG.debug(
                "Skipping project {} (not in restrict_to_projects)".format(
                    project_name))
            continue
        LOG.info("Setting up project {}".format(project.get("project_name")))
        # Create a project directory if it doesn't already exist, including
        # intervening "DATA" directory
        project_dir = os.path.join(analysis_top_dir, "DATA", project_id)
        project_sl_dir = os.path.join(analysis_top_dir, "DATA", project_name)
        project_analysis_dir = os.path.join(analysis_top_dir, "ANALYSIS",
                                            project_id)
        project_analysis_sl_dir = os.path.join(analysis_top_dir, "ANALYSIS",
                                               project_name)
        if create_files:
            safe_makedir(project_dir, 0o2770)
            safe_makedir(project_analysis_dir, 0o2770)
            # Human-readable aliases: <project name> -> <project id>
            if not project_dir == project_sl_dir and \
               not os.path.exists(project_sl_dir):
                os.symlink(project_dir, project_sl_dir)
            if not project_analysis_dir == project_analysis_sl_dir and \
               not os.path.exists(project_analysis_sl_dir):
                os.symlink(project_analysis_dir, project_analysis_sl_dir)
        try:
            project_obj = projects_to_analyze[project_dir]
        except KeyError:
            project_obj = NGIProject(name=project_name,
                                     dirname=project_id,
                                     project_id=project_id,
                                     base_path=analysis_top_dir)
            projects_to_analyze[project_dir] = project_obj
        # Iterate over the samples in the project
        for sample in project.get('samples', []):
            sample_name = sample['sample_name']
            # If specific samples are specified, skip those that do not match
            if restrict_to_samples and sample_name not in restrict_to_samples:
                LOG.debug("Skipping sample {}: not in specified samples "
                          "{}".format(sample_name,
                                      ", ".join(restrict_to_samples)))
                continue
            LOG.info("Setting up sample {}".format(sample_name))
            # Create a directory for the sample if it doesn't already exist
            sample_dir = os.path.join(project_dir, sample_name)
            if create_files:
                safe_makedir(sample_dir, 0o2770)
            # This will only create a new sample object if it doesn't already exist in the project
            sample_obj = project_obj.add_sample(name=sample_name,
                                                dirname=sample_name)
            # A real list: it is iterated below and afterwards tested for truthiness
            fastq_files = [fq for fq in sample.get('files', [])
                           if fastq_pattern.match(fq)]
            # For each fastq file, create the libprep and seqrun objects
            # and add the fastq file to the seqprep object
            # Note again that these objects only get created if they don't yet exist;
            # if they do exist, the existing object is returned
            for fq_file in fastq_files:
                # Try to use assignment from SampleSheet
                samplesheet_sample = match_fastq_sample_number_to_samplesheet(
                    fq_file, samplesheet_sample_numbers, project_id)
                # Element 6 of the matched entry is used as the library prep name
                # (NOTE(review): layout of the entry is defined by the helper -- confirm)
                if samplesheet_sample is not None and \
                   samplesheet_sample[6] is not None:
                    libprep_name = samplesheet_sample[6]
                else:
                    LOG.debug(
                        'Unable to determine library prep from sample sheet file; try to determine from Charon'
                    )
                    try:
                        # Requires Charon access
                        libprep_name = determine_library_prep_from_fcid(
                            project_id, sample_name, fc_full_id)
                        LOG.debug('Found libprep name "{}" in Charon'.format(
                            libprep_name))
                    except ValueError:
                        charon_session = CharonSession()
                        # "or []": a missing 'libpreps' key is treated as "no libpreps"
                        libpreps = charon_session.sample_get_libpreps(
                            project_id, sample_name).get('libpreps') or []
                        if len(libpreps) == 1:
                            libprep_name = libpreps[0].get('libprepid')
                            LOG.warning(
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon, but only one '
                                'library prep is present in Charon ("{}"). Using '
                                'this as the library prep.'.format(
                                    project_name, sample_name, fc_full_id,
                                    fq_file, libprep_name))
                        elif fallback_libprep:
                            libprep_name = fallback_libprep
                            LOG.warning(
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon, but a fallback '
                                'libprep value of "{}" was supplied -- using this '
                                'value.'.format(project_name, sample_name,
                                                fc_full_id, fq_file,
                                                libprep_name))
                        else:
                            error_text = (
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon. Skipping '
                                'analysis.'.format(project_name, sample_name,
                                                   fc_full_id, fq_file))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
                libprep_object = sample_obj.add_libprep(name=libprep_name,
                                                        dirname=libprep_name)
                libprep_dir = os.path.join(sample_dir, libprep_name)
                if create_files:
                    safe_makedir(libprep_dir, 0o2770)
                seqrun_object = libprep_object.add_seqrun(name=fc_full_id,
                                                          dirname=fc_full_id)
                seqrun_dir = os.path.join(libprep_dir, fc_full_id)
                if create_files:
                    safe_makedir(seqrun_dir, 0o2770)
                seqrun_object.add_fastq_files(fq_file)
            if fastq_files and create_files:
                src_sample_dir = os.path.join(fc_dir_structure['fc_dir'],
                                              project['data_dir'],
                                              project['project_dir'],
                                              sample['sample_dir'])
                for libprep_obj in sample_obj:
                    for seqrun_obj in libprep_obj:
                        src_fastq_files = [
                            os.path.join(src_sample_dir, fastq_file)
                            for fastq_file in seqrun_obj.fastq_files
                        ]
                        seqrun_dst_dir = os.path.join(project_obj.base_path,
                                                      "DATA",
                                                      project_obj.dirname,
                                                      sample_obj.dirname,
                                                      libprep_obj.dirname,
                                                      seqrun_obj.dirname)
                        LOG.info(
                            "Symlinking fastq files from {} to {}...".format(
                                src_sample_dir, seqrun_dst_dir))
                        try:
                            do_symlink(src_fastq_files, seqrun_dst_dir)
                        except OSError:
                            error_text = (
                                'Could not symlink files for project/sample'
                                'libprep/seqrun {}/{}/{}/{}'.format(
                                    project_obj, sample_obj, libprep_obj,
                                    seqrun_obj))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
    return projects_to_analyze
def test_get_project_id_from_name(self):
    # Resolving the fixture's project name must give back the fixture's project id
    resolved_id = get_project_id_from_name(self.project_name)
    self.assertEqual(self.project_id, resolved_id)
def test_get_project_id_from_name_missing_id(self, mock_get):
    """A reply without a 'projectid' entry must make the lookup raise ValueError"""
    # The mocked call hands back an empty record, i.e. no 'projectid' key at all
    mock_get.return_value = {}
    self.assertRaises(ValueError, get_project_id_from_name, self.project_name)
def test_get_project_id_from_name_missing_proj(self, mock_get):
    """A CharonError (status 404) from the mocked call must surface as ValueError"""
    not_found = CharonError('Error', status_code=404)
    mock_get.side_effect = not_found
    self.assertRaises(ValueError, get_project_id_from_name, self.project_name)
def test_get_project_id_from_name(self, mock_get):
    """The project name is resolved to the 'projectid' found in the mocked reply"""
    charon_record = {'projectid': 'P100000'}
    mock_get.return_value = charon_record
    resolved_id = get_project_id_from_name(self.project_name)
    self.assertEqual(self.project_id, resolved_id)
def main(args):
    """Create a synthetic flowcell, based on a fixed template flowcell, and register it in Charon.

    A flowcell directory with a randomly generated name is created below the
    module-level ``data_folder``. The fastq files of the template samples are
    hard-linked into it (``args.sample_cov == 0``) or passed to ``downsample``
    (otherwise), and a matching sample / libprep / seqrun record is posted to
    Charon for every generated sample. If the target project is unknown to
    Charon it is created first.

    :param args: Parsed command-line arguments. The attributes read are
                 ``rnd_project_name`` (required, e.g. "M.Kaller_14_06"),
                 ``restrict_to_sample`` (template sample id; empty = use all four) and
                 ``sample_cov`` (0 = hard-link the template files unchanged)

    :returns: 1 if the run is aborted early (flowcell name collision or missing
              project name), otherwise None
    """
    originalProject = {
        "fc_dir": "/proj/a2010002/INBOX/140702_D00415_0052_AC41A2ANXX/",
        "fc_name": "140702_D00415_0052_AC41A2ANXX",
        "fc_id": "C41A2ANXX",
        "project_name": "M.Kaller_14_06",
        "project_name_ill": "M__Kaller_14_06",
        "project_id": "P1171",
        "samples_id": ["102", "104", "106", "108"],
    }
    ##create random
    rnd_fc_id_noplate = id_generator_digits_chars(9)
    rnd_fc_id = "A{}".format(rnd_fc_id_noplate)
    rnd_instrument = id_generator_digits_chars(6)
    rnd_date = id_generator_digits(6)
    rnd_fc_name = "{}_{}_{}_{}".format(rnd_date, rnd_instrument, id_generator_digits(4), rnd_fc_id)
    rnd_fc_path = os.path.join(data_folder, rnd_fc_name)
    if os.path.isdir(rnd_fc_path):
        print("flowcell name already exists: bad luck!!!! Abort")
        return 1

    rnd_project_name = args.rnd_project_name
    # Value test instead of the former identity test (`is ""`); also rejects None
    if not rnd_project_name:
        print("error project-name must be specified (something like M.Kaller_14_06)")
        return 1

    charon_session = CharonSession()
    rndProject = {}
    try:
        rnd_project_id = get_project_id_from_name(rnd_project_name)
    except (RuntimeError, ValueError):
        print(" project does not exits on Charon, creating it")
        rnd_project_id = "P{}".format(id_generator_digits(4))
        rndProject["project_id"] = rnd_project_id
        rndProject["project_name"] = rnd_project_name
        base_url = charon_session.construct_charon_url("project")
        project_dict = {
            "projectid": rndProject["project_id"],
            "name": rndProject["project_name"],
            "status": "SEQUENCED",
            "pipeline": "NGI",
            "best_practice_analysis": "IGN",
            "sequencing_facility": "NGI-S",
        }
        # create the project on charon
        charon_session.post(base_url, json.dumps(project_dict))
    else:
        rndProject["project_id"] = rnd_project_id
        rndProject["project_name"] = rnd_project_name

    rndProject["fc_dir"] = rnd_fc_path
    rndProject["fc_name"] = rnd_fc_name
    rndProject["fc_id"] = rnd_fc_id
    rndProject["project_name_ill"] = rnd_project_name.replace(".", "__")
    rndProject["samples_id"] = ["{}".format(id_generator_digits(3)) for _ in range(4)]
    # Value test instead of the former identity test (`is not ""`); None means "no restriction"
    if args.restrict_to_sample:
        originalProject["samples_id"] = [args.restrict_to_sample]
        rndProject["samples_id"] = ["{}".format(id_generator_digits(3))]

    # TODO: check that this project does not already exists on charon
    os.mkdir(rnd_fc_path)
    # parse SampleSheet_16bp.csv
    parse_sample_sheet("SampleSheet_16bp.csv", originalProject, rndProject)
    # parse SampleSheet.csv
    parse_sample_sheet("SampleSheet.csv", originalProject, rndProject)
    createDir(rndProject["fc_dir"], "Data")
    createDir(rndProject["fc_dir"], "InterOp")
    # Unaligned
    createDir(rndProject["fc_dir"], "Unaligned")
    Unaligned_dir = os.path.join(rndProject["fc_dir"], "Unaligned")
    BaseCall_stats_dir = "Basecall_Stats_{}".format(rndProject["fc_id"])
    createDir(Unaligned_dir, BaseCall_stats_dir)
    # I do not need to copy the file... I hope as it is madness parse them
    # Unaligned_16bp
    createDir(rndProject["fc_dir"], "Unaligned_16bp")
    Unaligned_path = os.path.join(rndProject["fc_dir"], "Unaligned_16bp")
    createDir(Unaligned_path, BaseCall_stats_dir)
    Project_dir = "Project_{}".format(rndProject["project_name_ill"])
    createDir(Unaligned_path, Project_dir)
    # need to create samples now
    Project_path = os.path.join(Unaligned_path, Project_dir)

    # Number of read pairs to keep per lane when downsampling; it depends only
    # on the requested coverage, so it is computed once for all samples.
    # NOTE(review): 3200000000 and 125 look like genome size and read length -- confirm
    pairs_to_extract_per_lane = 0
    if args.sample_cov > 0:
        # I know that I have 8 lanes
        reads_to_extract = (args.sample_cov * 3200000000) / 125
        pairs_to_extract = reads_to_extract / 2
        pairs_to_extract_per_lane = pairs_to_extract / 8

    originalProject_dir = "Project_{}".format(originalProject["project_name_ill"])
    # Template samples and generated samples are paired by position
    for originalSample, rndSample in zip(originalProject["samples_id"], rndProject["samples_id"]):
        rnd_sample_name = "{}_{}".format(rndProject["project_id"], rndSample)
        original_sample_name = "{}_{}".format(originalProject["project_id"], originalSample)
        sample_dir = "Sample_{}".format(rnd_sample_name)
        createDir(Project_path, sample_dir)
        Sample_path = os.path.join(Project_path, sample_dir)
        # now hard link or sub-samples fastq files
        originalSampleDir = "Sample_{}".format(original_sample_name)
        originalSamplePath = os.path.join(originalProject["fc_dir"], "Unaligned_16bp",
                                          originalProject_dir, originalSampleDir)
        ##create new sample
        sample_url = charon_session.construct_charon_url("sample", rndProject["project_id"])
        sample_dict = {
            "sampleid": rnd_sample_name,
            "status": "NEW",
            "received": "2014-04-17",
            "qc_status": "NEW",
            "genotyping_status": None,
            "genotyping_concordance": None,
            "lims_initial_qc": "Passed",
            "total_autosomal_coverage": 0,
            "total_sequenced_reads": 0,
        }
        charon_session.post(sample_url, json.dumps(sample_dict))
        # create new library prep
        libprep_url = charon_session.construct_charon_url("libprep", rndProject["project_id"],
                                                          rnd_sample_name)
        libprep_dict = {"libprepid": "A", "limsid": "24-44506", "status": "NEW"}
        charon_session.post(libprep_url, json.dumps(libprep_dict))
        # create seq run
        seqrun_url = charon_session.construct_charon_url("seqrun", rndProject["project_id"],
                                                         rnd_sample_name, "A")
        seqrun_dict = {
            "seqrunid": rnd_fc_name,
            "sequencing_status": "DONE",
            # 'mean_autosomal_coverage' : 0
        }
        charon_session.post(seqrun_url, json.dumps(seqrun_dict))

        for fastq in listdir(originalSamplePath):
            if not (isfile(join(originalSamplePath, fastq)) and fastq.endswith("fastq.gz")):
                continue
            originalFastq = os.path.join(originalSamplePath, fastq)
            # Same file name, with the template sample name swapped for the generated one
            rndFastqName = fastq.replace(original_sample_name, rnd_sample_name)
            rndFastq = os.path.join(Sample_path, rndFastqName)
            if args.sample_cov == 0:
                os.link(originalFastq, rndFastq)
            else:
                downsample(originalFastq, rndFastq, pairs_to_extract_per_lane)

    createDir(Unaligned_dir, "Temp")  # I try to not consider these guys here
    createDir(Unaligned_dir, "Undetermined_indices")  # I try to not consider these guys here
    produceRunInfo(rndProject["fc_dir"], rnd_fc_name, rnd_fc_id_noplate, rnd_instrument, rnd_date)
    # runParameters.xml is taken unchanged from the template flowcell
    os.link(os.path.join(originalProject["fc_dir"], "runParameters.xml"),
            os.path.join(rnd_fc_path, "runParameters.xml"))
from ngi_pipeline.database.communicate import get_project_id_from_name

if __name__ == "__main__":
    # Exit with status 0 when the coverage recorded in Charon for the sample is
    # at least the required coverage, and with status 1 otherwise (including
    # when the coverage cannot be determined).
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--project", required=True)
    parser.add_argument("-s", "--sample", required=True)
    parser.add_argument("-c", "--coverage", type=int, required=True, dest="required_coverage")
    args = parser.parse_args()

    project = args.project
    sample = args.sample
    required_coverage = args.required_coverage

    charon_session = CharonSession()
    try:
        reported_coverage = charon_session.sample_get(project, sample).get("total_autosomal_coverage")
    except CharonError:
        # Retry with the value translated through get_project_id_from_name
        # (presumably a project *name* was given instead of an id -- confirm).
        # The retried lookup is inside the try as well, so a second Charon
        # failure is reported instead of escaping as a traceback.
        try:
            project = get_project_id_from_name(project)
            reported_coverage = charon_session.sample_get(project, sample).get("total_autosomal_coverage")
        except (CharonError, RuntimeError, ValueError) as e:
            print(('ERROR: Could not determine coverage for project {} / sample '
                   '{}: {}'.format(project, sample, e)), file=sys.stderr)
            reported_coverage = 0

    # "or 0": a record without a coverage value counts as zero coverage
    if int(reported_coverage or 0) >= int(required_coverage):
        sys.exit(0)
    else:
        sys.exit(1)
type=int, required=True, dest="required_coverage") args = parser.parse_args() project = args.project sample = args.sample required_coverage = args.required_coverage charon_session = CharonSession() try: reported_coverage = charon_session.sample_get( project, sample).get("total_autosomal_coverage") except CharonError as e: try: project = get_project_id_from_name(project) except (CharonError, RuntimeError, ValueError) as e: print( ('ERROR: Could not determine coverage for project {} / sample ' '{}: {}'.format(project, sample, e)), file=sys.stderr) reported_coverage = 0 else: reported_coverage = charon_session.sample_get( project, sample).get("total_autosomal_coverage") if int(reported_coverage) >= int(required_coverage): sys.exit(0) else: sys.exit(1)
def main(args):
    """Build a randomly named test flowcell from a reference flowcell and register it in Charon.

    A new flowcell directory is created under ``data_folder``, mirroring the
    layout of the hard-coded reference flowcell. If the requested project name
    cannot be resolved through ``get_project_id_from_name`` a new project with
    a random id is posted to Charon. For every sample a sample entry, a libprep
    "A" and a seqrun are posted to Charon, and the reference fastq files are
    either hard-linked (``args.sample_cov == 0``) or handed to ``downsample``.

    :param args: parsed command-line options; the attributes read here are
                 ``rnd_project_name``, ``restrict_to_sample`` and ``sample_cov``
    :returns: 1 when aborting early (the random flowcell directory already
              exists, or no project name was given), otherwise None
    """
    originalProject = {
        "fc_dir": "/proj/a2010002/INBOX/140702_D00415_0052_AC41A2ANXX/",
        "fc_name": "140702_D00415_0052_AC41A2ANXX",
        "fc_id": "C41A2ANXX",
        "project_name": "M.Kaller_14_06",
        "project_name_ill": "M__Kaller_14_06",
        "project_id": "P1171",
        "samples_id": ["102", "104", "106", "108"],
    }

    # Random identifiers for the new flowcell
    rnd_fc_id_noplate = id_generator_digits_chars(9)
    rnd_fc_id = "A{}".format(rnd_fc_id_noplate)
    rnd_instrument = id_generator_digits_chars(6)
    rnd_date = id_generator_digits(6)
    rnd_fc_name = "{}_{}_{}_{}".format(rnd_date, rnd_instrument, id_generator_digits(4), rnd_fc_id)
    rnd_fc_path = os.path.join(data_folder, rnd_fc_name)
    if os.path.isdir(rnd_fc_path):
        print("flowcell name already exists: bad luck!!!! Abort")
        return 1

    rnd_project_name = args.rnd_project_name
    # Compare by value, not identity: `is ""` only works by accident of string
    # interning. A missing (None) name is rejected the same way.
    if not rnd_project_name:
        print("error project-name must be specified (something like M.Kaller_14_06)")
        return 1

    charon_session = CharonSession()
    rndProject = {}
    try:
        rnd_project_id = get_project_id_from_name(rnd_project_name)
        rndProject["project_id"] = rnd_project_id
        rndProject["project_name"] = rnd_project_name
    except (RuntimeError, ValueError):
        print(" project does not exits on Charon, creating it")
        rnd_project_id = "P{}".format(id_generator_digits(4))
        rndProject["project_id"] = rnd_project_id
        rndProject["project_name"] = rnd_project_name
        base_url = charon_session.construct_charon_url("project")
        project_dict = {'projectid': rndProject["project_id"],
                        'name': rndProject["project_name"],
                        'status': 'SEQUENCED',
                        'pipeline': 'NGI',
                        'best_practice_analysis': 'IGN',
                        'sequencing_facility': 'NGI-S'
                        }
        # Create the project on Charon
        charon_session.post(base_url, json.dumps(project_dict))

    rndProject["fc_dir"] = rnd_fc_path
    rndProject["fc_name"] = rnd_fc_name
    rndProject["fc_id"] = rnd_fc_id
    rndProject["project_name_ill"] = rnd_project_name.replace(".", "__")
    rndProject["samples_id"] = ["{}".format(id_generator_digits(3)),
                                "{}".format(id_generator_digits(3)),
                                "{}".format(id_generator_digits(3)),
                                "{}".format(id_generator_digits(3))]
    # Value comparison here as well; an empty or missing restriction means
    # "use all reference samples".
    if args.restrict_to_sample:
        originalProject["samples_id"] = [args.restrict_to_sample]
        rndProject["samples_id"] = ["{}".format(id_generator_digits(3))]

    # TODO: check that this project does not already exists on charon
    os.mkdir(rnd_fc_path)
    # Rewrite both sample sheets for the new flowcell
    parse_sample_sheet("SampleSheet_16bp.csv", originalProject, rndProject)
    parse_sample_sheet("SampleSheet.csv", originalProject, rndProject)
    createDir(rndProject["fc_dir"], "Data")
    createDir(rndProject["fc_dir"], "InterOp")

    BaseCall_stats_dir = "Basecall_Stats_{}".format(rndProject["fc_id"])

    # Unaligned: only the directory skeleton is created, the stats files are
    # deliberately not copied
    createDir(rndProject["fc_dir"], "Unaligned")
    Unaligned_dir = os.path.join(rndProject["fc_dir"], "Unaligned")
    createDir(Unaligned_dir, BaseCall_stats_dir)

    # Unaligned_16bp
    createDir(rndProject["fc_dir"], "Unaligned_16bp")
    Unaligned_path = os.path.join(rndProject["fc_dir"], "Unaligned_16bp")
    createDir(Unaligned_path, BaseCall_stats_dir)
    Project_dir = "Project_{}".format(rndProject["project_name_ill"])
    createDir(Unaligned_path, Project_dir)
    Project_path = os.path.join(Unaligned_path, Project_dir)

    # Reference and random sample lists always have the same length, so walk
    # them in lockstep.
    for originalSample, rndSample in zip(originalProject["samples_id"], rndProject["samples_id"]):
        rnd_sample_id = "{}_{}".format(rndProject["project_id"], rndSample)
        sample_dir = "Sample_{}".format(rnd_sample_id)
        createDir(Project_path, sample_dir)
        Sample_path = os.path.join(Project_path, sample_dir)

        # Location of the reference fastq files to hard link or sub-sample
        originalProject_dir = "Project_{}".format(originalProject["project_name_ill"])
        originalSampleDir = "Sample_{}_{}".format(originalProject["project_id"], originalSample)
        originalSamplePath = os.path.join(originalProject["fc_dir"], "Unaligned_16bp",
                                          originalProject_dir, originalSampleDir)
        pairs_to_extract_per_lane = 0

        # Create the new sample
        sample_url = charon_session.construct_charon_url("sample", rndProject["project_id"])
        sample_dict = {'sampleid': rnd_sample_id,
                       'status': 'NEW',
                       'received': '2014-04-17',
                       'qc_status': 'NEW',
                       'genotyping_status': None,
                       'genotyping_concordance': None,
                       'lims_initial_qc': 'Passed',
                       'total_autosomal_coverage': 0,
                       'total_sequenced_reads': 0
                       }
        charon_session.post(sample_url, json.dumps(sample_dict))

        # Create the new library prep
        libprep_url = charon_session.construct_charon_url("libprep", rndProject["project_id"],
                                                          rnd_sample_id)
        libprep_dict = {'libprepid': "A",
                        'limsid': '24-44506',
                        'status': 'NEW'
                        }
        charon_session.post(libprep_url, json.dumps(libprep_dict))

        # Create the seq run
        seqrun_url = charon_session.construct_charon_url("seqrun", rndProject["project_id"],
                                                         rnd_sample_id, "A")
        seqrun_dict = {'seqrunid': rnd_fc_name,
                       'sequencing_status': 'DONE',
                       # 'mean_autosomal_coverage' : 0
                       }
        charon_session.post(seqrun_url, json.dumps(seqrun_dict))

        if args.sample_cov > 0:
            # The reference flowcell has 8 lanes.
            # NOTE(review): 3200000000 and 125 look like genome size and read
            # length -- confirm.
            reads_to_extract = (args.sample_cov * 3200000000) / 125
            pairs_to_extract = reads_to_extract / 2
            pairs_to_extract_per_lane = pairs_to_extract / 8

        original_prefix = "{}_{}".format(originalProject["project_id"], originalSample)
        for fastq in listdir(originalSamplePath):
            if not (isfile(join(originalSamplePath, fastq)) and fastq.endswith("fastq.gz")):
                continue
            originalFastq = os.path.join(originalSamplePath, fastq)
            rndFastqName = fastq.replace(original_prefix, rnd_sample_id)
            rndFastq = os.path.join(Sample_path, rndFastqName)
            if args.sample_cov == 0:
                os.link(originalFastq, rndFastq)
            else:
                downsample(originalFastq, rndFastq, pairs_to_extract_per_lane)

    # These two are created empty; their content is not reproduced
    createDir(Unaligned_dir, "Temp")
    createDir(Unaligned_dir, "Undetermined_indices")
    produceRunInfo(rndProject["fc_dir"], rnd_fc_name, rnd_fc_id_noplate, rnd_instrument, rnd_date)
    os.link("/proj/a2010002/INBOX/140702_D00415_0052_AC41A2ANXX/runParameters.xml",
            os.path.join(rnd_fc_path, "runParameters.xml"))
def test_get_project_id_from_name(self):
    """The id resolved from the fixture's project name equals the fixture's project id."""
    expected_id = self.project_id
    resolved_id = get_project_id_from_name(self.project_name)
    self.assertEqual(expected_id, resolved_id)
def recreate_project_from_filesystem(project_dir,
                                     restrict_to_samples=None,
                                     restrict_to_libpreps=None,
                                     restrict_to_seqruns=None):
    """Recreates the full project/sample/libprep/seqrun set of
    NGIObjects using the directory tree structure.

    The tree is walked as project_dir/<sample>/<libprep>/<seqrun>/, where
    seqrun directories must match the glob "*_*_*_*", and every fastq file
    found in a seqrun directory is added to the corresponding seqrun object.

    :param str project_dir: Path to the project directory; its last path
                            component is used as the project name
    :param list restrict_to_samples: If non-empty, only these sample names are included
    :param list restrict_to_libpreps: If non-empty, only these libprep names are included
    :param list restrict_to_seqruns: If non-empty, only these seqrun names are included

    :returns: The project object populated with samples, libpreps and seqruns
    :rtype: NGIProject

    :raises CharonError: If the project id could not be determined from the project name
    """
    restrict_to_samples = restrict_to_samples or []
    restrict_to_libpreps = restrict_to_libpreps or []
    restrict_to_seqruns = restrict_to_seqruns or []

    # Compiled once rather than once per seqrun; raw string so that "\." is
    # passed to the regex engine instead of being parsed as a string escape.
    fastq_pattern = re.compile(r".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")

    base_path, project_name = os.path.split(project_dir)
    if not project_name:
        # project_dir ended with a path separator
        base_path, project_name = os.path.split(base_path)
    LOG.info('Setting up project "{}"'.format(project_name))
    try:
        # This requires Charon access -- maps e.g. "Y.Mom_14_01" to "P123"
        project_id = get_project_id_from_name(project_name)
    # Should handle requests.exceptions.Timeout in Charon classes
    except (CharonError, ValueError, Timeout) as e:
        error_msg = ('Cannot proceed with project "{}" due to '
                     'Charon-related error: {}'.format(project_name, e))
        raise CharonError(error_msg)
    project_obj = NGIProject(name=project_name,
                             dirname=project_name,
                             project_id=project_id,
                             base_path=base_path)

    # Directory listings are built as real lists (not filter() objects) so
    # that the emptiness checks below also work where filter() is lazy.
    samples = [path for path in glob.glob(os.path.join(project_dir, "*"))
               if os.path.isdir(path)]
    if not samples:
        LOG.warn('No samples found for project "{}"'.format(project_obj))
    for sample_dir in samples:
        sample_name = os.path.basename(sample_dir)
        if restrict_to_samples and sample_name not in restrict_to_samples:
            LOG.debug('Skipping sample "{}": not in specified samples "{}"'.format(sample_name, ', '.join(restrict_to_samples)))
            continue
        LOG.info('Setting up sample "{}"'.format(sample_name))
        sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)

        libpreps = [path for path in glob.glob(os.path.join(sample_dir, "*"))
                    if os.path.isdir(path)]
        if not libpreps:
            LOG.warn('No libpreps found for sample "{}"'.format(sample_obj))
        for libprep_dir in libpreps:
            libprep_name = os.path.basename(libprep_dir)
            if restrict_to_libpreps and libprep_name not in restrict_to_libpreps:
                LOG.debug('Skipping libprep "{}": not in specified libpreps "{}"'.format(libprep_name, ', '.join(restrict_to_libpreps)))
                continue
            LOG.info('Setting up libprep "{}"'.format(libprep_name))
            libprep_obj = sample_obj.add_libprep(name=libprep_name,
                                                 dirname=libprep_name)

            seqruns = [path for path in glob.glob(os.path.join(libprep_dir, "*_*_*_*"))
                       if os.path.isdir(path)]
            if not seqruns:
                LOG.warn('No seqruns found for libprep "{}"'.format(libprep_obj))
            for seqrun_dir in seqruns:
                seqrun_name = os.path.basename(seqrun_dir)
                if restrict_to_seqruns and seqrun_name not in restrict_to_seqruns:
                    LOG.debug('Skipping seqrun "{}": not in specified seqruns "{}"'.format(seqrun_name, ', '.join(restrict_to_seqruns)))
                    continue
                LOG.info('Setting up seqrun "{}"'.format(seqrun_name))
                seqrun_obj = libprep_obj.add_seqrun(name=seqrun_name,
                                                    dirname=seqrun_name)
                all_files = glob.glob(os.path.join(seqrun_dir, "*"))
                fastq_files = [path for path in all_files
                               if fastq_pattern.match(path) and os.path.isfile(path)]
                for fq_file in fastq_files:
                    fq_name = os.path.basename(fq_file)
                    LOG.info('Adding fastq file "{}" to seqrun "{}"'.format(fq_name, seqrun_obj))
                    seqrun_obj.add_fastq_files([fq_name])
    return project_obj