def archive_to_swestore(seconds, run=None, max_runs=None, force=False, compress_only=False): """Send runs (as archives) in NAS nosync to swestore for backup :param int seconds: Days/hours converted as seconds to check :param str run: specific run to send swestore :param int max_runs: number of runs to be processed simultaneously :param bool force: Force the archiving even if the run is not complete :param bool compress_only: Compress the run without sending it to swestore """ # If the run is specified in the command line, check that exists and archive if run: run = os.path.basename(run) base_dir = os.path.dirname(run) if re.match(filesystem.RUN_RE, run): # If the parameter is not an absolute path, find the run in the archive_dirs if not base_dir: for archive_dir in CONFIG.get('storage').get('archive_dirs'): if os.path.exists(os.path.join(archive_dir, run)): base_dir = archive_dir if not os.path.exists(os.path.join(base_dir, run)): logger.error(("Run {} not found. Please make sure to specify " "the absolute path or relative path being in " "the correct directory.".format(run))) else: with filesystem.chdir(base_dir): _archive_run((run, seconds, force, compress_only)) else: logger.error( "The name {} doesn't look like an Illumina run".format( os.path.basename(run))) # Otherwise find all runs in every data dir on the nosync partition else: logger.info("Archiving old runs to SWESTORE") for to_send_dir in CONFIG.get('storage').get('archive_dirs'): logger.info('Checking {} directory'.format(to_send_dir)) with filesystem.chdir(to_send_dir): to_be_archived = [ r for r in os.listdir(to_send_dir) if re.match(filesystem.RUN_RE, r) and not os.path.exists("{}.archiving".format(r.split('.')[0])) ] if to_be_archived: pool = Pool(processes=len(to_be_archived ) if not max_runs else max_runs) pool.map_async(_archive_run, ((run, seconds, force, compress_only) for run in to_be_archived)) pool.close() pool.join() else: logger.info('No old runs to be archived')
def demultiplex_run(self): """ Demultiplex a Xten run: - find the samplesheet - make a local copy of the samplesheet and name it SampleSheet.csv - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep) - run bcl2fastq conversion """ #we have 10x lane - need to split the samples sheet and build a 10x command for bcl2fastq Complex_run = False if len(self.lanes_10X) and len(self.lanes_not_10X): Complex_run = True if Complex_run: with chdir(self.run_dir): samplesheet_dest_not_10X="SampleSheet_0.csv" with open(samplesheet_dest_not_10X, 'wb') as fcd: fcd.write(_generate_samplesheet_subset(self.runParserObj.samplesheet, self.lanes_not_10X)) samplesheet_dest_10X="SampleSheet_1.csv" with open(samplesheet_dest_10X, 'wb') as fcd: fcd.write(_generate_samplesheet_subset(self.runParserObj.samplesheet, self.lanes_10X)) else: with chdir(self.run_dir): samplesheet_dest="SampleSheet_0.csv" with open(samplesheet_dest, 'wb') as fcd: fcd.write(_generate_samplesheet_subset(self.runParserObj.samplesheet, (self.lanes_10X or self.lanes_not_10X))) per_lane_base_masks = self._generate_per_lane_base_mask() max_different_base_masks = max([len(per_lane_base_masks[base_masks]) for base_masks in per_lane_base_masks]) if max_different_base_masks > 1: # in a HiSeqX run I cannot have different index sizes in the SAME lane logger.error("In FC {} found one or more lane with more than one base mask (i.e., different index sizes in \ in the same lane".format(self.id)) return False bcl2fastq_cmd_counter = 0 with chdir(self.run_dir): # create Demultiplexing dir, this changes the status to IN_PROGRESS if not os.path.exists("Demultiplexing"): os.makedirs("Demultiplexing") with chdir(self.run_dir): if self.lanes_not_10X: cmd_normal = self.generate_bcl_command(self.lanes_not_10X, bcl2fastq_cmd_counter) misc.call_external_command_detached(cmd_normal, with_log_files = True, prefix="demux_{}".format(bcl2fastq_cmd_counter)) logger.info(("BCL to FASTQ conversion and demultiplexing started for " "normal run {} on {}".format(os.path.basename(self.id), datetime.now()))) bcl2fastq_cmd_counter += 1 if self.lanes_10X: cmd_10X = self.generate_bcl_command(self.lanes_10X, bcl2fastq_cmd_counter, is_10X = True) misc.call_external_command_detached(cmd_10X, with_log_files = True, prefix="demux_{}".format(bcl2fastq_cmd_counter)) logger.info(("BCL to FASTQ conversion and demultiplexing started for " "10X run {} on {}".format(os.path.basename(self.id), datetime.now()))) bcl2fastq_cmd_counter += 1 return True
def archive_to_swestore(days, run=None, max_runs=None, force=False, compress_only=False): """Send runs (as archives) in NAS nosync to swestore for backup :param int days: number fo days to check threshold :param str run: specific run to send swestore :param int max_runs: number of runs to be processed simultaneously :param bool force: Force the archiving even if the run is not complete :param bool compress_only: Compress the run without sending it to swestore """ # If the run is specified in the command line, check that exists and archive if run: run = os.path.basename(run) base_dir = os.path.dirname(run) if re.match(filesystem.RUN_RE, run): # If the parameter is not an absolute path, find the run in the archive_dirs if not base_dir: for archive_dir in CONFIG.get("storage").get("archive_dirs"): if os.path.exists(os.path.join(archive_dir, run)): base_dir = archive_dir if not os.path.exists(os.path.join(base_dir, run)): logger.error( ( "Run {} not found. Please make sure to specify " "the absolute path or relative path being in " "the correct directory.".format(run) ) ) else: with filesystem.chdir(base_dir): _archive_run((run, days, force, compress_only)) else: logger.error("The name {} doesn't look like an Illumina run".format(os.path.basename(run))) # Otherwise find all runs in every data dir on the nosync partition else: logger.info("Archiving old runs to SWESTORE") for to_send_dir in CONFIG.get("storage").get("archive_dirs"): logger.info("Checking {} directory".format(to_send_dir)) with filesystem.chdir(to_send_dir): to_be_archived = [ r for r in os.listdir(to_send_dir) if re.match(filesystem.RUN_RE, r) and not os.path.exists("{}.archiving".format(r.split(".")[0])) ] if to_be_archived: pool = Pool(processes=len(to_be_archived) if not max_runs else max_runs) pool.map_async(_archive_run, ((run, days, force, compress_only) for run in to_be_archived)) pool.close() pool.join() else: logger.info("No old runs to be archived")
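# Hypothetical sketch of a worker compatible with the Pool.map_async() dispatch above.
# The real _archive_run used by this module is not shown in this section, so the name,
# flag handling and tarball step below are assumptions for illustration only.
import os
import tarfile

def _archive_run_sketch(args):
    """Receive a single (run, days, force, compress_only) tuple, as map_async passes it."""
    run, days, force, compress_only = args
    flag = "{}.archiving".format(run.split('.')[0])
    open(flag, 'w').close()  # mark the run as being archived
    try:
        with tarfile.open("{}.tar.bz2".format(run), 'w:bz2') as tar:
            tar.add(run)
        # a real worker would verify the archive here and, unless compress_only is set,
        # send it to swestore before cleaning up
    finally:
        os.remove(flag)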
def cleanup_processing(days): """Cleanup runs in processing server. :param int days: Number of days to consider a run to be old """ transfer_file = os.path.join(CONFIG.get("preprocessing", {}).get("status_dir"), "transfer.tsv") if not days: days = CONFIG.get("cleanup", {}).get("processing-server", {}).get("days", 10) try: # Move finished runs to nosync for data_dir in CONFIG.get("storage").get("data_dirs"): logger.info("Moving old runs in {}".format(data_dir)) with filesystem.chdir(data_dir): for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]: if filesystem.is_in_file(transfer_file, run): logger.info("Moving run {} to nosync directory".format(os.path.basename(run))) shutil.move(run, "nosync") else: logger.info( ("Run {} has not been transferred to the analysis " "server yet, not archiving".format(run)) ) # Remove old runs from archiving dirs for archive_dir in CONFIG.get("storage").get("archive_dirs").values(): logger.info("Removing old runs in {}".format(archive_dir)) with filesystem.chdir(archive_dir): for run in [r for r in os.listdir(archive_dir) if re.match(filesystem.RUN_RE, r)]: rta_file = os.path.join(run, "RTAComplete.txt") if os.path.exists(rta_file): # 1 day == 60*60*24 seconds --> 86400 if os.stat(rta_file).st_mtime < time.time() - (86400 * days) and filesystem.is_in_swestore( "{}.tar.bz2".format(run) ): logger.info("Removing run {} to nosync directory".format(os.path.basename(run))) shutil.rmtree(run) else: logger.info( "RTAComplete.txt file exists but is not older than {} day(s), skipping run {}".format( str(days), run ) ) except IOError: sbj = "Cannot archive old runs in processing server" msg = "Could not find transfer.tsv file, so I cannot decide if I should " "archive any run or not." cnt = CONFIG.get("contact", None) if not cnt: cnt = "{}@localhost".format(getpass.getuser()) logger.error(msg) misc.send_mail(sbj, msg, cnt)
def demultiplex(self):
    """Perform demultiplexing of the flowcell.

    Takes the software (bcl2fastq version to use) and parameters from the configuration file.
    """
    logger.info('Building bcl2fastq command')
    config = CONFIG['analysis']
    with chdir(self.run_dir):
        cl = [config.get('bcl2fastq').get(self.run_type)]
        if 'options' in config['bcl2fastq']:
            cl_options = config['bcl2fastq']['options']
            # Append all options that appear in the configuration file to the main command.
            # Options that require a value, e.g. --use-bases-mask Y8,I8,Y8, are given as a
            # dictionary, while options that don't require a value, e.g. --no-lane-splitting,
            # are given as a simple string.
            for option in cl_options:
                if isinstance(option, dict):
                    opt, val = option.popitem()
                    cl.extend(['--{}'.format(opt), str(val)])
                else:
                    cl.append('--{}'.format(option))
        logger.info("BCL to FASTQ conversion and demultiplexing started for "
                    "run {} on {}".format(os.path.basename(self.id), datetime.now()))
        misc.call_external_command_detached(cl, with_log_files=True)
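# Standalone illustration of the option expansion above: dict entries become
# "--flag value" pairs, plain strings become bare "--flag" switches.
# The option values here are invented for the example.
cl = ['bcl2fastq']
cl_options = [{'use-bases-mask': 'Y8,I8,Y8'}, 'no-lane-splitting']
for option in cl_options:
    if isinstance(option, dict):
        opt, val = option.popitem()
        cl.extend(['--{}'.format(opt), str(val)])
    else:
        cl.append('--{}'.format(option))
print(cl)  # ['bcl2fastq', '--use-bases-mask', 'Y8,I8,Y8', '--no-lane-splitting']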
def cleanup_processing(seconds): """Cleanup runs in processing server. :param int seconds: Days/hours converted as second to consider a run to be old """ try: #Remove old runs from archiving dirs for archive_dir in CONFIG.get('storage').get('archive_dirs').values(): logger.info('Removing old runs in {}'.format(archive_dir)) with filesystem.chdir(archive_dir): for run in [r for r in os.listdir(archive_dir) if re.match(filesystem.RUN_RE, r)]: rta_file = os.path.join(run, finished_run_indicator) if os.path.exists(rta_file): if os.stat(rta_file).st_mtime < time.time() - seconds: logger.info('Removing run {} to nosync directory'.format(os.path.basename(run))) shutil.rmtree(run) else: logger.info('{} file exists but is not older than given time, skipping run {}'.format( finished_run_indicator, run)) except IOError: sbj = "Cannot archive old runs in processing server" msg = ("Could not find transfer.tsv file, so I cannot decide if I should " "archive any run or not.") cnt = CONFIG.get('contact', None) if not cnt: cnt = "{}@localhost".format(getpass.getuser()) logger.error(msg) misc.send_mail(sbj, msg, cnt)
def generate_bcl_command(self, lanes, bcl2fastq_cmd_counter, is_10X=False): #I have everything to run demultiplexing now. logger.info('Building a bcl2fastq command') per_lane_base_masks = self._generate_per_lane_base_mask() with chdir(self.run_dir): cl = [self.CONFIG.get('bcl2fastq')['bin']] output_dir = "Demultiplexing_{}".format(bcl2fastq_cmd_counter) cl.extend(["--output-dir", output_dir]) if not os.path.exists(output_dir): os.makedirs(output_dir) cl_options = [] if self.CONFIG.get('bcl2fastq').has_key('options'): for option in self.CONFIG['bcl2fastq']['options']: cl_options.extend([option]) # Add the extra 10X command options if we have a 10X run if is_10X: cl_options.extend(self.CONFIG['bcl2fastq']['options_10X']) # Append all options that appear in the configuration file to the main command. for option in cl_options: if isinstance(option, dict): opt, val = option.items()[0] if "output-dir" not in opt: cl.extend(['--{}'.format(opt), str(val)]) else: cl.append('--{}'.format(option)) cl.extend(["--sample-sheet", os.path.join(os.path.join(self.run_dir, "SampleSheet_{}.csv".format(bcl2fastq_cmd_counter)))]) #now add the base_mask for each lane for lane in sorted(lanes): #Iterate thorugh each lane and add the correct --use-bases-mask for that lane base_mask = [per_lane_base_masks[lane][bm]['base_mask'] for bm in per_lane_base_masks[lane]][0] # get the base_mask base_mask_expr = "{}:".format(lane) + ",".join(base_mask) cl.extend(["--use-bases-mask", base_mask_expr]) return cl
def create_report(self): """ Create a sample report and an aggregate report via a system call """ logprefix = os.path.abspath( self.expand_path(os.path.join(self.logpath, "{}-{}".format( self.projectid, self.sampleid)))) try: if not create_folder(os.path.dirname(logprefix)): logprefix = None except AttributeError: logprefix = None with chdir(self.expand_path(self.reportpath)): # create the ign_sample_report for this sample cl = self.report_sample.split(' ') cl.extend(["--samples",self.sampleid]) call_external_command( cl, with_log_files=(logprefix is not None), prefix="{}_sample".format(logprefix)) # estimate the delivery date for this sample to 0.5 days ahead cl = self.report_aggregate.split(' ') cl.extend([ "--samples_extra", json.dumps({ self.sampleid: { "delivered": "{}(expected)".format( _timestamp(days=0.5))}}) ]) call_external_command( cl, with_log_files=(logprefix is not None), prefix="{}_aggregate".format(logprefix))
def create_report(self): """ Create a sample report and an aggregate report via a system call """ logprefix = os.path.abspath( self.expand_path( os.path.join(self.logpath, "{}-{}".format(self.projectid, self.sampleid)))) try: if not create_folder(os.path.dirname(logprefix)): logprefix = None except AttributeError: logprefix = None with chdir(self.expand_path(self.reportpath)): # create the ign_sample_report for this sample cl = self.report_sample.split(' ') cl.extend(["--samples", self.sampleid]) call_external_command(cl, with_log_files=(logprefix is not None), prefix="{}_sample".format(logprefix)) # estimate the delivery date for this sample to 0.5 days ahead cl = self.report_aggregate.split(' ') cl.extend([ "--samples_extra", json.dumps({ self.sampleid: { "delivered": "{}(expected)".format(_timestamp(days=0.5)) } }) ]) call_external_command(cl, with_log_files=(logprefix is not None), prefix="{}_aggregate".format(logprefix))
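# Sketch of the --samples_extra payload built by create_report() above. _timestamp()
# is not shown in this section, so a simple stand-in is used here; the sample id and
# the date format are assumptions for illustration only.
import json
from datetime import datetime, timedelta

def _timestamp_sketch(days=0.5):
    return (datetime.now() + timedelta(days=days)).strftime('%Y-%m-%d %H:%M')

payload = json.dumps({'P123_1001': {'delivered': '{}(expected)'.format(_timestamp_sketch(days=0.5))}})
print(payload)  # e.g. {"P123_1001": {"delivered": "2026-05-01 10:30(expected)"}}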
def cleanup_processing(seconds): """Cleanup runs in processing server. :param int seconds: Days/hours converted as second to consider a run to be old """ try: #Remove old runs from archiving dirs for archive_dir in CONFIG.get('storage').get('archive_dirs').values(): logger.info('Removing old runs in {}'.format(archive_dir)) with filesystem.chdir(archive_dir): for run in [ r for r in os.listdir(archive_dir) if re.match(filesystem.RUN_RE, r) ]: rta_file = os.path.join(run, finished_run_indicator) if os.path.exists(rta_file): if os.stat(rta_file).st_mtime < time.time() - seconds: logger.info( 'Removing run {} to nosync directory'.format( os.path.basename(run))) shutil.rmtree(run) else: logger.info( '{} file exists but is not older than given time, skipping run {}' .format(finished_run_indicator, run)) except IOError: sbj = "Cannot archive old runs in processing server" msg = ( "Could not find transfer.tsv file, so I cannot decide if I should " "archive any run or not.") cnt = CONFIG.get('contact', None) if not cnt: cnt = "{}@localhost".format(getpass.getuser()) logger.error(msg) misc.send_mail(sbj, msg, cnt)
def demultiplex_run(self):
    """ Demultiplex a Xten run:
        - find the samplesheet
        - make a local copy of the samplesheet and name it SampleSheet.csv
        - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep)
        - run bcl2fastq conversion
    """
    ssname = self._get_samplesheet()
    ssparser = SampleSheetParser(ssname)
    # The samplesheet needs to be positioned in the FC directory with the name SampleSheet.csv (Illumina default).
    # If this is not the case then create it and take special care of the modifications to be done on the SampleSheet
    samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
    # Check that the samplesheet is not already present. In that case go to the next step
    if not os.path.exists(samplesheet_dest):
        try:
            with open(samplesheet_dest, 'wb') as fcd:
                fcd.write(_generate_clean_samplesheet(ssparser,
                                                      fields_to_remove=['index2'],
                                                      rename_samples=True,
                                                      rename_qPCR_suffix=True,
                                                      fields_qPCR=['SampleName']))
        except Exception as e:
            logger.error(e)
            return False
        logger.info("Created SampleSheet.csv for Flowcell {} in {}".format(self.id, samplesheet_dest))
    # SampleSheet.csv generated; when demultiplexing, SampleSheet.csv is the one to use
    self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, "SampleSheet.csv"))
    per_lane_base_masks = self._generate_per_lane_base_mask()
    max_different_base_masks = max([len(per_lane_base_masks[base_masks]) for base_masks in per_lane_base_masks])
    if max_different_base_masks > 1:
        # in a HiSeqX run I cannot have different index sizes in the SAME lane
        logger.error("In FC {} found one or more lanes with more than one base mask "
                     "(i.e., different index sizes in the same lane)".format(self.id))
        return False
    # I have everything to run demultiplexing now.
    logger.info('Building bcl2fastq command')
    with chdir(self.run_dir):
        cl = [self.CONFIG.get('bcl2fastq')['bin']]
        if 'options' in self.CONFIG.get('bcl2fastq'):
            cl_options = self.CONFIG['bcl2fastq']['options']
            # Append all options that appear in the configuration file to the main command.
            for option in cl_options:
                if isinstance(option, dict):
                    opt, val = option.items()[0]
                    cl.extend(['--{}'.format(opt), str(val)])
                else:
                    cl.append('--{}'.format(option))
        # Now add the base mask for each lane
        for lane in sorted(per_lane_base_masks):
            # Iterate through each lane and add the correct --use-bases-mask for that lane;
            # there is a single base mask per lane, as checked a few lines above
            base_mask = [per_lane_base_masks[lane][bm]['base_mask'] for bm in per_lane_base_masks[lane]][0]  # get the base_mask
            base_mask_expr = "{}:".format(lane) + ",".join(base_mask)
            cl.extend(["--use-bases-mask", base_mask_expr])
        logger.info("BCL to FASTQ conversion and demultiplexing started for "
                    "run {} on {}".format(os.path.basename(self.id), datetime.now()))
        misc.call_external_command_detached(cl, with_log_files=True)
    return True
def cleanup_nas(seconds): """Will move the finished runs in NASes to nosync directory. :param int seconds: Days/hours converted as second to consider a run to be old """ couch_info = CONFIG.get('statusdb') mail_recipients = CONFIG.get('mail', {}).get('recipients') check_demux = CONFIG.get('storage', {}).get('check_demux', False) host_name = os.getenv('HOSTNAME', os.uname()[1]).split('.', 1)[0] for data_dir in CONFIG.get('storage').get('data_dirs'): logger.info('Moving old runs in {}'.format(data_dir)) with filesystem.chdir(data_dir): for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]: rta_file = os.path.join(run, finished_run_indicator) if os.path.exists(rta_file): if check_demux: if misc.run_is_demuxed(run, couch_info): logger.info('Moving run {} to nosync directory'.format(os.path.basename(run))) shutil.move(run, 'nosync') elif os.stat(rta_file).st_mtime < time.time() - seconds: logger.warn('Run {} is older than given time, but it is not demultiplexed yet' .format(run)) sbt = "Run not demultiplexed - {}".format(run) msg = ("Run '{}' in '{}' is older then given threshold, but seems like it is not " "yet demultiplexed".format(os.path.join(data_dir, run), host_name)) misc.send_mail(sbt, msg, mail_recipients) else: if os.stat(rta_file).st_mtime < time.time() - seconds: logger.info('Moving run {} to nosync directory'.format(os.path.basename(run))) shutil.move(run, 'nosync') else: logger.info('{} file exists but is not older than given time, skipping run {}' .format(finished_run_indicator, run))
def cleanup_processing(days): """Cleanup runs in processing server. :param int days: Number of days to consider a run to be old """ transfer_file = os.path.join(CONFIG.get('preprocessing', {}).get('status_dir'), 'transfer.tsv') if not days: days = CONFIG.get('cleanup', {}).get('processing-server', {}).get('days', 10) try: #Move finished runs to nosync for data_dir in CONFIG.get('storage').get('data_dirs'): logger.info('Moving old runs in {}'.format(data_dir)) with filesystem.chdir(data_dir): for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]: if filesystem.is_in_file(transfer_file, run): logger.info('Moving run {} to nosync directory' .format(os.path.basename(run))) shutil.move(run, 'nosync') else: logger.info(("Run {} has not been transferred to the analysis " "server yet, not archiving".format(run))) #Remove old runs from archiving dirs for archive_dir in CONFIG.get('storage').get('archive_dirs').values(): logger.info('Removing old runs in {}'.format(archive_dir)) with filesystem.chdir(archive_dir): for run in [r for r in os.listdir(archive_dir) if re.match(filesystem.RUN_RE, r)]: rta_file = os.path.join(run, 'RTAComplete.txt') if os.path.exists(rta_file): # 1 day == 60*60*24 seconds --> 86400 if os.stat(rta_file).st_mtime < time.time() - (86400 * days) and \ filesystem.is_in_swestore("{}.tar.bz2".format(run)): logger.info('Removing run {} to nosync directory' .format(os.path.basename(run))) shutil.rmtree(run) else: logger.info('RTAComplete.txt file exists but is not older than {} day(s), skipping run {}'.format(str(days), run)) except IOError: sbj = "Cannot archive old runs in processing server" msg = ("Could not find transfer.tsv file, so I cannot decide if I should " "archive any run or not.") cnt = CONFIG.get('contact', None) if not cnt: cnt = "{}@localhost".format(getpass.getuser()) logger.error(msg) misc.send_mail(sbj, msg, cnt)
def generate_bcl_command(self, sample_type, mask_table, bcl2fastq_cmd_counter): # I have everything to run demultiplexing now. logger.info('Building a bcl2fastq command') per_lane_base_masks = self._generate_per_lane_base_mask( sample_type, mask_table) with chdir(self.run_dir): cl = [self.CONFIG.get('bcl2fastq')['bin']] output_dir = 'Demultiplexing_{}'.format(bcl2fastq_cmd_counter) cl.extend(['--output-dir', output_dir]) if not os.path.exists(output_dir): os.makedirs(output_dir) cl_options = [] if 'options' in self.CONFIG.get('bcl2fastq'): for option in self.CONFIG['bcl2fastq']['options']: cl_options.extend([option]) # Add the extra 10X command options if we have 10X Genomic or ATAC samples if sample_type == '10X_GENO' or sample_type == '10X_ATAC': cl_options.extend(self.CONFIG['bcl2fastq']['options_10X']) # Add the extra 10X command options if we have 10X ST samples if sample_type == '10X_ST': cl_options.extend( self.CONFIG['bcl2fastq']['options_10X_ST']) # Add the extra command option if we have samples with IDT UMI if sample_type == 'IDT_UMI': cl_options.extend( self.CONFIG['bcl2fastq']['options_IDT_UMI']) # Add the extra Smart-seq command options if we have 10X ST samples if sample_type == 'SMARTSEQ': cl_options.extend( self.CONFIG['bcl2fastq']['options_SMARTSEQ']) # Append all options that appear in the configuration file to the main command. for option in cl_options: if isinstance(option, dict): opt, val = list(option.items())[0] if 'output-dir' not in opt: cl.extend(['--{}'.format(opt), str(val)]) else: cl.append('--{}'.format(option)) cl.extend([ '--sample-sheet', os.path.join( os.path.join( self.run_dir, 'SampleSheet_{}.csv'.format(bcl2fastq_cmd_counter))) ]) # Add the base_mask for each lane lanes = list(mask_table.keys()) for lane in sorted(lanes): # Iterate thorugh each lane and add the correct --use-bases-mask for that lane base_mask = [ per_lane_base_masks[lane][bm]['base_mask'] for bm in per_lane_base_masks[lane] ][0] # Get the base_mask base_mask_expr = '{}:'.format(lane) + ','.join(base_mask) cl.extend(['--use-bases-mask', base_mask_expr]) return cl
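# Standalone example of the --use-bases-mask construction above; the lane numbers and
# mask values are invented, only the "<lane>:<comma-joined mask>" shape comes from the code.
per_lane_base_masks = {
    '1': {'Y151I8Y151': {'base_mask': ['Y151', 'I8', 'Y151']}},
    '2': {'Y151I8Y151': {'base_mask': ['Y151', 'I8', 'Y151']}},
}
cl = []
for lane in sorted(per_lane_base_masks):
    base_mask = [per_lane_base_masks[lane][bm]['base_mask'] for bm in per_lane_base_masks[lane]][0]
    cl.extend(['--use-bases-mask', '{}:'.format(lane) + ','.join(base_mask)])
print(cl)  # ['--use-bases-mask', '1:Y151,I8,Y151', '--use-bases-mask', '2:Y151,I8,Y151']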
def cleanup_nas(seconds): """Will move the finished runs in NASes to nosync directory. :param int seconds: Days/hours converted as second to consider a run to be old """ couch_info = CONFIG.get('statusdb') mail_recipients = CONFIG.get('mail', {}).get('recipients') check_demux = CONFIG.get('storage', {}).get('check_demux', False) host_name = os.getenv('HOSTNAME', os.uname()[1]).split('.', 1)[0] for data_dir in CONFIG.get('storage').get('data_dirs'): if not os.path.exists(data_dir) or not os.path.isdir(data_dir): logger.warn( "Data directory '{}' does not exist or not a directory".format( data_dir)) continue logger.info('Moving old runs in {}'.format(data_dir)) with filesystem.chdir(data_dir): for run in [ r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r) ]: rta_file = os.path.join(run, finished_run_indicator) if os.path.exists(rta_file): if check_demux: if misc.run_is_demuxed(run, couch_info): logger.info( 'Moving run {} to nosync directory'.format( os.path.basename(run))) shutil.move(run, 'nosync') elif 'miseq' in data_dir: miseq_run = MiSeq_Run(run, CONFIG) if miseq_run.get_run_type() == 'NON-NGI-RUN': logger.info( 'Run {} is a non-platform run, so moving it to nosync directory' .format(os.path.basename(run))) shutil.move(run, 'nosync') elif os.stat( rta_file).st_mtime < time.time() - seconds: logger.warn( 'Run {} is older than given time, but it is not demultiplexed yet' .format(run)) sbt = "Run not demultiplexed - {}".format(run) msg = ( "Run '{}' in '{}' is older then given threshold, but seems like it is not " "yet demultiplexed".format( os.path.join(data_dir, run), host_name)) misc.send_mail(sbt, msg, mail_recipients) else: if os.stat(rta_file).st_mtime < time.time() - seconds: logger.info( 'Moving run {} to nosync directory'.format( os.path.basename(run))) shutil.move(run, 'nosync') else: logger.info( '{} file exists but is not older than given time, skipping run {}' .format(finished_run_indicator, run))
def cleanup_uppmax(site, days, dry_run=False): """Remove project/run that have been closed more than 'days' from the given 'site' on uppmax :param str site: site where the cleanup should be performed :param int days: number of days to check for closed projects """ days = check_days(site, days, config) if not days: return root_dir = CONFIG.get("cleanup").get(site).get("root") deleted_log = CONFIG.get("cleanup").get("deleted_log") assert os.path.exists(os.path.join(root_dir, deleted_log)), "Log directory {} doesn't exist in {}".format( deleted_log, root_dir ) log_file = os.path.join(root_dir, "{fl}/{fl}.log".format(fl=deleted_log)) # make a connection for project db # pcon = statusdb.ProjectSummaryConnection() assert pcon, "Could not connect to project database in StatusDB" if site != "archive": ## work flow for cleaning up illumina/analysis ## projects = [p for p in os.listdir(root_dir) if re.match(filesystem.PROJECT_RE, p)] list_to_delete = get_closed_projects(projects, pcon, days) else: ##work flow for cleaning archive ## list_to_delete = [] archived_in_swestore = filesystem.list_runs_in_swestore( path=CONFIG.get("cleanup").get("swestore").get("root"), no_ext=True ) runs = [r for r in os.listdir(root_dir) if re.match(filesystem.RUN_RE, r)] with filesystem.chdir(root_dir): for run in runs: fc_date = run.split("_")[0] if misc.days_old(fc_date) > days: if run in archived_in_swestore: list_to_delete.append(run) else: logger.warn( "Run {} is older than {} days but not in " "swestore, so SKIPPING".format(run, days) ) ## delete and log for item in list_to_delete: if dry_run: logger.info("Will remove {} from {}".format(item, root_dir)) continue try: shutil.rmtree(os.path.join(root_dir, item)) logger.info("Removed project {} from {}".format(item, root_dir)) with open(log_file, "a") as to_log: to_log.write("{}\t{}\n".format(item, datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M"))) except OSError: logger.warn("Could not remove path {} from {}".format(item, root_dir)) continue
def cleanup_uppmax(site, days, dry_run=False): """Remove project/run that have been closed more than 'days' from the given 'site' on uppmax :param str site: site where the cleanup should be performed :param int days: number of days to check for closed projects """ days = check_days(site, days, config) if not days: return root_dir = CONFIG.get('cleanup').get(site).get('root') deleted_log = CONFIG.get('cleanup').get('deleted_log') assert os.path.exists(os.path.join(root_dir,deleted_log)), "Log directory {} doesn't exist in {}".format(deleted_log,root_dir) log_file = os.path.join(root_dir,"{fl}/{fl}.log".format(fl=deleted_log)) # make a connection for project db # pcon = statusdb.ProjectSummaryConnection() assert pcon, "Could not connect to project database in StatusDB" if site != "archive": ## work flow for cleaning up illumina/analysis ## projects = [ p for p in os.listdir(root_dir) if re.match(filesystem.PROJECT_RE,p) ] list_to_delete = get_closed_projects(projects, pcon, days) else: ##work flow for cleaning archive ## list_to_delete = [] archived_in_swestore = filesystem.list_runs_in_swestore(path=CONFIG.get('cleanup').get('swestore').get('root'), no_ext=True) runs = [ r for r in os.listdir(root_dir) if re.match(filesystem.RUN_RE,r) ] with filesystem.chdir(root_dir): for run in runs: fc_date = run.split('_')[0] if misc.days_old(fc_date) > days: if run in archived_in_swestore: list_to_delete.append(run) else: logger.warn("Run {} is older than {} days but not in " "swestore, so SKIPPING".format(run, days)) ## delete and log for item in list_to_delete: if dry_run: logger.info('Will remove {} from {}'.format(item,root_dir)) continue try: shutil.rmtree(os.path.join(root_dir,item)) logger.info('Removed project {} from {}'.format(item,root_dir)) with open(log_file,'a') as to_log: to_log.write("{}\t{}\n".format(item,datetime.strftime(datetime.now(),'%Y-%m-%d %H:%M'))) except OSError: logger.warn("Could not remove path {} from {}" .format(item,root_dir)) continue
def demultiplex_run(self): """ Demultiplex a NextSeq run: - find the samplesheet - make a local copy of the samplesheet and name it SampleSheet.csv - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep) - run bcl2fastq conversion """ if not os.path.exists(self.ssname): # We should not get here really and this run should be defined as NON NGI-RUN return False # TODO SampleSheetParser may throw an exception ssparser = SampleSheetParser(self.ssname) # Samplesheet need to be positioned in the FC directory with name SampleSheet.csv (Illumina default) # if this is not the case then create it and take special care of modification to be done on the SampleSheet samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv") # Check that the samplesheet is not already present. In this case go the next step if not os.path.exists(samplesheet_dest): try: with open(samplesheet_dest, 'wb') as fcd: fcd.write(self._generate_clean_samplesheet(ssparser)) except Exception as e: if os.path.exists(samplesheet_dest): os.remove(samplesheet_dest) logger.error(e) return False logger.info( ("Created SampleSheet.csv for Flowcell {} in {} ".format( self.id, samplesheet_dest))) # SampleSheet.csv generated to be used in bcl2fastq self.runParserObj.samplesheet = SampleSheetParser( os.path.join(self.run_dir, "SampleSheet.csv")) # Make the demux call with chdir(self.run_dir): cl = [self.CONFIG.get('bcl2fastq')['bin']] if self.CONFIG.get('bcl2fastq').has_key('options'): cl_options = self.CONFIG['bcl2fastq']['options'] # Append all options that appear in the configuration file to the main command. for option in cl_options: if isinstance(option, dict): opt, val = option.items()[0] cl.extend(['--{}'.format(opt), str(val)]) else: cl.append('--{}'.format(option)) logger.info( ("BCL to FASTQ conversion and demultiplexing started for " " run {} on {}".format(os.path.basename(self.id), datetime.now()))) misc.call_external_command_detached(cl, with_log_files=True) return True
def create_report(self): """ Create a final aggregate report via a system call """ logprefix = os.path.abspath( self.expand_path(os.path.join(self.logpath, self.projectid))) try: if not create_folder(os.path.dirname(logprefix)): logprefix = None except AttributeError: logprefix = None with chdir(self.expand_path(self.reportpath)): cl = self.report_aggregate.split(' ') call_external_command(cl, with_log_files=(logprefix is not None), prefix="{}_aggregate".format(logprefix))
def create_report(self): """ Create a final aggregate report via a system call """ logprefix = os.path.abspath( self.expand_path(os.path.join(self.logpath, self.projectid))) try: if not create_folder(os.path.dirname(logprefix)): logprefix = None except AttributeError: logprefix = None with chdir(self.expand_path(self.reportpath)): cl = self.report_aggregate.split(' ') call_external_command( cl, with_log_files=(logprefix is not None), prefix="{}_aggregate".format(logprefix))
def demultiplex_run(self): """ Demultiplex a NextSeq run: - find the samplesheet - make a local copy of the samplesheet and name it SampleSheet.csv - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep) - run bcl2fastq conversion """ if not os.path.exists(self.ssname): # We should not get here really and this run should be defined as NON NGI-RUN return False # TODO SampleSheetParser may throw an exception ssparser = SampleSheetParser(self.ssname) # Samplesheet need to be positioned in the FC directory with name SampleSheet.csv (Illumina default) # if this is not the case then create it and take special care of modification to be done on the SampleSheet samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv") # Check that the samplesheet is not already present. In this case go the next step if not os.path.exists(samplesheet_dest): try: with open(samplesheet_dest, 'wb') as fcd: fcd.write(self._generate_clean_samplesheet(ssparser)) except Exception as e: if os.path.exists(samplesheet_dest): os.remove(samplesheet_dest) logger.error(e) return False logger.info(("Created SampleSheet.csv for Flowcell {} in {} " .format(self.id, samplesheet_dest))) # SampleSheet.csv generated to be used in bcl2fastq self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, "SampleSheet.csv")) # Make the demux call with chdir(self.run_dir): cl = [self.CONFIG.get('bcl2fastq')['bin']] if self.CONFIG.get('bcl2fastq').has_key('options'): cl_options = self.CONFIG['bcl2fastq']['options'] # Append all options that appear in the configuration file to the main command. for option in cl_options: if isinstance(option, dict): opt, val = option.items()[0] cl.extend(['--{}'.format(opt), str(val)]) else: cl.append('--{}'.format(option)) logger.info(("BCL to FASTQ conversion and demultiplexing started for " " run {} on {}".format(os.path.basename(self.id), datetime.now()))) misc.call_external_command_detached(cl, with_log_files=True) return True
def cleanup_nas(days): """Will move the finished runs in NASes to nosync directory. :param int days: Number of days to consider a run to be old """ for data_dir in CONFIG.get('storage').get('data_dirs'): logger.info('Moving old runs in {}'.format(data_dir)) with filesystem.chdir(data_dir): for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]: rta_file = os.path.join(run, 'RTAComplete.txt') if os.path.exists(rta_file): # 1 day == 60*60*24 seconds --> 86400 if os.stat(rta_file).st_mtime < time.time() - (86400 * days): logger.info('Moving run {} to nosync directory' .format(os.path.basename(run))) shutil.move(run, 'nosync') else: logger.info('RTAComplete.txt file exists but is not older than {} day(s), skipping run {}'.format(str(days), run))
def cleanup_nas(days): """Will move the finished runs in NASes to nosync directory. :param int days: Number of days to consider a run to be old """ for data_dir in CONFIG.get('storage').get('data_dirs'): logger.info('Moving old runs in {}'.format(data_dir)) with filesystem.chdir(data_dir): for run in [r for r in os.listdir(data_dir) if re.match(filesystem.RUN_RE, r)]: rta_file = os.path.join(run, finished_run_indicator) if os.path.exists(rta_file): # 1 day == 60*60*24 seconds --> 86400 if os.stat(rta_file).st_mtime < time.time() - (86400 * days): logger.info('Moving run {} to nosync directory' .format(os.path.basename(run))) shutil.move(run, 'nosync') else: logger.info('{} file exists but is not older than {} day(s), skipping run {}'.format( finished_run_indicator, str(days), run))
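# Minimal sketch of the age check used above (1 day == 60*60*24 seconds == 86400).
# The helper name is hypothetical; the functions in this module inline the comparison.
import os
import time

def is_older_than(path, days):
    """Return True if 'path' was last modified more than 'days' days ago."""
    return os.stat(path).st_mtime < time.time() - 86400 * days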
def generate_bcl_command(self, lanes, bcl2fastq_cmd_counter, is_10X=False): #I have everything to run demultiplexing now. logger.info('Building a bcl2fastq command') per_lane_base_masks = self._generate_per_lane_base_mask() with chdir(self.run_dir): cl = [self.CONFIG.get('bcl2fastq')['bin']] output_dir = "Demultiplexing_{}".format(bcl2fastq_cmd_counter) cl.extend(["--output-dir", output_dir]) if not os.path.exists(output_dir): os.makedirs(output_dir) cl_options = [] if self.CONFIG.get('bcl2fastq').has_key('options'): for option in self.CONFIG['bcl2fastq']['options']: cl_options.extend([option]) # Add the extra 10X command options if we have a 10X run if is_10X: cl_options.extend(self.CONFIG['bcl2fastq']['options_10X']) # Append all options that appear in the configuration file to the main command. for option in cl_options: if isinstance(option, dict): opt, val = option.items()[0] if "output-dir" not in opt: cl.extend(['--{}'.format(opt), str(val)]) else: cl.append('--{}'.format(option)) cl.extend([ "--sample-sheet", os.path.join( os.path.join( self.run_dir, "SampleSheet_{}.csv".format(bcl2fastq_cmd_counter))) ]) #now add the base_mask for each lane for lane in sorted(lanes): #Iterate thorugh each lane and add the correct --use-bases-mask for that lane base_mask = [ per_lane_base_masks[lane][bm]['base_mask'] for bm in per_lane_base_masks[lane] ][0] # get the base_mask base_mask_expr = "{}:".format(lane) + ",".join(base_mask) cl.extend(["--use-bases-mask", base_mask_expr]) return cl
def pdc_put(cls, run): """Archive the collected runs to PDC""" bk = cls(run) bk.collect_runs(ext=".tar.gz.gpg", filter_by_ext=True) logger.info("In total, found {} run(s) to send PDC".format(len(bk.runs))) for run in bk.runs: run.flag = "{}.archiving".format(run.name) run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted) if run.path not in bk.archive_dirs.values(): logger.error(("Given run is not in one of the archive directories {}. Kindly move the run {} to appropriate " "archive dir before sending it to PDC".format(",".join(bk.archive_dirs.values()), run.name))) continue if not os.path.exists(run.dst_key_encrypted): logger.error("Encrypted key file {} is not found for file {}, skipping it".format(run.dst_key_encrypted, run.zip_encrypted)) continue #skip run if being encrypted if os.path.exists("{}.encrypting".format(run.name)): logger.warn("Run {} is currently being encrypted, so skipping now".format(run.name)) continue # skip run if already ongoing if os.path.exists(run.flag): logger.warn("Run {} is already being archived, so skipping now".format(run.name)) continue flag = open(run.flag, 'w').close() with filesystem.chdir(run.path): if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc(run.dst_key_encrypted, silent=False): logger.warn("Seems like files realted to run {} already exist in PDC, check and cleanup".format(run.name)) bk._clean_tmp_files([run.flag]) continue logger.info("Sending file {} to PDC".format(run.zip_encrypted)) if bk._call_commands(cmd1="dsmc archive {}".format(run.zip_encrypted), tmp_files=[run.flag]): time.sleep(15) # give some time just in case 'dsmc' needs to settle if bk._call_commands(cmd1="dsmc archive {}".format(run.dst_key_encrypted), tmp_files=[run.flag]): time.sleep(5) # give some time just in case 'dsmc' needs to settle if bk.file_in_pdc(run.zip_encrypted) and bk.file_in_pdc(run.dst_key_encrypted): logger.info("Successfully sent file {} to PDC, removing file locally from {}".format(run.zip_encrypted, run.path)) if bk.couch_info: bk._log_pdc_statusdb(run.name) bk._clean_tmp_files([run.zip_encrypted, run.dst_key_encrypted, run.flag]) continue logger.warn("Sending file {} to PDC failed".format(run.zip_encrypted))
def pdc_put(cls, run): """Archive the collected runs to PDC.""" bk = cls(run) bk.collect_runs(ext='.tar.gz.gpg', filter_by_ext=True) logger.info('In total, found {} run(s) to send PDC'.format(len(bk.runs))) for run in bk.runs: run.flag = '{}.archiving'.format(run.name) run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted) if run.path not in bk.archive_dirs.values(): logger.error(('Given run is not in one of the archive directories {}. Kindly move the run {} to appropriate ' 'archive dir before sending it to PDC'.format(','.join(list(bk.archive_dirs.values())), run.name))) continue if not os.path.exists(run.dst_key_encrypted): logger.error('Encrypted key file {} is not found for file {}, skipping it'.format(run.dst_key_encrypted, run.zip_encrypted)) continue with filesystem.chdir(run.path): #skip run if being encrypted if os.path.exists('{}.encrypting'.format(run.name)): logger.warn('Run {} is currently being encrypted, so skipping now'.format(run.name)) continue # skip run if already ongoing if os.path.exists(run.flag): logger.warn('Run {} is already being archived, so skipping now'.format(run.name)) continue if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc(run.dst_key_encrypted, silent=False): logger.warn('Seems like files realted to run {} already exist in PDC, check and cleanup'.format(run.name)) continue flag = open(run.flag, 'w').close() logger.info('Sending file {} to PDC'.format(run.zip_encrypted)) if bk._call_commands(cmd1='dsmc archive {}'.format(run.zip_encrypted), tmp_files=[run.flag]): time.sleep(15) # give some time just in case 'dsmc' needs to settle if bk._call_commands(cmd1='dsmc archive {}'.format(run.dst_key_encrypted), tmp_files=[run.flag]): time.sleep(5) # give some time just in case 'dsmc' needs to settle if bk.file_in_pdc(run.zip_encrypted) and bk.file_in_pdc(run.dst_key_encrypted): logger.info('Successfully sent file {} to PDC, removing file locally from {}'.format(run.zip_encrypted, run.path)) if bk.couch_info: bk._log_pdc_statusdb(run.name) bk._clean_tmp_files([run.zip_encrypted, run.dst_key_encrypted, run.flag]) continue logger.warn('Sending file {} to PDC failed'.format(run.zip_encrypted))
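# Simplified, hypothetical sketch of the sentinel-file guard that pdc_put() relies on:
# '<run>.encrypting' and '<run>.archiving' flags keep concurrent workers from touching
# the same run. pdc_put() manages these flags inline and with different error handling;
# this only illustrates the pattern.
import os
from contextlib import contextmanager

@contextmanager
def archiving_flag(run_name):
    flag = '{}.archiving'.format(run_name)
    if os.path.exists('{}.encrypting'.format(run_name)) or os.path.exists(flag):
        raise RuntimeError('Run {} is already being processed'.format(run_name))
    open(flag, 'w').close()
    try:
        yield flag
    finally:
        if os.path.exists(flag):
            os.remove(flag)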
def demultiplex_run(self):
    """ Demultiplex a HiSeq run:
        - find the samplesheet
        - make a local copy of the samplesheet and name it SampleSheet.csv
        - create multiple SampleSheets in case at least one lane has multiple index lengths
        - run bcl2fastq conversion
    """
    ssname = self._get_samplesheet()
    if ssname is None:
        return None
    ssparser = SampleSheetParser(ssname)
    # Copy the original samplesheet locally. Copy again if already done, as there might have been changes to the samplesheet
    try:
        shutil.copy(ssname, os.path.join(self.run_dir, "{}.csv".format(self.flowcell_id)))
        ssname = os.path.join(self.run_dir, os.path.split(ssname)[1])
    except:
        raise RuntimeError("unable to copy file {} to destination {}".format(ssname, self.run_dir))
    # This samplesheet has been created by the LIMS and copied by a sequencing operator. It is not
    # ready to be used, it needs some editing.
    # The destination will contain the samplesheet with all the renaming, to be used with bcl2fastq-2.17
    samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
    # Check whether the samplesheet is already present. In that case it is overwritten
    if os.path.exists(samplesheet_dest):
        logger.info("SampleSheet.csv found ... overwriting it")
    try:
        with open(samplesheet_dest, 'wb') as fcd:
            fcd.write(self._generate_clean_samplesheet(ssparser))
    except Exception as e:
        logger.error(e)
        return False
    logger.info("Created SampleSheet.csv for Flowcell {} in {}".format(self.id, samplesheet_dest))
    # SampleSheet.csv generated; when demultiplexing, SampleSheet.csv is the one to use
    self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, "SampleSheet.csv"))
    # Now generate the base masks per lane and decide how to demultiplex
    per_lane_base_masks = self._generate_per_lane_base_mask()
    max_different_base_masks = max([len(per_lane_base_masks[base_masks]) for base_masks in per_lane_base_masks])
    # If max_different_base_masks is one, then I have a simple config and I can run a single command.
    # Otherwise I need to run multiple instances.
    # Extract lanes with a single base mask
    simple_lanes = {}
    complex_lanes = {}
    for lane in per_lane_base_masks:
        if len(per_lane_base_masks[lane]) == 1:
            simple_lanes[lane] = per_lane_base_masks[lane]
        else:
            complex_lanes[lane] = per_lane_base_masks[lane]
    # complex_lanes contains the lanes that have more than one base mask
    bcl2fastq_commands = []
    bcl2fastq_command_num = 0
    if len(simple_lanes) > 0:
        bcl2fastq_commands.append(self._generate_bcl2fastq_command(simple_lanes, True, bcl2fastq_command_num))
        bcl2fastq_command_num += 1
    # Compute the different masks; there will be one bcl2fastq command per mask
    base_masks_complex = [complex_lanes[base_masks].keys() for base_masks in complex_lanes]
    different_masks = list(set([item for sublist in base_masks_complex for item in sublist]))
    for mask in different_masks:
        base_masks_complex_to_demux = {}
        for lane in complex_lanes:
            if complex_lanes[lane].has_key(mask):
                base_masks_complex_to_demux[lane] = {}
                base_masks_complex_to_demux[lane][mask] = complex_lanes[lane][mask]
        # At this point base_masks_complex_to_demux contains only one base mask per lane. I can build the command
        bcl2fastq_commands.append(self._generate_bcl2fastq_command(base_masks_complex_to_demux, True, bcl2fastq_command_num))
        bcl2fastq_command_num += 1
    # Now bcl2fastq_commands contains all the commands to be executed. They could be executed in
    # parallel, however run only one at a time in order to avoid overloading the machine
    with chdir(self.run_dir):
        # create the Demultiplexing dir; in this way the status of this run will become IN_PROGRESS
        if not os.path.exists("Demultiplexing"):
            os.makedirs("Demultiplexing")
        execution = 0
        for bcl2fastq_command in bcl2fastq_commands:
            misc.call_external_command_detached(bcl2fastq_command, with_log_files=True, prefix="demux_{}".format(execution))
            execution += 1
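# Standalone illustration of the simple/complex lane split performed above; the mask
# names are invented, only the grouping logic mirrors the code.
per_lane_base_masks = {
    '1': {'Y151I8Y151': {}},                      # one mask  -> simple lane
    '2': {'Y151I8Y151': {}, 'Y151I6N2Y151': {}},  # two masks -> complex lane
}
simple_lanes = {l: m for l, m in per_lane_base_masks.items() if len(m) == 1}
complex_lanes = {l: m for l, m in per_lane_base_masks.items() if len(m) > 1}
different_masks = sorted({mask for masks in complex_lanes.values() for mask in masks})
# one bcl2fastq command for all simple lanes, plus one per distinct mask in the complex lanes
n_commands = (1 if simple_lanes else 0) + len(different_masks)
print(n_commands)  # 3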
def cleanup_milou(site, seconds, dry_run=False):
    """Remove projects/runs that have been closed for more than the given time (as seconds)
    from the given 'site' on uppmax

    :param str site: site where the cleanup should be performed
    :param int seconds: Days/hours converted to seconds to consider a run to be old
    :param bool dry_run: Will summarize what is going to be done without really doing it
    """
    seconds = check_default(site, seconds, CONFIG)
    if not seconds:
        return
    root_dir = CONFIG.get('cleanup').get('milou').get(site).get('root')
    deleted_log = CONFIG.get('cleanup').get('milou').get('deleted_log')
    assert os.path.exists(os.path.join(root_dir, deleted_log)), "Log directory {} doesn't exist in {}".format(deleted_log, root_dir)
    log_file = os.path.join(root_dir, "{fl}/{fl}.log".format(fl=deleted_log))
    list_to_delete = []
    ## get glob path patterns to search and remove from the root directory
    try:
        archive_config = CONFIG['cleanup']['milou']['archive']
        ## the glob path should be relative to the run folder, like "Unaligned_*/Project_*"
        config_ppath = archive_config['proj_path']
        ## glob path should be relative to the run folder, like "Unaligned_0bp/Undetermined_indices/*/*.fastq.gz"
        config_npath = archive_config['undet_noindex']
        ## glob path should be relative to the run folder, like "Unaligned_*bp/Undetermined_indices/*/*.fastq.gz"
        config_upath = archive_config['undet_all']
    except KeyError as e:
        logger.error("Config file is missing the key {}, make sure it has all required information".format(str(e)))
        raise SystemExit
    # make a connection for project db
    pcon = statusdb.ProjectSummaryConnection()
    assert pcon, "Could not connect to project database in StatusDB"
    if site in ["analysis", "illumina"]:
        ## workflow for cleaning up illumina/analysis ##
        projects = [p for p in os.listdir(root_dir) if re.match(filesystem.PROJECT_RE, p)]
        list_to_delete.extend(get_closed_projects(projects, pcon, seconds))
    elif site == "archive":
        ## workflow for cleaning archive ##
        runs = [r for r in os.listdir(root_dir) if re.match(filesystem.RUN_RE, r)]
        for run in runs:
            with filesystem.chdir(os.path.join(root_dir, run)):
                ## Collect all project paths from the demultiplexed directories in the run folder
                all_proj_path = glob(config_ppath)
                all_proj_dict = {os.path.basename(pp).replace('Project_', '').replace('__', '.'): pp for pp in all_proj_path}
                closed_projects = get_closed_projects(all_proj_dict.keys(), pcon, seconds)
                ## Only proceed with cleaning the data for closed projects
                for closed_proj in closed_projects:
                    closed_proj_fq = glob("{}/*/*.fastq.gz".format(all_proj_dict[closed_proj]))
                    list_to_delete.extend([os.path.join(run, pfile) for pfile in closed_proj_fq])
                ## Always remove the undetermined fastq files for the NoIndex case
                undetermined_fastq_files = glob(config_npath)
                ## Remove undetermined fastq files for all index lengths if all projects run on the FC are closed
                if len(all_proj_dict.keys()) == len(closed_projects):
                    undetermined_fastq_files = glob(config_upath)
                list_to_delete.extend([os.path.join(run, ufile) for ufile in undetermined_fastq_files])
    ## delete and log
    for item in list_to_delete:
        if dry_run:
            logger.info('Will remove {} from {}'.format(item, root_dir))
            continue
        try:
            to_remove = os.path.join(root_dir, item)
            if os.path.isfile(to_remove):
                os.remove(to_remove)
            elif os.path.isdir(to_remove):
                shutil.rmtree(to_remove)
            logger.info('Removed {} from {}'.format(item, root_dir))
            with open(log_file, 'a') as to_log:
                to_log.write("{}\t{}\n".format(to_remove, datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M')))
        except OSError:
            logger.warn("Could not remove {} from {}".format(item, root_dir))
            continue
def cleanup_irma(days_fastq, days_analysis, only_fastq, only_analysis, clean_undetermined, status_db_config, exclude_projects, list_only, date, dry_run=False): """Remove fastq/analysis data for projects that have been closed more than given days (as days_fastq/days_analysis) from the given 'irma' cluster :param int days_fastq: Days to consider to remove fastq files for project :param int days_analysis: Days to consider to remove analysis data for project :param bool only_fastq: Remove only fastq files for closed projects :param bool only_analysis: Remove only analysis data for closed projects :param bool dry_run: Will summarize what is going to be done without really doing it Example for mat for config file cleanup: irma: flowcell: ##this path is nothing but incoming directory, can given multiple paths root: - path/to/flowcells_dir relative_project_source: Demultiplexing undet_file_pattern: "Undetermined_*.fastq.gz" ##this is path where projects are organized data_dir: path/to/data_dir analysis: ##directory where analysis are perfoemed for projects root: path/to/analysis_dir #should be exactly same as the qc folder name and files wished to be removed files_to_remove: piper_ngi: - "*.bam" """ try: config = CONFIG['cleanup']['irma'] flowcell_dir_root = config['flowcell']['root'] flowcell_project_source = config['flowcell']['relative_project_source'] flowcell_undet_files = config['flowcell']['undet_file_pattern'] data_dir = config['data_dir'] analysis_dir = config['analysis']['root'] analysis_data_to_remove = config['analysis']['files_to_remove'] if date: date = datetime.strptime(date, '%Y-%m-%d') except KeyError as e: logger.error( "Config file is missing the key {}, make sure it have all required information" .format(str(e))) raise SystemExit except ValueError as e: logger.error( "Date given with '--date' option is not in required format, see help for more info" ) raise SystemExit # make a connection for project db # pcon = statusdb.ProjectSummaryConnection(conf=status_db_config) assert pcon, "Could not connect to project database in StatusDB" # make exclude project list if provided exclude_list = [] if exclude_projects: if os.path.isfile(exclude_projects): with open(exclude_projects, 'r') as in_file: exclude_list.extend([p.strip() for p in in_file.readlines()]) else: exclude_list.extend(exclude_projects.split(',')) # sanity check for mentioned project to exculde or valid invalid_projects = filter( lambda p: p not in pcon.id_view.keys() and p not in pcon.name_view. keys(), exclude_list) if invalid_projects: logger.error( "'--exclude_projects' was called with some invalid projects '{}', " "provide valid project name/id".format( ",".join(invalid_projects))) raise SystemExit #compile list for project to delete project_clean_list, project_processed_list = ({}, []) if not list_only and not clean_undetermined: logger.info("Building initial project list for removing data..") if only_fastq: logger.info( "Option 'only_fastq' is given, so will not look for analysis data") elif only_analysis: logger.info( "Option 'only_analysis' is given, so will not look for fastq data") if clean_undetermined: all_undet_files = [] for flowcell_dir in flowcell_dir_root: for fc in [ d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d) ]: fc_abs_path = os.path.join(flowcell_dir, fc) with filesystem.chdir(fc_abs_path): if not os.path.exists(flowcell_project_source): logger.warn( "Flowcell {} do not contain a '{}' direcotry". 
format(fc, flowcell_project_source)) continue projects_in_fc = [d for d in os.listdir(flowcell_project_source) \ if re.match(r'^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$',d) and \ not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))] # the above check looked for project directories and also that are not cleaned # so if it could not find any project, means there is no project diretory at all # or all the project directory is already cleaned. Then we can remove the undet if len(projects_in_fc) > 0: continue fc_undet_files = glob( os.path.join(flowcell_project_source, flowcell_undet_files)) if fc_undet_files: logger.info( "All projects was cleaned for FC {}, found {} undeterminded files" .format(fc, len(fc_undet_files))) all_undet_files.extend( map(os.path.abspath, fc_undet_files)) if all_undet_files: undet_size = _def_get_size_unit( sum(map(os.path.getsize, all_undet_files))) if misc.query_yes_no( "In total found {} undetermined files which are {} in size, delete now ?" .format(len(all_undet_files), undet_size), default="no"): removed = _remove_files(all_undet_files) return elif only_analysis: for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and \ not os.path.exists(os.path.join(analysis_dir, d, "cleaned"))]: proj_abs_path = os.path.join(analysis_dir, pid) proj_info = get_closed_proj_info( pid, pcon.get_entry(pid, use_id_view=True), date) if proj_info and proj_info['closed_days'] >= days_analysis: # move on if this project has to be excluded if proj_info['name'] in exclude_list or proj_info[ 'pid'] in exclude_list: continue analysis_data, analysis_size = collect_analysis_data_irma( pid, analysis_dir, analysis_data_to_remove) proj_info['analysis_to_remove'] = analysis_data proj_info['analysis_size'] = analysis_size proj_info['fastq_to_remove'] = "not_selected" proj_info['fastq_size'] = 0 project_clean_list[proj_info['name']] = proj_info else: for flowcell_dir in flowcell_dir_root: for fc in [ d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d) ]: fc_abs_path = os.path.join(flowcell_dir, fc) with filesystem.chdir(fc_abs_path): if not os.path.exists(flowcell_project_source): logger.warn( "Flowcell {} do not contain a '{}' direcotry". 
format(fc, flowcell_project_source)) continue projects_in_fc = [d for d in os.listdir(flowcell_project_source) \ if re.match(r'^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$',d) and \ not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))] for _proj in projects_in_fc: proj = re.sub(r'_+', '.', _proj, 1) # if a project is already processed no need of fetching it again from status db if proj in project_processed_list: # if the project is closed more than threshold days collect the fastq files from FC # no need of looking for analysis data as they would have been collected in the first time if proj in project_clean_list and project_clean_list[ proj]['closed_days'] >= days_fastq: fc_fq_files, fq_size = collect_fastq_data_irma( fc_abs_path, os.path.join(flowcell_project_source, _proj)) project_clean_list[proj]['fastq_to_remove'][ 'flowcells'][fc] = fc_fq_files[ 'flowcells'][fc] project_clean_list[proj][ 'fastq_size'] += fq_size continue project_processed_list.append(proj) #by default assume all projects are not old enough for delete fastq_data, analysis_data = ("young", "young") fastq_size, analysis_size = (0, 0) proj_info = get_closed_proj_info( proj, pcon.get_entry(proj), date) if proj_info: # move on if this project has to be excluded if proj_info['name'] in exclude_list or proj_info[ 'pid'] in exclude_list: continue # if project not old enough for fastq files and only fastq files selected move on to next project if proj_info['closed_days'] >= days_fastq: fastq_data, fastq_size = collect_fastq_data_irma( fc_abs_path, os.path.join(flowcell_project_source, _proj), data_dir, proj_info['pid']) if not only_fastq: # if project is old enough for fastq files and not 'only_fastq' try collect analysis files if proj_info['closed_days'] >= days_analysis: analysis_data, analysis_size = collect_analysis_data_irma( proj_info['pid'], analysis_dir, analysis_data_to_remove) # if both fastq and analysis files are not old enough move on if (analysis_data == fastq_data) or ( (not analysis_data or analysis_data == "cleaned") and fastq_data == "young"): continue elif fastq_data == "young": continue else: analysis_data = "not_selected" proj_info['fastq_to_remove'] = fastq_data proj_info['fastq_size'] = fastq_size proj_info['analysis_to_remove'] = analysis_data proj_info['analysis_size'] = analysis_size project_clean_list[proj] = proj_info if not project_clean_list: logger.info("There are no projects to clean") return # list only the project and exit if 'list_only' option is selected if list_only: print "Project ID\tProject Name\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size" for p_info in sorted(project_clean_list.values(), key=lambda d: d['closed_days'], reverse=True): print "\t".join([ p_info['name'], p_info['pid'], p_info['bioinfo_responsible'], str(p_info['closed_days']), p_info['closed_date'], _def_get_size_unit(p_info['fastq_size']), _def_get_size_unit(p_info['analysis_size']) ]) raise SystemExit logger.info("Initial list is built with {} projects {}".format( len(project_clean_list), get_files_size_text(project_clean_list))) if misc.query_yes_no("Interactively filter projects for cleanup ?", default="yes"): filtered_project, proj_count = ([], 0) #go through complied project list and remove files for proj, info in project_clean_list.iteritems(): proj_count += 1 if not misc.query_yes_no( "{}Delete files for this project ({}/{})".format( get_proj_meta_info(info, days_fastq), proj_count, len(project_clean_list)), default="no"): logger.info( "Will not remove files for project 
{}".format(proj)) filtered_project.append(proj) # remove projects that were decided not to delete map(project_clean_list.pop, filtered_project) logger.info("Removed {}/{} projects from initial list".format( len(filtered_project), proj_count)) if not project_clean_list: logger.info("There are no projects to clean after filtering") return logger.info("Final list is created with {} projects {}".format( len(project_clean_list), get_files_size_text(project_clean_list))) if not misc.query_yes_no("Proceed with cleanup ?", default="no"): logger.info("Aborting cleanup") return logger.info("Will start cleaning up project now") for proj, info in project_clean_list.iteritems(): fastq_info = info.get('fastq_to_remove') if fastq_info and isinstance(fastq_info, dict): logger.info("Cleaning fastq files for project {}".format(proj)) fastq_fc = fastq_info.get('flowcells', {}) removed_fc = [] for fc, fc_info in fastq_fc.iteritems(): proj_fc_root = fc_info['proj_root'] logger.info( "Removing fastq files from {}".format(proj_fc_root)) if not dry_run: if _remove_files(fc_info['fq_files']): logger.info( "Removed fastq files from FC {} for project {}, marking it as cleaned" .format(fc, proj)) _touch_cleaned(proj_fc_root) removed_fc.append(fc) if len(fastq_fc) == len(removed_fc): try: proj_data_root = fastq_info['proj_data']['proj_data_root'] logger.info( "All flowcells cleaned for this project, marking it as cleaned in {}" .format(proj_data_root)) _touch_cleaned(proj_data_root) except: pass analysis_info = info.get('analysis_to_remove') if analysis_info and isinstance(analysis_info, dict): proj_analysis_root = analysis_info['proj_analysis_root'] logger.info("cleaning analysis data for project {}".format(proj)) removed_qc = [] for qc, files in analysis_info['analysis_files'].iteritems(): logger.info("Removing files of '{}' from {}".format( qc, proj_analysis_root)) if not dry_run: if _remove_files(files): removed_qc.append(qc) else: logger.warn( "Couldn't remove some files in qc directory '{}'". format(qc)) map(analysis_info['analysis_files'].pop, removed_qc) if len(analysis_info['analysis_files']) == 0: logger.info( "Removed analysis data for project {}, marking it cleaned". format(proj)) _touch_cleaned(proj_analysis_root)
def compute_undetermined(self): """ This function returns true if all demux steps are done and we can proceed to QC For simple lanes with index: no check is done everything needs to be in place for complex lanes: no check is done everything needs to be in place for simple lanes and NoIndex: check if demux counts have been computed, if not compute or return waiting for thir completion """ NoIndexLanes = [ lane["Lane"] for lane in self.runParserObj.samplesheet.data if "NoIndex" in lane["index"] ] if len(NoIndexLanes) == 0: return True # everything is fine I can proceed to QC #otherwise proceed NoIndex_Undetermiend = os.path.join(self.run_dir, "Demultiplexing_NoIndex") if not os.path.exists(NoIndex_Undetermiend): #for these lanes I have no undetermiend as I demux them without index. #now geenrate the base masks per lane per_lane_base_masks = self._generate_per_lane_base_mask() #store here only the NoIndex lanes per_lane_base_masks_NoIndex = {} run_with_no_index = False # use this flag to check that we are not in the C.Daub case for NoIndexLane in NoIndexLanes: per_lane_base_masks_NoIndex[NoIndexLane] = per_lane_base_masks[ NoIndexLane] base_mask_key = per_lane_base_masks[NoIndexLane].keys()[0] new_base_mask = [] if len(per_lane_base_masks_NoIndex[NoIndexLane][base_mask_key] ['base_mask']): #C.Daub_15_01 case, only one sample per lane and no index at all run_with_no_index = True else: for baseMask_element in per_lane_base_masks_NoIndex[ NoIndexLane][base_mask_key]['base_mask']: if baseMask_element.startswith("Y"): new_base_mask.append( baseMask_element.replace("Y", "N")) elif baseMask_element.startswith("N"): new_base_mask.append( baseMask_element.replace("N", "Y")) per_lane_base_masks_NoIndex[NoIndexLane][base_mask_key][ 'base_mask'] = new_base_mask if not run_with_no_index: os.makedirs(NoIndex_Undetermiend) command = self._generate_bcl2fastq_command( per_lane_base_masks_NoIndex, True, "NoIndex", mask_short_adapter_reads=True) with chdir(self.run_dir): misc.call_external_command_detached(command, with_log_files=True, prefix="demux_NoIndex") #return false, as I need to wait to finish the demux for the NoIndex case return False else: #in this case I do not want to start a demux for th eindex, beceause I do not have the index at all #I need to softlink everythin else that is in Stats as I do not want to recompute it missingStatsFiles = glob.glob( os.path.join(self.run_dir, "Demultiplexing_0", "Stats", "*F*L*.txt")) destination = os.path.join(self.run_dir, self.demux_dir, "Stats") for source in missingStatsFiles: source_file_name = os.path.basename(source) if not os.path.exists( os.path.join(destination, source_file_name)): os.symlink(source, os.path.join(destination, source_file_name)) return True else: #in this case it means that I have already started to demux the NoIndex if not os.path.exists( os.path.join(self.run_dir, "Demultiplexing_NoIndex", 'Stats', 'DemultiplexingStats.xml')): #demultiplexing of undetermined is still ongoing logger.info("Demux of NoIndex lanes ongoing") return False else: logger.info("Demux of NoIndex lanes done.") #now I need to produce the files needed in the QC flag_file = os.path.join(NoIndex_Undetermiend, "ongoing") if os.path.exists(flag_file): #it means that a previous instance of TACA is running and still processing this FC logger.info( "Counting of undetermined indexes for NoIndex lanes ongoing" ) return False #now check if the stats have been already computed computed = True for lane_id in NoIndexLanes: demuxSummary_file = os.path.join( self.run_dir, 
self.demux_dir, "Stats", "DemuxSummaryF1L{}.txt".format(lane_id)) if not os.path.exists(demuxSummary_file): #if does not exists and the ongoing falg is not present, then I need to set computed to False computed = False if computed: #in this case I already computed all the demux stats that I need return True #otherwise I need to comput them open(flag_file, 'a').close( ) # create the flag file indicating I am working on this for lane_id in NoIndexLanes: #count the index occurences, each lane corresponds to one project, a project might have multiple lanes current_lane = [ lane for lane in self.runParserObj.samplesheet.data if lane_id == lane["Lane"] ][0] if current_lane["index"] != "NoIndex": logger.error( "while processing run {} NoIndex lane {}, index {} found in SampleSheet" .format(self.id, lane_id, current_lane["index"])) return False index_counter = {} indexes_fastq1 = glob.glob( os.path.join( NoIndex_Undetermiend, current_lane[ self.runParserObj.samplesheet.dfield_proj], current_lane[ self.runParserObj.samplesheet.dfield_sid], "{}_S?_L00{}_R2_001.fastq.gz".format( current_lane[ self.runParserObj.samplesheet.dfield_snm], lane_id)))[0] indexes_fastq2 = glob.glob( os.path.join( NoIndex_Undetermiend, current_lane[ self.runParserObj.samplesheet.dfield_proj], current_lane[ self.runParserObj.samplesheet.dfield_sid], "{}_S?_L00{}_R3_001.fastq.gz".format( current_lane[ self.runParserObj.samplesheet.dfield_snm], lane_id)))[0] # I assume these two files are always present, maybe it is posisble to have no index with a single index... logger.info( "Computing Undetermiend indexes for NoIndex lane {}". format(lane_id)) zcat = subprocess.Popen(['zcat', indexes_fastq1], stdout=subprocess.PIPE) #this command allows to steam two files, print them line after line separated by a plus awk = subprocess.Popen([ 'awk', 'BEGIN {{OFS="+"}}{{ ("zcat " "{0} " ) | getline line ; print $0,line }}' .format(indexes_fastq2) ], stdout=subprocess.PIPE, stdin=zcat.stdout) #now select only the 2nd line every 4 (i.e., only the index1+index2 line) sed = subprocess.Popen(['sed', '-n', "2~4p"], stdout=subprocess.PIPE, stdin=awk.stdout) zcat.stdout.close() awk.stdout.close() output = sed.communicate()[0] zcat.wait() awk.wait() for barcode in output.split('\n')[:-1]: try: index_counter[barcode] += 1 except KeyError: index_counter[barcode] = 1 demuxSummary_file = os.path.join( self.run_dir, self.demux_dir, "Stats", "DemuxSummaryF1L{}.txt".format(lane_id)) with open(demuxSummary_file, 'w') as demuxSummary_file_fh: demuxSummary_file_fh.write( "### Most Popular Unknown Index Sequences\n") demuxSummary_file_fh.write( "### Columns: Index_Sequence Hit_Count\n") for (index, occ) in sorted(index_counter.items(), key=operator.itemgetter(1), reverse=True): demuxSummary_file_fh.write("{}\t{}\n".format( index, occ)) #I need to fill in the lane and laneBarcode html reports when I demux with NoIndex I do not create many values undeterminedStats = DemuxSummaryParser( os.path.join(self.run_dir, self.demux_dir, "Stats")) sample_data_old = self.runParserObj.lanes.sample_data sample_data_new = [] for lane in sample_data_old: if lane["Lane"] in NoIndexLanes: #in this case I need to fill in new values PF_clusters = undeterminedStats.TOTAL[lane["Lane"]] lane["% One mismatchbarcode"] = '0' lane["% Perfectbarcode"] = '100' lane["% of thelane"] = '100' lane["PF Clusters"] = str(PF_clusters) sample_data_new.append(lane) self.runParserObj.lanes.sample_data = sample_data_new demux_folder = os.path.join(self.run_dir, "Demultiplexing") 
new_html_report_lane_dir = _create_folder_structure( demux_folder, ["Reports", "html", self.flowcell_id, "all", "all", "all"]) new_html_report_lane = os.path.join(new_html_report_lane_dir, "lane.html") _generate_lane_html(new_html_report_lane, self.runParserObj.lanes) #now do the same for laneBarcode sampleBarcode_data_old = self.runParserObj.lanebarcodes.sample_data sampleBarcode_data_new = [] for sample in sampleBarcode_data_old: if sample["Lane"] in NoIndexLanes: #in this case I need to fill in new values PF_clusters = undeterminedStats.TOTAL[lane["Lane"]] sample["% One mismatchbarcode"] = '0' sample["% Perfectbarcode"] = '100' sample["% of thelane"] = '100' sample["PF Clusters"] = str(PF_clusters) sampleBarcode_data_new.append(sample) self.runParserObj.lanebarcodes.sample_data = sampleBarcode_data_new demux_folder = os.path.join(self.run_dir, "Demultiplexing") new_html_report_sampleBarcode_dir = _create_folder_structure( demux_folder, ["Reports", "html", self.flowcell_id, "all", "all", "all"]) new_html_report_sampleBarcode = os.path.join( new_html_report_sampleBarcode_dir, "laneBarcode.html") _generate_lane_html(new_html_report_sampleBarcode, self.runParserObj.lanebarcodes) os.remove( flag_file ) # remove flag file to allow future iteration on this FC return True #return true, I have done everything I was supposed to do
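# compute_undetermined above tallies index1+index2 combinations by streaming the two index reads
# (demultiplexed as R2/R3) through zcat | awk | sed. The snippet below is a minimal pure-Python
# sketch of the same counting step, not the pipeline used above; the function name and the
# assumption that the index sequence sits on every 2nd line of each 4-line FASTQ record are
# illustrative only.
import gzip
from collections import Counter
from itertools import islice


def count_index_pairs_sketch(index1_fastq_gz, index2_fastq_gz):
    """Count 'index1+index2' combinations from two gzipped index-read FASTQ files (hypothetical)."""
    counter = Counter()
    with gzip.open(index1_fastq_gz, "rt") as fq1, gzip.open(index2_fastq_gz, "rt") as fq2:
        # FASTQ records are 4 lines long; the sequence is the 2nd line of each record.
        seqs1 = islice(fq1, 1, None, 4)
        seqs2 = islice(fq2, 1, None, 4)
        for s1, s2 in zip(seqs1, seqs2):
            counter["{}+{}".format(s1.strip(), s2.strip())] += 1
    return counter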
def demultiplex_run(self): """ Demultiplex a Xten run: - find the samplesheet - make a local copy of the samplesheet and name it SampleSheet.csv - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep) - run bcl2fastq conversion """ #we have 10x lane - need to split the samples sheet and build a 10x command for bcl2fastq Complex_run = False if len(self.lanes_10X) and len(self.lanes_not_10X): Complex_run = True if Complex_run: with chdir(self.run_dir): samplesheet_dest_not_10X = "SampleSheet_0.csv" with open(samplesheet_dest_not_10X, 'wb') as fcd: fcd.write( _generate_samplesheet_subset( self.runParserObj.samplesheet, self.lanes_not_10X)) samplesheet_dest_10X = "SampleSheet_1.csv" with open(samplesheet_dest_10X, 'wb') as fcd: fcd.write( _generate_samplesheet_subset( self.runParserObj.samplesheet, self.lanes_10X)) else: with chdir(self.run_dir): samplesheet_dest = "SampleSheet_0.csv" with open(samplesheet_dest, 'wb') as fcd: fcd.write( _generate_samplesheet_subset( self.runParserObj.samplesheet, (self.lanes_10X or self.lanes_not_10X))) per_lane_base_masks = self._generate_per_lane_base_mask() max_different_base_masks = max([ len(per_lane_base_masks[base_masks]) for base_masks in per_lane_base_masks ]) if max_different_base_masks > 1: # in a HiSeqX run I cannot have different index sizes in the SAME lane logger.error( "In FC {} found one or more lane with more than one base mask (i.e., different index sizes in \ in the same lane".format(self.id)) return False bcl2fastq_cmd_counter = 0 with chdir(self.run_dir): # create Demultiplexing dir, this changes the status to IN_PROGRESS if not os.path.exists("Demultiplexing"): os.makedirs("Demultiplexing") with chdir(self.run_dir): if self.lanes_not_10X: cmd_normal = self.generate_bcl_command(self.lanes_not_10X, bcl2fastq_cmd_counter) misc.call_external_command_detached( cmd_normal, with_log_files=True, prefix="demux_{}".format(bcl2fastq_cmd_counter)) logger.info( ("BCL to FASTQ conversion and demultiplexing started for " "normal run {} on {}".format(os.path.basename(self.id), datetime.now()))) bcl2fastq_cmd_counter += 1 if self.lanes_10X: cmd_10X = self.generate_bcl_command(self.lanes_10X, bcl2fastq_cmd_counter, is_10X=True) misc.call_external_command_detached( cmd_10X, with_log_files=True, prefix="demux_{}".format(bcl2fastq_cmd_counter)) logger.info( ("BCL to FASTQ conversion and demultiplexing started for " "10X run {} on {}".format(os.path.basename(self.id), datetime.now()))) bcl2fastq_cmd_counter += 1 return True
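# _generate_samplesheet_subset is called above to write SampleSheet_0.csv / SampleSheet_1.csv but is
# not shown here. The sketch below illustrates the idea only -- keep the [Data] rows whose Lane is
# in the requested set. The input layout (a list of header lines plus data rows as dicts keyed by
# column name) is an assumption for illustration; the real samplesheet parser object may differ.
def generate_samplesheet_subset_sketch(header_lines, data_rows, lanes):
    """Return samplesheet text restricted to rows whose 'Lane' value is in `lanes` (hypothetical)."""
    wanted = set(str(lane) for lane in lanes)
    columns = list(data_rows[0].keys()) if data_rows else []
    out = list(header_lines)
    out.append("[Data]")
    out.append(",".join(columns))
    for row in data_rows:
        if str(row.get("Lane")) in wanted:
            out.append(",".join(str(row[c]) for c in columns))
    return "\n".join(out) + "\n"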
def encrypt_runs(cls, run, force): """Encrypt the runs that have been collected""" bk = cls(run) bk.collect_runs(ext=".tar.gz") logger.info("In total, found {} run(s) to be encrypted".format( len(bk.runs))) for run in bk.runs: run.flag = "{}.encrypting".format(run.name) run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted) tmp_files = [ run.zip_encrypted, run.key_encrypted, run.key, run.flag ] logger.info("Encryption of run {} is now started".format(run.name)) # Check if there is enough space and exit if not bk.avail_disk_space(run.path, run.name) # Check if the run in demultiplexed if not force and bk.check_demux: if not misc.run_is_demuxed(run.name, bk.couch_info): logger.warn( "Run {} is not demultiplexed yet, so skipping it". format(run.name)) continue logger.info( "Run {} is demultiplexed and proceeding with encryption". format(run.name)) with filesystem.chdir(run.path): # skip run if already ongoing if os.path.exists(run.flag): logger.warn( "Run {} is already being encrypted, so skipping now". format(run.name)) continue flag = open(run.flag, 'w').close() # zip the run directory if os.path.exists(run.zip): if os.path.isdir(run.name): logger.warn( "Both run source and zipped archive exist for run {}, skipping run as precaution" .format(run.name)) bk._clean_tmp_files([run.flag]) continue logger.info( "Zipped archive already exist for run {}, so using it for encryption" .format(run.name)) else: logger.info("Creating zipped archive for run {}".format( run.name)) if bk._call_commands(cmd1="tar -cf - {}".format(run.name), cmd2="pigz --fast -c -", out_file=run.zip, mail_failed=True, tmp_files=[run.zip, run.flag]): logger.info( "Run {} was successfully compressed, so removing the run source directory" .format(run.name)) shutil.rmtree(run.name) else: logger.warn("Skipping run {} and moving on".format( run.name)) continue # Remove encrypted file if already exists if os.path.exists(run.zip_encrypted): logger.warn(( "Removing already existing encrypted file for run {}, this is a precaution " "to make sure the file was encrypted with correct key file" .format(run.name))) bk._clean_tmp_files([ run.zip_encrypted, run.key, run.key_encrypted, run.dst_key_encrypted ]) # Generate random key to use as pasphrase if not bk._call_commands(cmd1="gpg --gen-random 1 256", out_file=run.key, tmp_files=tmp_files): logger.warn("Skipping run {} and moving on".format( run.name)) continue logger.info("Generated randon phrase key for run {}".format( run.name)) # Calculate md5 sum pre encryption if not force: logger.info("Calculating md5sum before encryption") md5_call, md5_out = bk._call_commands( cmd1="md5sum {}".format(run.zip), return_out=True, tmp_files=tmp_files) if not md5_call: logger.warn("Skipping run {} and moving on".format( run.name)) continue md5_pre_encrypt = md5_out.split()[0] # Encrypt the zipped run file logger.info("Encrypting the zipped run file") if not bk._call_commands( cmd1= ("gpg --symmetric --cipher-algo aes256 --passphrase-file {} --batch --compress-algo " "none -o {} {}".format(run.key, run.zip_encrypted, run.zip)), tmp_files=tmp_files): logger.warn("Skipping run {} and moving on".format( run.name)) continue # Decrypt and check for md5 if not force: logger.info("Calculating md5sum after encryption") md5_call, md5_out = bk._call_commands( cmd1= "gpg --decrypt --cipher-algo aes256 --passphrase-file {} --batch {}" .format(run.key, run.zip_encrypted), cmd2="md5sum", return_out=True, tmp_files=tmp_files) if not md5_call: logger.warn("Skipping run {} and moving on".format( 
run.name)) continue md5_post_encrypt = md5_out.split()[0] if md5_pre_encrypt != md5_post_encrypt: logger.error(( "md5sum did not match before {} and after {} encryption. Will remove temp files and " "move on".format(md5_pre_encrypt, md5_post_encrypt))) bk._clean_tmp_files(tmp_files) continue logger.info( "Md5sum matches before and after encryption") # Encrypt and move the key file if bk._call_commands(cmd1="gpg -e -r {} -o {} {}".format( bk.gpg_receiver, run.key_encrypted, run.key), tmp_files=tmp_files): shutil.move(run.key_encrypted, run.dst_key_encrypted) else: logger.error("Encryption of key file failed, skipping run") continue bk._clean_tmp_files([run.zip, run.key, run.flag]) logger.info( "Encryption of run {} completed successfully, removing zipped run file" .format(run.name))
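# encrypt_runs relies on bk._call_commands to chain two shell commands (e.g. "tar -cf - <run>" piped
# into "pigz --fast -c -") and write the result to an output file. The snippet below is a minimal
# subprocess sketch of that piping pattern only; the error handling, mailing and temp-file cleanup
# of the real helper are omitted and the function name is hypothetical.
import shlex
import subprocess


def pipe_two_commands_sketch(cmd1, cmd2, out_file):
    """Run `cmd1 | cmd2 > out_file` and return True if both commands exit cleanly (hypothetical)."""
    with open(out_file, "wb") as out:
        p1 = subprocess.Popen(shlex.split(cmd1), stdout=subprocess.PIPE)
        p2 = subprocess.Popen(shlex.split(cmd2), stdin=p1.stdout, stdout=out)
        p1.stdout.close()  # let cmd1 receive SIGPIPE if cmd2 exits early
        rc2 = p2.wait()
        rc1 = p1.wait()
    return rc1 == 0 and rc2 == 0

# Example, mirroring the compression call above:
# pipe_two_commands_sketch("tar -cf - {}".format(run_name), "pigz --fast -c -", zip_path)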
def demultiplex_run(self): """ Demultiplex a Xten run: - find the samplesheet - make a local copy of the samplesheet and name it SampleSheet.csv - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep) - run bcl2fastq conversion """ ssname = self._get_samplesheet() ssparser = SampleSheetParser(ssname) try: indexfile = self.CONFIG['bcl2fastq']['index_path'] except KeyError: logger.error( "Path to index file (10X) not found in the config file") raise RuntimeError #samplesheet need to be positioned in the FC directory with name SampleSheet.csv (Illumina default) #if this is not the case then create it and take special care of modification to be done on the SampleSheet samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv") #Function that returns a list of which lanes contains 10X samples. (lanes_10X, lanes_not_10X) = look_for_lanes_with_10X_indicies( indexfile, ssparser) #check that the samplesheet is not already present. In this case go the next step if not os.path.exists(samplesheet_dest): try: with open(samplesheet_dest, 'wb') as fcd: fcd.write( _generate_clean_samplesheet( ssparser, indexfile, fields_to_remove=['index2'], rename_samples=True, rename_qPCR_suffix=True, fields_qPCR=[ssparser.dfield_snm])) except Exception as e: logger.error( "encountered the following exception '{}'".format(e)) return False logger.info( ("Created SampleSheet.csv for Flowcell {} in {} ".format( self.id, samplesheet_dest))) ##SampleSheet.csv generated ##when demultiplexing SampleSheet.csv is the one I need to use ## Need to rewrite so that SampleSheet_0.csv is always used. self.runParserObj.samplesheet = SampleSheetParser( os.path.join(self.run_dir, "SampleSheet.csv")) #we have 10x lane - need to split the samples sheet and build a 10x command for bcl2fastq Complex_run = False if len(lanes_10X) and len(lanes_not_10X): Complex_run = True if Complex_run: with chdir(self.run_dir): samplesheet_dest_not_10X = "SampleSheet_0.csv" with open(samplesheet_dest_not_10X, 'wb') as fcd: fcd.write( _generate_samplesheet_subset( self.runParserObj.samplesheet, lanes_not_10X)) samplesheet_dest_10X = "SampleSheet_1.csv" with open(samplesheet_dest_10X, 'wb') as fcd: fcd.write( _generate_samplesheet_subset( self.runParserObj.samplesheet, lanes_10X)) else: with chdir(self.run_dir): shutil.copy("SampleSheet.csv", "SampleSheet_0.csv") per_lane_base_masks = self._generate_per_lane_base_mask() max_different_base_masks = max([ len(per_lane_base_masks[base_masks]) for base_masks in per_lane_base_masks ]) if max_different_base_masks > 1: # in a HiSeqX run I cannot have different index sizes in the SAME lane logger.error( "In FC {} found one or more lane with more than one base mask (i.e., different index sizes in \ in the same lane".format(self.id)) return False bcl2fastq_cmd_counter = 0 with chdir(self.run_dir): # create Demultiplexing dir, this changes the status to IN_PROGRESS if not os.path.exists("Demultiplexing"): os.makedirs("Demultiplexing") with chdir(self.run_dir): if lanes_not_10X: cmd_normal = self.generate_bcl_command(lanes_not_10X, bcl2fastq_cmd_counter) misc.call_external_command_detached( cmd_normal, with_log_files=True, prefix="demux_{}".format(bcl2fastq_cmd_counter)) logger.info( ("BCL to FASTQ conversion and demultiplexing started for " "normal run {} on {}".format(os.path.basename(self.id), datetime.now()))) bcl2fastq_cmd_counter += 1 if lanes_10X: cmd_10X = self.generate_bcl_command(lanes_10X, bcl2fastq_cmd_counter, is_10X=True) misc.call_external_command_detached( cmd_10X, 
with_log_files=True, prefix="demux_{}".format(bcl2fastq_cmd_counter)) logger.info( ("BCL to FASTQ conversion and demultiplexing started for " "10X run {} on {}".format(os.path.basename(self.id), datetime.now()))) bcl2fastq_cmd_counter += 1 return True
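# look_for_lanes_with_10X_indicies is used above to split the flowcell into 10X and non-10X lanes
# based on an index file from the config. The sketch below shows one way such a classification
# could work; the index-file format (one 10X index sequence per line) and the samplesheet row
# layout (dicts with 'Lane' and 'index' keys) are assumptions for illustration.
def split_lanes_by_10x_sketch(indexfile, samplesheet_rows):
    """Return (lanes_10X, lanes_not_10X) depending on whether a lane uses a 10X index (hypothetical)."""
    with open(indexfile) as fh:
        tenx_indexes = set(line.strip() for line in fh if line.strip())
    lanes_10X, lanes_not_10X = set(), set()
    for row in samplesheet_rows:
        if row.get("index", "") in tenx_indexes:
            lanes_10X.add(row["Lane"])
        else:
            lanes_not_10X.add(row["Lane"])
    return sorted(lanes_10X), sorted(lanes_not_10X)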
def compute_undetermined(self): """ This function returns true if all demux steps are done and we can proceed to QC For simple lanes with index: no check is done everything needs to be in place for complex lanes: no check is done everything needs to be in place for simple lanes and NoIndex: check if demux counts have been computed, if not compute or return waiting for thir completion """ NoIndexLanes = [lane["Lane"] for lane in self.runParserObj.samplesheet.data if "NoIndex" in lane["index"]] if len(NoIndexLanes) == 0: return True # everything is fine I can proceed to QC # otherwise proceed NoIndex_Undetermiend = os.path.join(self.run_dir, "Demultiplexing_NoIndex") if not os.path.exists(NoIndex_Undetermiend): # for these lanes I have no undetermiend as I demux them without index. # now geenrate the base masks per lane per_lane_base_masks = self._generate_per_lane_base_mask() # store here only the NoIndex lanes per_lane_base_masks_NoIndex = {} run_with_no_index = False # use this flag to check that we are not in the C.Daub case for NoIndexLane in NoIndexLanes: per_lane_base_masks_NoIndex[NoIndexLane] = per_lane_base_masks[NoIndexLane] base_mask_key = per_lane_base_masks[NoIndexLane].keys()[0] new_base_mask = [] if len(per_lane_base_masks_NoIndex[NoIndexLane][base_mask_key]["base_mask"]): # C.Daub_15_01 case, only one sample per lane and no index at all run_with_no_index = True else: for baseMask_element in per_lane_base_masks_NoIndex[NoIndexLane][base_mask_key]["base_mask"]: if baseMask_element.startswith("Y"): new_base_mask.append(baseMask_element.replace("Y", "N")) elif baseMask_element.startswith("N"): new_base_mask.append(baseMask_element.replace("N", "Y")) per_lane_base_masks_NoIndex[NoIndexLane][base_mask_key]["base_mask"] = new_base_mask if not run_with_no_index: os.makedirs(NoIndex_Undetermiend) command = self._generate_bcl2fastq_command( per_lane_base_masks_NoIndex, True, "NoIndex", mask_short_adapter_reads=True ) with chdir(self.run_dir): misc.call_external_command_detached(command, with_log_files=True, prefix="demux_NoIndex") # return false, as I need to wait to finish the demux for the NoIndex case return False else: # in this case I do not want to start a demux for th eindex, beceause I do not have the index at all # I need to softlink everythin else that is in Stats as I do not want to recompute it missingStatsFiles = glob.glob(os.path.join(self.run_dir, "Demultiplexing_0", "Stats", "*F*L*.txt")) destination = os.path.join(self.run_dir, self.demux_dir, "Stats") for source in missingStatsFiles: source_file_name = os.path.basename(source) if not os.path.exists(os.path.join(destination, source_file_name)): os.symlink(source, os.path.join(destination, source_file_name)) return True else: # in this case it means that I have already started to demux the NoIndex if not os.path.exists( os.path.join(self.run_dir, "Demultiplexing_NoIndex", "Stats", "DemultiplexingStats.xml") ): # demultiplexing of undetermined is still ongoing logger.info("Demux of NoIndex lanes ongoing") return False else: logger.info("Demux of NoIndex lanes done.") # now I need to produce the files needed in the QC flag_file = os.path.join(NoIndex_Undetermiend, "ongoing") if os.path.exists(flag_file): # it means that a previous instance of TACA is running and still processing this FC logger.info("Counting of undetermined indexes for NoIndex lanes ongoing") return False # now check if the stats have been already computed computed = True for lane_id in NoIndexLanes: demuxSummary_file = os.path.join( self.run_dir, 
self.demux_dir, "Stats", "DemuxSummaryF1L{}.txt".format(lane_id) ) if not os.path.exists(demuxSummary_file): # if does not exists and the ongoing falg is not present, then I need to set computed to False computed = False if computed: # in this case I already computed all the demux stats that I need return True # otherwise I need to comput them open(flag_file, "a").close() # create the flag file indicating I am working on this for lane_id in NoIndexLanes: # count the index occurences, each lane corresponds to one project, a project might have multiple lanes current_lane = [lane for lane in self.runParserObj.samplesheet.data if lane_id == lane["Lane"]][0] if current_lane["index"] != "NoIndex": logger.error( "while processing run {} NoIndex lane {}, index {} found in SampleSheet".format( self.id, lane_id, current_lane["index"] ) ) return False index_counter = {} indexes_fastq1 = glob.glob( os.path.join( NoIndex_Undetermiend, current_lane[self.runParserObj.samplesheet.dfield_proj], current_lane[self.runParserObj.samplesheet.dfield_sid], "{}_S?_L00{}_R2_001.fastq.gz".format( current_lane[self.runParserObj.samplesheet.dfield_snm], lane_id ), ) )[0] indexes_fastq2 = glob.glob( os.path.join( NoIndex_Undetermiend, current_lane[self.runParserObj.samplesheet.dfield_proj], current_lane[self.runParserObj.samplesheet.dfield_sid], "{}_S?_L00{}_R3_001.fastq.gz".format( current_lane[self.runParserObj.samplesheet.dfield_snm], lane_id ), ) )[0] # I assume these two files are always present, maybe it is posisble to have no index with a single index... logger.info("Computing Undetermiend indexes for NoIndex lane {}".format(lane_id)) zcat = subprocess.Popen(["zcat", indexes_fastq1], stdout=subprocess.PIPE) # this command allows to steam two files, print them line after line separated by a plus awk = subprocess.Popen( [ "awk", 'BEGIN {{OFS="+"}}{{ ("zcat " "{0} " ) | getline line ; print $0,line }}'.format( indexes_fastq2 ), ], stdout=subprocess.PIPE, stdin=zcat.stdout, ) # now select only the 2nd line every 4 (i.e., only the index1+index2 line) sed = subprocess.Popen(["sed", "-n", "2~4p"], stdout=subprocess.PIPE, stdin=awk.stdout) zcat.stdout.close() awk.stdout.close() output = sed.communicate()[0] zcat.wait() awk.wait() for barcode in output.split("\n")[:-1]: try: index_counter[barcode] += 1 except KeyError: index_counter[barcode] = 1 demuxSummary_file = os.path.join( self.run_dir, self.demux_dir, "Stats", "DemuxSummaryF1L{}.txt".format(lane_id) ) with open(demuxSummary_file, "w") as demuxSummary_file_fh: demuxSummary_file_fh.write("### Most Popular Unknown Index Sequences\n") demuxSummary_file_fh.write("### Columns: Index_Sequence Hit_Count\n") for (index, occ) in sorted(index_counter.items(), key=operator.itemgetter(1), reverse=True): demuxSummary_file_fh.write("{}\t{}\n".format(index, occ)) # I need to fill in the lane and laneBarcode html reports when I demux with NoIndex I do not create many values undeterminedStats = DemuxSummaryParser(os.path.join(self.run_dir, self.demux_dir, "Stats")) sample_data_old = self.runParserObj.lanes.sample_data sample_data_new = [] for lane in sample_data_old: if lane["Lane"] in NoIndexLanes: # in this case I need to fill in new values PF_clusters = undeterminedStats.TOTAL[lane["Lane"]] lane["% One mismatchbarcode"] = "0" lane["% Perfectbarcode"] = "100" lane["% of thelane"] = "100" lane["PF Clusters"] = str(PF_clusters) sample_data_new.append(lane) self.runParserObj.lanes.sample_data = sample_data_new demux_folder = os.path.join(self.run_dir, "Demultiplexing") 
new_html_report_lane_dir = _create_folder_structure( demux_folder, ["Reports", "html", self.flowcell_id, "all", "all", "all"] ) new_html_report_lane = os.path.join(new_html_report_lane_dir, "lane.html") _generate_lane_html(new_html_report_lane, self.runParserObj.lanes) # now do the same for laneBarcode sampleBarcode_data_old = self.runParserObj.lanebarcodes.sample_data sampleBarcode_data_new = [] for sample in sampleBarcode_data_old: if sample["Lane"] in NoIndexLanes: # in this case I need to fill in new values PF_clusters = undeterminedStats.TOTAL[lane["Lane"]] sample["% One mismatchbarcode"] = "0" sample["% Perfectbarcode"] = "100" sample["% of thelane"] = "100" sample["PF Clusters"] = str(PF_clusters) sampleBarcode_data_new.append(sample) self.runParserObj.lanebarcodes.sample_data = sampleBarcode_data_new demux_folder = os.path.join(self.run_dir, "Demultiplexing") new_html_report_sampleBarcode_dir = _create_folder_structure( demux_folder, ["Reports", "html", self.flowcell_id, "all", "all", "all"] ) new_html_report_sampleBarcode = os.path.join(new_html_report_sampleBarcode_dir, "laneBarcode.html") _generate_lane_html(new_html_report_sampleBarcode, self.runParserObj.lanebarcodes) os.remove(flag_file) # remove flag file to allow future iteration on this FC return True # return true, I have done everything I was supposed to do
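# DemuxSummaryParser is used above to recover per-lane totals from the DemuxSummaryF1L<lane>.txt
# files written a few lines earlier. The sketch below only reads back the file format produced
# above (two '###' header lines, then tab-separated index/count rows) and sums the hit counts; it
# is an illustration, not the parser used by the pipeline.
def sum_demux_summary_hits_sketch(demux_summary_path):
    """Sum the Hit_Count column of a DemuxSummaryF1L<lane>.txt file (hypothetical helper)."""
    total = 0
    with open(demux_summary_path) as fh:
        for line in fh:
            if line.startswith("###") or not line.strip():
                continue
            _index, count = line.rstrip("\n").split("\t")
            total += int(count)
    return total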
def demultiplex_run(self): """ Demultiplex a HiSeq run: - find the samplesheet - make a local copy of the samplesheet and name it SampleSheet.csv - create multiple SampleSheets in case at least one lane have multiple indexes lengths - run bcl2fastq conversion """ ssname = self._get_samplesheet() if ssname is None: return None ssparser = SampleSheetParser(ssname) # Copy the original samplesheet locally. Copy again if already done as there might have been changes to the samplesheet try: shutil.copy(ssname, os.path.join(self.run_dir, "{}.csv".format(self.flowcell_id))) ssname = os.path.join(self.run_dir, os.path.split(ssname)[1]) except: raise RuntimeError("unable to copy file {} to destination {}".format(ssname, self.run_dir)) # this sample sheet has been created by the LIMS and copied by a sequencing operator. It is not ready # to be used it needs some editing # this will contain the samplesheet with all the renaiming to be used with bcl2fastq-2.17 samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv") # check that the samplesheet is not already present. In this case go the next step if os.path.exists(samplesheet_dest): logger.info("SampleSheet.csv found ... overwriting it") try: with open(samplesheet_dest, "wb") as fcd: fcd.write(self._generate_clean_samplesheet(ssparser)) except Exception as e: logger.error(e.text) return False logger.info(("Created SampleSheet.csv for Flowcell {} in {} ".format(self.id, samplesheet_dest))) ##SampleSheet.csv generated ##when demultiplexing SampleSheet.csv is the one I need to use self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, "SampleSheet.csv")) # now geenrate the base masks per lane and decide how to demultiplex per_lane_base_masks = self._generate_per_lane_base_mask() max_different_base_masks = max([len(per_lane_base_masks[base_masks]) for base_masks in per_lane_base_masks]) # if max_different is one, then I have a simple config and I can run a single command. Otherwirse I need to run multiples instances # extract lanes with a single base masks simple_lanes = {} complex_lanes = {} for lane in per_lane_base_masks: if len(per_lane_base_masks[lane]) == 1: simple_lanes[lane] = per_lane_base_masks[lane] else: complex_lanes[lane] = per_lane_base_masks[lane] # simple lanes contains the lanes such that there is more than one base mask bcl2fastq_commands = [] bcl2fastq_command_num = 0 if len(simple_lanes) > 0: bcl2fastq_commands.append(self._generate_bcl2fastq_command(simple_lanes, True, bcl2fastq_command_num)) bcl2fastq_command_num += 1 # compute the different masks, there will be one bcl2fastq command per mask base_masks_complex = [complex_lanes[base_masks].keys() for base_masks in complex_lanes] different_masks = list(set([item for sublist in base_masks_complex for item in sublist])) for mask in different_masks: base_masks_complex_to_demux = {} for lane in complex_lanes: if complex_lanes[lane].has_key(mask): base_masks_complex_to_demux[lane] = {} base_masks_complex_to_demux[lane][mask] = complex_lanes[lane][mask] # at this point base_masks_complex_to_demux contains only a base mask for lane. I can build the command bcl2fastq_commands.append( self._generate_bcl2fastq_command(base_masks_complex_to_demux, True, bcl2fastq_command_num) ) bcl2fastq_command_num += 1 # now bcl2fastq_commands contains all command to be executed. 
They could be executed in parallel, but run only one at a time to avoid overloading the machine with chdir(self.run_dir): # create Demultiplexing dir, in this way the status of this run will become IN_PROGRESS if not os.path.exists("Demultiplexing"): os.makedirs("Demultiplexing") execution = 0 for bcl2fastq_command in bcl2fastq_commands: misc.call_external_command_detached( bcl2fastq_command, with_log_files=True, prefix="demux_{}".format(execution) ) execution += 1
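# The HiSeq demultiplexing above regroups "complex" lanes (lanes carrying more than one base mask)
# so that each bcl2fastq invocation handles exactly one mask per lane. The sketch below mirrors
# that regrouping loop with plain dicts of the same shape ({lane: {mask_key: mask_info}}); it adds
# nothing new and is only meant to make the grouping step explicit.
def group_complex_lanes_by_mask_sketch(complex_lanes):
    """Return {mask_key: {lane: {mask_key: mask_info}}}, one entry per bcl2fastq call."""
    per_mask = {}
    for lane, masks in complex_lanes.items():
        for mask_key, mask_info in masks.items():
            per_mask.setdefault(mask_key, {})[lane] = {mask_key: mask_info}
    return per_mask

# Each value of the returned dict corresponds to one base_masks_complex_to_demux dict above and
# could be handed to _generate_bcl2fastq_command in turn.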
def demultiplex_run(self): """ Demultiplex a Xten run: - find the samplesheet - make a local copy of the samplesheet and name it SampleSheet.csv - define if necessary the bcl2fastq commands (if indexes are not of size 8, i.e. neoprep) - run bcl2fastq conversion """ ssname = self._get_samplesheet() ssparser = SampleSheetParser(ssname) #samplesheet need to be positioned in the FC directory with name SampleSheet.csv (Illumina default) #if this is not the case then create it and take special care of modification to be done on the SampleSheet samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv") #check that the samplesheet is not already present. In this case go the next step if not os.path.exists(samplesheet_dest): try: with open(samplesheet_dest, 'wb') as fcd: fcd.write( _generate_clean_samplesheet( ssparser, fields_to_remove=['index2'], rename_samples=True, rename_qPCR_suffix=True, fields_qPCR=[ssparser.dfield_snm])) except Exception as e: logger.error(e.text) return False logger.info( ("Created SampleSheet.csv for Flowcell {} in {} ".format( self.id, samplesheet_dest))) ##SampleSheet.csv generated ##when demultiplexing SampleSheet.csv is the one I need to use self.runParserObj.samplesheet = SampleSheetParser( os.path.join(self.run_dir, "SampleSheet.csv")) per_lane_base_masks = self._generate_per_lane_base_mask() max_different_base_masks = max([ len(per_lane_base_masks[base_masks]) for base_masks in per_lane_base_masks ]) if max_different_base_masks > 1: # in a HiSeqX run I cannot have different index sizes in the SAME lane logger.error( "In FC {} found one or more lane with more than one base mask (i.e., different index sizes in \ in the same lane".format(self.id)) return False #I have everything to run demultiplexing now. logger.info('Building bcl2fastq command') with chdir(self.run_dir): cl = [self.CONFIG.get('bcl2fastq')['bin']] if self.CONFIG.get('bcl2fastq').has_key('options'): cl_options = self.CONFIG['bcl2fastq']['options'] # Append all options that appear in the configuration file to the main command. for option in cl_options: if isinstance(option, dict): opt, val = option.items()[0] cl.extend(['--{}'.format(opt), str(val)]) else: cl.append('--{}'.format(option)) #now add the base_mask for each lane for lane in sorted(per_lane_base_masks): #iterate thorugh each lane and add the correct --use-bases-mask for that lane #there is a single basemask for each lane, I checked it a couple of lines above base_mask = [ per_lane_base_masks[lane][bm]['base_mask'] for bm in per_lane_base_masks[lane] ][0] # get the base_mask base_mask_expr = "{}:".format(lane) + ",".join(base_mask) cl.extend(["--use-bases-mask", base_mask_expr]) logger.info( ("BCL to FASTQ conversion and demultiplexing started for " " run {} on {}".format(os.path.basename(self.id), datetime.now()))) misc.call_external_command_detached(cl, with_log_files=True) return True
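# The Xten command construction above appends one "--use-bases-mask <lane>:<mask>" pair per lane.
# The sketch below isolates that composition step; the shape of per_lane_base_masks
# ({lane: {mask_key: {'base_mask': [...]}}}) is inferred from the loop above and from the
# single-mask-per-lane invariant checked earlier in the function.
def build_use_bases_mask_args_sketch(per_lane_base_masks):
    """Return the list of --use-bases-mask arguments, one per lane (illustration only)."""
    args = []
    for lane in sorted(per_lane_base_masks):
        # one base mask per lane is assumed, as enforced by the max_different_base_masks check
        mask_info = list(per_lane_base_masks[lane].values())[0]
        args.extend(["--use-bases-mask",
                     "{}:{}".format(lane, ",".join(mask_info["base_mask"]))])
    return args

# e.g. build_use_bases_mask_args_sketch({"1": {"Y151I8": {"base_mask": ["Y151", "I8", "Y151"]}}})
# -> ['--use-bases-mask', '1:Y151,I8,Y151']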
def cleanup_irma(days_fastq, days_analysis, only_fastq, only_analysis, status_db_config, dry_run=False): """Remove fastq/analysis data for projects that have been closed more than given days (as days_fastq/days_analysis) from the given 'irma' cluster :param int days_fastq: Days to consider to remove fastq files for project :param int days_analysis: Days to consider to remove analysis data for project :param bool only_fastq: Remove only fastq files for closed projects :param bool only_analysis: Remove only analysis data for closed projects :param bool dry_run: Will summarize what is going to be done without really doing it Example for mat for config file cleanup: irma: flowcell: ##this path is nothing but incoming directory, can given multiple paths root: - path/to/flowcells_dir relative_project_source: Demultiplexing ##this is path where projects are organized data_dir: path/to/data_dir analysis: ##directory where analysis are perfoemed for projects root: path/to/analysis_dir #should be exactly same as the qc folder name and files wished to be removed files_to_remove: piper_ngi: - "*.bam" """ try: config = CONFIG['cleanup']['irma'] flowcell_dir_root = config['flowcell']['root'] flowcell_project_source = config['flowcell']['relative_project_source'] data_dir = config['data_dir'] analysis_dir = config['analysis']['root'] analysis_data_to_remove = config['analysis']['files_to_remove'] except KeyError as e: logger.error("Config file is missing the key {}, make sure it have all required information".format(str(e))) raise SystemExit # make a connection for project db # pcon = statusdb.ProjectSummaryConnection(conf=status_db_config) assert pcon, "Could not connect to project database in StatusDB" #compile list for project to delete project_clean_list, project_processed_list = ({}, []) logger.info("Building initial project list for removing data..") if only_fastq: logger.info("Option 'only_fastq' is given, so will not look for analysis data") elif only_analysis: logger.info("Option 'only_analysis' is given, so will not look for fastq data") if only_analysis: for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and \ not os.path.exists(os.path.join(analysis_dir, d, "cleaned"))]: proj_abs_path = os.path.join(analysis_dir, pid) proj_info = get_closed_proj_info(pid, pcon.get_entry(pid, use_id_view=True)) if proj_info and proj_info['closed_days'] >= days_analysis: analysis_data, analysis_size = collect_analysis_data_irma(pid, analysis_dir, analysis_data_to_remove) proj_info['analysis_to_remove'] = analysis_data proj_info['analysis_size'] = analysis_size proj_info['fastq_to_remove'] = "not_selected" proj_info['fastq_size'] = 0 project_clean_list[proj_info['name']] = proj_info else: for flowcell_dir in flowcell_dir_root: for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE,d)]: fc_abs_path = os.path.join(flowcell_dir, fc) with filesystem.chdir(fc_abs_path): projects_in_fc = [d for d in os.listdir(flowcell_project_source) \ if re.match(r'^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$',d) and \ not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))] for _proj in projects_in_fc: proj = re.sub(r'_+', '.', _proj, 1) # if a project is already processed no need of fetching it again from status db if proj in project_processed_list: # if the project is closed more than threshold days collect the fastq files from FC # no need of looking for analysis data as they would have been collected in the first time if proj in project_clean_list and 
project_clean_list[proj]['closed_days'] >= days_fastq: fc_fq_files, fq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj)) project_clean_list[proj]['fastq_to_remove']['flowcells'][fc] = fc_fq_files['flowcells'][fc] project_clean_list[proj]['fastq_size'] += fq_size continue project_processed_list.append(proj) #by default assume all projects are not old enough for delete fastq_data, analysis_data = ("young", "young") fastq_size, analysis_size = (0, 0) proj_info = get_closed_proj_info(proj, pcon.get_entry(proj)) if proj_info: # if project not old enough for fastq files and only fastq files selected move on to next project if proj_info['closed_days'] >= days_fastq: fastq_data, fastq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj), data_dir, proj_info['pid']) if not only_fastq: # if project is old enough for fastq files and not 'only_fastq' try collect analysis files if proj_info['closed_days'] >= days_analysis: analysis_data, analysis_size = collect_analysis_data_irma(proj_info['pid'], analysis_dir, analysis_data_to_remove) # if both fastq and analysis files are not old enough move on if (analysis_data == fastq_data) or ((not analysis_data or analysis_data == "cleaned") and fastq_data == "young"): continue elif fastq_data == "young": continue else: analysis_data = "not_selected" proj_info['fastq_to_remove'] = fastq_data proj_info['fastq_size'] = fastq_size proj_info['analysis_to_remove'] = analysis_data proj_info['analysis_size'] = analysis_size project_clean_list[proj] = proj_info if not project_clean_list: logger.info("There are no projects to clean") return get_files_size_text(project_clean_list) logger.info("Initial list is built with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list))) if misc.query_yes_no("Interactively filter projects for cleanup ?", default="yes"): filtered_project, proj_count = ([], 0) #go through complied project list and remove files for proj, info in project_clean_list.iteritems(): proj_count += 1 if not misc.query_yes_no("{}Delete files for this project ({}/{})".format(get_proj_meta_info(info, days_fastq), proj_count, len(project_clean_list)), default="no"): logger.info("Will not remove files for project {}".format(proj)) filtered_project.append(proj) # remove projects that were decided not to delete map(project_clean_list.pop, filtered_project) logger.info("Removed {}/{} projects from initial list".format(len(filtered_project), proj_count)) if not project_clean_list: logger.info("There are no projects to clean after filtering") return logger.info("Final list is created with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list))) if not misc.query_yes_no("Proceed with cleanup ?", default="no"): logger.info("Aborting cleanup") return logger.info("Will start cleaning up project now") for proj, info in project_clean_list.iteritems(): fastq_info = info.get('fastq_to_remove') if fastq_info and isinstance(fastq_info, dict): logger.info("Cleaning fastq files for project {}".format(proj)) fastq_fc = fastq_info.get('flowcells', {}) removed_fc = [] for fc, fc_info in fastq_fc.iteritems(): proj_fc_root = fc_info['proj_root'] logger.info("Removing fastq files from {}".format(proj_fc_root)) if not dry_run: if _remove_files(fc_info['fq_files']): logger.info("Removed fastq files from FC {} for project {}, marking it as cleaned".format(fc, proj)) _touch_cleaned(proj_fc_root) removed_fc.append(fc) if len(fastq_fc) == len(removed_fc): 
try: proj_data_root = fastq_info['proj_data']['proj_data_root'] logger.info("All flowcells cleaned for this project, marking it as cleaned in {}".format(proj_data_root)) _touch_cleaned(proj_data_root) except: pass analysis_info = info.get('analysis_to_remove') if analysis_info and isinstance(analysis_info, dict): proj_analysis_root = analysis_info['proj_analysis_root'] logger.info("cleaning analysis data for project {}".format(proj)) removed_qc = [] for qc, files in analysis_info['analysis_files'].iteritems(): logger.info("Removing files of '{}' from {}".format(qc, proj_analysis_root)) if not dry_run: if _remove_files(files): removed_qc.append(qc) else: logger.warn("Couldn't remove some files in qc directory '{}'".format(qc)) map(analysis_info['analysis_files'].pop, removed_qc) if len(analysis_info['analysis_files']) == 0: logger.info("Removed analysis data for project {}, marking it cleaned".format(proj)) _touch_cleaned(proj_analysis_root)
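# _remove_files and _touch_cleaned are used throughout the cleanup functions but are not defined in
# this file. The sketches below follow only what their call sites imply (delete a list of paths and
# report success; drop a "cleaned" marker file that the directory listings above check for);
# everything else about them is an assumption.
import os


def _remove_files_sketch(files):
    """Try to delete every path in `files`; return True only if all deletions succeed (hypothetical)."""
    ok = True
    for f in files:
        try:
            os.remove(f)
        except OSError:
            ok = False
    return ok


def _touch_cleaned_sketch(directory):
    """Create an empty 'cleaned' marker file in `directory` (hypothetical)."""
    open(os.path.join(directory, "cleaned"), "a").close()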
def demultiplex_run(self): """ Demultiplex a HiSeq run: - find the samplesheet - make a local copy of the samplesheet and name it SampleSheet.csv - create multiple SampleSheets in case at least one lane have multiple indexes lengths - run bcl2fastq conversion """ #now geenrate the base masks per lane and decide how to demultiplex per_lane_base_masks = self._generate_per_lane_base_mask() max_different_base_masks = max([ len(per_lane_base_masks[base_masks]) for base_masks in per_lane_base_masks ]) #if max_different is one, then I have a simple config and I can run a single command. Otherwirse I need to run multiples instances #extract lanes with a single base masks simple_lanes = {} complex_lanes = {} for lane in per_lane_base_masks: if len(per_lane_base_masks[lane]) == 1: simple_lanes[lane] = per_lane_base_masks[lane] else: complex_lanes[lane] = per_lane_base_masks[lane] #simple lanes contains the lanes such that there is more than one base mask bcl2fastq_commands = [] bcl2fastq_command_num = 0 if len(simple_lanes) > 0: bcl2fastq_commands.append( self._generate_bcl2fastq_command(simple_lanes, True, bcl2fastq_command_num)) bcl2fastq_command_num += 1 #compute the different masks, there will be one bcl2fastq command per mask base_masks_complex = [ complex_lanes[base_masks].keys() for base_masks in complex_lanes ] different_masks = list( set([item for sublist in base_masks_complex for item in sublist])) for mask in different_masks: base_masks_complex_to_demux = {} for lane in complex_lanes: if complex_lanes[lane].has_key(mask): base_masks_complex_to_demux[lane] = {} base_masks_complex_to_demux[lane][mask] = complex_lanes[ lane][mask] #at this point base_masks_complex_to_demux contains only a base mask for lane. I can build the command bcl2fastq_commands.append( self._generate_bcl2fastq_command(base_masks_complex_to_demux, True, bcl2fastq_command_num)) bcl2fastq_command_num += 1 #now bcl2fastq_commands contains all command to be executed. They can be executed in parallel, however run only one per time in order to avoid to overload the machine with chdir(self.run_dir): # create Demultiplexing dir, in this way the status of this run will became IN_PROGRESS if not os.path.exists("Demultiplexing"): os.makedirs("Demultiplexing") execution = 0 for bcl2fastq_command in bcl2fastq_commands: misc.call_external_command_detached( bcl2fastq_command, with_log_files=True, prefix="demux_{}".format(execution)) execution += 1
def cleanup_irma(days_fastq, days_analysis, only_fastq, only_analysis, clean_undetermined, status_db_config, exclude_projects, list_only, date, dry_run=False): """Remove fastq/analysis data for projects that have been closed more than given days (as days_fastq/days_analysis) from the given 'irma' cluster :param int days_fastq: Days to consider to remove fastq files for project :param int days_analysis: Days to consider to remove analysis data for project :param bool only_fastq: Remove only fastq files for closed projects :param bool only_analysis: Remove only analysis data for closed projects :param bool dry_run: Will summarize what is going to be done without really doing it Example for mat for config file cleanup: irma: flowcell: ##this path is nothing but incoming directory, can given multiple paths root: - path/to/flowcells_dir relative_project_source: Demultiplexing undet_file_pattern: "Undetermined_*.fastq.gz" ##this is path where projects are organized data_dir: path/to/data_dir analysis: ##directory where analysis are perfoemed for projects root: path/to/analysis_dir #should be exactly same as the qc folder name and files wished to be removed files_to_remove: piper_ngi: - "*.bam" """ try: config = CONFIG['cleanup']['irma'] flowcell_dir_root = config['flowcell']['root'] flowcell_project_source = config['flowcell']['relative_project_source'] flowcell_undet_files = config['flowcell']['undet_file_pattern'] data_dir = config['data_dir'] analysis_dir = config['analysis']['root'] analysis_data_to_remove = config['analysis']['files_to_remove'] if date: date = datetime.strptime(date, '%Y-%m-%d') except KeyError as e: logger.error("Config file is missing the key {}, make sure it have all required information".format(str(e))) raise SystemExit except ValueError as e: logger.error("Date given with '--date' option is not in required format, see help for more info") raise SystemExit # make a connection for project db # pcon = statusdb.ProjectSummaryConnection(conf=status_db_config) assert pcon, "Could not connect to project database in StatusDB" # make exclude project list if provided exclude_list = [] if exclude_projects: if os.path.isfile(exclude_projects): with open(exclude_projects, 'r') as in_file: exclude_list.extend([p.strip() for p in in_file.readlines()]) else: exclude_list.extend(exclude_projects.split(',')) # sanity check for mentioned project to exculde or valid invalid_projects = filter(lambda p: p not in pcon.id_view.keys() and p not in pcon.name_view.keys(), exclude_list) if invalid_projects: logger.error("'--exclude_projects' was called with some invalid projects '{}', " "provide valid project name/id".format(",".join(invalid_projects))) raise SystemExit #compile list for project to delete project_clean_list, project_processed_list = ({}, []) if not list_only and not clean_undetermined: logger.info("Building initial project list for removing data..") if only_fastq: logger.info("Option 'only_fastq' is given, so will not look for analysis data") elif only_analysis: logger.info("Option 'only_analysis' is given, so will not look for fastq data") if clean_undetermined: all_undet_files = [] for flowcell_dir in flowcell_dir_root: for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE,d)]: fc_abs_path = os.path.join(flowcell_dir, fc) with filesystem.chdir(fc_abs_path): if not os.path.exists(flowcell_project_source): logger.warn("Flowcell {} do not contain a '{}' direcotry".format(fc, flowcell_project_source)) continue projects_in_fc = [d for d in 
                                      os.listdir(flowcell_project_source) \
                                      if re.match(r'^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$', d) and \
                                      not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))]
                    # The above check looks for project directories that are not yet cleaned.
                    # If no project is found, either there is no project directory at all or
                    # every project directory is already cleaned, so the undetermined files can be removed.
                    if len(projects_in_fc) > 0:
                        continue
                    fc_undet_files = glob(os.path.join(flowcell_project_source, flowcell_undet_files))
                    if fc_undet_files:
                        logger.info("All projects were cleaned for FC {}, found {} undetermined files".format(fc, len(fc_undet_files)))
                        all_undet_files.extend(map(os.path.abspath, fc_undet_files))
        if all_undet_files:
            undet_size = _def_get_size_unit(sum(map(os.path.getsize, all_undet_files)))
            if misc.query_yes_no("In total found {} undetermined files of size {}, delete now ?".format(len(all_undet_files), undet_size), default="no"):
                removed = _remove_files(all_undet_files)
        return
    elif only_analysis:
        for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and \
                    not os.path.exists(os.path.join(analysis_dir, d, "cleaned"))]:
            proj_abs_path = os.path.join(analysis_dir, pid)
            proj_info = get_closed_proj_info(pid, pcon.get_entry(pid, use_id_view=True), date)
            if proj_info and proj_info['closed_days'] >= days_analysis:
                # Move on if this project has to be excluded
                if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list:
                    continue
                analysis_data, analysis_size = collect_analysis_data_irma(pid, analysis_dir, analysis_data_to_remove)
                proj_info['analysis_to_remove'] = analysis_data
                proj_info['analysis_size'] = analysis_size
                proj_info['fastq_to_remove'] = "not_selected"
                proj_info['fastq_size'] = 0
                project_clean_list[proj_info['name']] = proj_info
    else:
        for flowcell_dir in flowcell_dir_root:
            for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d)]:
                fc_abs_path = os.path.join(flowcell_dir, fc)
                with filesystem.chdir(fc_abs_path):
                    if not os.path.exists(flowcell_project_source):
                        logger.warn("Flowcell {} does not contain a '{}' directory".format(fc, flowcell_project_source))
                        continue
                    projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
                                      if re.match(r'^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$', d) and \
                                      not os.path.exists(os.path.join(flowcell_project_source, d, "cleaned"))]
                    for _proj in projects_in_fc:
                        proj = re.sub(r'_+', '.', _proj, 1)
                        # If the project was already processed there is no need to fetch it again from StatusDB
                        if proj in project_processed_list:
                            # If the project is closed for more than the threshold days, collect the fastq files from this FC.
                            # No need to look for analysis data, as it was already collected the first time.
                            if proj in project_clean_list and project_clean_list[proj]['closed_days'] >= days_fastq:
                                fc_fq_files, fq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj))
                                project_clean_list[proj]['fastq_to_remove']['flowcells'][fc] = fc_fq_files['flowcells'][fc]
                                project_clean_list[proj]['fastq_size'] += fq_size
                            continue
                        project_processed_list.append(proj)
                        # By default assume the project is not old enough for deletion
                        fastq_data, analysis_data = ("young", "young")
                        fastq_size, analysis_size = (0, 0)
                        proj_info = get_closed_proj_info(proj, pcon.get_entry(proj), date)
                        if proj_info:
                            # Move on if this project has to be excluded
                            if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list:
                                continue
                            # Collect fastq files if the project is old enough; if only fastq files were
                            # selected and the project is too young, it is skipped further below
                            if proj_info['closed_days'] >= days_fastq:
                                fastq_data, fastq_size = collect_fastq_data_irma(fc_abs_path, os.path.join(flowcell_project_source, _proj),
                                                                                 data_dir, proj_info['pid'])
                            if not only_fastq:
                                # If the project is also old enough for analysis data, try to collect the analysis files
                                if proj_info['closed_days'] >= days_analysis:
                                    analysis_data, analysis_size = collect_analysis_data_irma(proj_info['pid'], analysis_dir, analysis_data_to_remove)
                                # If neither fastq nor analysis files are old enough, move on
                                if (analysis_data == fastq_data) or ((not analysis_data or analysis_data == "cleaned") and fastq_data == "young"):
                                    continue
                            elif fastq_data == "young":
                                continue
                            else:
                                analysis_data = "not_selected"
                            proj_info['fastq_to_remove'] = fastq_data
                            proj_info['fastq_size'] = fastq_size
                            proj_info['analysis_to_remove'] = analysis_data
                            proj_info['analysis_size'] = analysis_size
                            project_clean_list[proj] = proj_info

    if not project_clean_list:
        logger.info("There are no projects to clean")
        return

    # List only the projects and exit if the 'list_only' option is selected
    if list_only:
        print "Project Name\tProject ID\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size"
        for p_info in sorted(project_clean_list.values(), key=lambda d: d['closed_days'], reverse=True):
            print "\t".join([p_info['name'], p_info['pid'], p_info['bioinfo_responsible'], str(p_info['closed_days']),
                             p_info['closed_date'], _def_get_size_unit(p_info['fastq_size']), _def_get_size_unit(p_info['analysis_size'])])
        raise SystemExit

    logger.info("Initial list is built with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
    if misc.query_yes_no("Interactively filter projects for cleanup ?", default="yes"):
        filtered_project, proj_count = ([], 0)
        # Go through the compiled project list and decide per project whether to delete its files
        for proj, info in project_clean_list.iteritems():
            proj_count += 1
            if not misc.query_yes_no("{}Delete files for this project ({}/{})".format(get_proj_meta_info(info, days_fastq),
                                                                                      proj_count, len(project_clean_list)), default="no"):
                logger.info("Will not remove files for project {}".format(proj))
                filtered_project.append(proj)
        # Remove the projects that were chosen to be kept
        map(project_clean_list.pop, filtered_project)
        logger.info("Removed {}/{} projects from initial list".format(len(filtered_project), proj_count))
        if not project_clean_list:
            logger.info("There are no projects to clean after filtering")
            return

    logger.info("Final list is created with {} projects {}".format(len(project_clean_list), get_files_size_text(project_clean_list)))
    if not misc.query_yes_no("Proceed with cleanup ?", default="no"):
        logger.info("Aborting cleanup")
        return
    logger.info("Will start cleaning up projects now")

    for proj, info in project_clean_list.iteritems():
        fastq_info = info.get('fastq_to_remove')
        if fastq_info and isinstance(fastq_info, dict):
            logger.info("Cleaning fastq files for project {}".format(proj))
            fastq_fc = fastq_info.get('flowcells', {})
            removed_fc = []
            for fc, fc_info in fastq_fc.iteritems():
                proj_fc_root = fc_info['proj_root']
                logger.info("Removing fastq files from {}".format(proj_fc_root))
                if not dry_run:
                    if _remove_files(fc_info['fq_files']):
                        logger.info("Removed fastq files from FC {} for project {}, marking it as cleaned".format(fc, proj))
                        _touch_cleaned(proj_fc_root)
                        removed_fc.append(fc)
            if len(fastq_fc) == len(removed_fc):
                try:
                    proj_data_root = fastq_info['proj_data']['proj_data_root']
                    logger.info("All flowcells cleaned for this project, marking it as cleaned in {}".format(proj_data_root))
                    _touch_cleaned(proj_data_root)
                except Exception:
                    pass
        analysis_info = info.get('analysis_to_remove')
        if analysis_info and isinstance(analysis_info, dict):
            proj_analysis_root = analysis_info['proj_analysis_root']
            logger.info("Cleaning analysis data for project {}".format(proj))
            removed_qc = []
            for qc, files in analysis_info['analysis_files'].iteritems():
                logger.info("Removing files of '{}' from {}".format(qc, proj_analysis_root))
                if not dry_run:
                    if _remove_files(files):
                        removed_qc.append(qc)
                    else:
                        logger.warn("Couldn't remove some files in qc directory '{}'".format(qc))
            map(analysis_info['analysis_files'].pop, removed_qc)
            if len(analysis_info['analysis_files']) == 0:
                logger.info("Removed analysis data for project {}, marking it cleaned".format(proj))
                _touch_cleaned(proj_analysis_root)
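
The cleanup routine above leans on two small helpers that are not part of this excerpt, _touch_cleaned and _def_get_size_unit. The sketch below only illustrates the behaviour assumed by the calls above (drop a "cleaned" marker file, and format a byte count for display); the real implementations may differ.

import os

def _touch_cleaned(path):
    # Assumed behaviour: create an empty 'cleaned' marker file so the
    # directory is skipped by later cleanup rounds.
    open(os.path.join(path, "cleaned"), 'w').close()

def _def_get_size_unit(size_in_bytes):
    # Assumed behaviour: convert a byte count into a human-readable string.
    size = float(size_in_bytes)
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if size < 1024 or unit == "TB":
            return "{:.2f} {}".format(size, unit)
        size /= 1024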
def encrypt_runs(cls, run, force):
    """Encrypt the runs that have been collected"""
    bk = cls(run)
    bk.collect_runs(ext=".tar.gz")
    logger.info("In total, found {} run(s) to be encrypted".format(len(bk.runs)))
    for run in bk.runs:
        run.flag = "{}.encrypting".format(run.name)
        run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted)
        tmp_files = [run.zip_encrypted, run.key_encrypted, run.key, run.flag]
        logger.info("Encryption of run {} is now started".format(run.name))
        # Check if there is enough space and exit if not
        bk.avail_disk_space(run.path, run.name)
        # Check if the run is demultiplexed
        if not force and bk.check_demux:
            if not misc.run_is_demuxed(run.name, bk.couch_info):
                logger.warn("Run {} is not demultiplexed yet, so skipping it".format(run.name))
                continue
            logger.info("Run {} is demultiplexed and proceeding with encryption".format(run.name))
        with filesystem.chdir(run.path):
            # Skip the run if encryption is already ongoing
            if os.path.exists(run.flag):
                logger.warn("Run {} is already being encrypted, so skipping now".format(run.name))
                continue
            open(run.flag, 'w').close()
            # Zip the run directory
            if os.path.exists(run.zip):
                if os.path.isdir(run.name):
                    logger.warn("Both run source and zipped archive exist for run {}, skipping run as precaution".format(run.name))
                    bk._clean_tmp_files([run.flag])
                    continue
                logger.info("Zipped archive already exists for run {}, so using it for encryption".format(run.name))
            else:
                logger.info("Creating zipped archive for run {}".format(run.name))
                if bk._call_commands(cmd1="tar -cf - {}".format(run.name), cmd2="pigz --fast -c -", out_file=run.zip,
                                     mail_failed=True, tmp_files=[run.zip, run.flag]):
                    logger.info("Run {} was successfully compressed, so removing the run source directory".format(run.name))
                    shutil.rmtree(run.name)
                else:
                    logger.warn("Skipping run {} and moving on".format(run.name))
                    continue
            # Remove the encrypted file if it already exists
            if os.path.exists(run.zip_encrypted):
                logger.warn(("Removing already existing encrypted file for run {}, this is a precaution "
                             "to make sure the file was encrypted with correct key file".format(run.name)))
                bk._clean_tmp_files([run.zip_encrypted, run.key, run.key_encrypted, run.dst_key_encrypted])
            # Generate a random key to use as passphrase
            if not bk._call_commands(cmd1="gpg --gen-random 1 256", out_file=run.key, tmp_files=tmp_files):
                logger.warn("Skipping run {} and moving on".format(run.name))
                continue
            logger.info("Generated random passphrase key for run {}".format(run.name))
            # Calculate md5 sum before encryption
            if not force:
                logger.info("Calculating md5sum before encryption")
                md5_call, md5_out = bk._call_commands(cmd1="md5sum {}".format(run.zip), return_out=True, tmp_files=tmp_files)
                if not md5_call:
                    logger.warn("Skipping run {} and moving on".format(run.name))
                    continue
                md5_pre_encrypt = md5_out.split()[0]
            # Encrypt the zipped run file
            logger.info("Encrypting the zipped run file")
            if not bk._call_commands(cmd1=("gpg --symmetric --cipher-algo aes256 --passphrase-file {} --batch --compress-algo "
                                           "none -o {} {}".format(run.key, run.zip_encrypted, run.zip)), tmp_files=tmp_files):
                logger.warn("Skipping run {} and moving on".format(run.name))
                continue
            # Decrypt and check the md5 sum
            if not force:
                logger.info("Calculating md5sum after encryption")
                md5_call, md5_out = bk._call_commands(cmd1="gpg --decrypt --cipher-algo aes256 --passphrase-file {} --batch {}".format(run.key, run.zip_encrypted),
                                                      cmd2="md5sum", return_out=True, tmp_files=tmp_files)
                if not md5_call:
                    logger.warn("Skipping run {} and moving on".format(run.name))
                    continue
                md5_post_encrypt = md5_out.split()[0]
                if md5_pre_encrypt != md5_post_encrypt:
                    logger.error(("md5sum did not match before ({}) and after ({}) encryption. Will remove temp files and "
                                  "move on".format(md5_pre_encrypt, md5_post_encrypt)))
                    bk._clean_tmp_files(tmp_files)
                    continue
                logger.info("Md5sum is matching before and after encryption")
            # Encrypt and move the key file
            if bk._call_commands(cmd1="gpg -e -r {} -o {} {}".format(bk.gpg_receiver, run.key_encrypted, run.key), tmp_files=tmp_files):
                shutil.move(run.key_encrypted, run.dst_key_encrypted)
            else:
                logger.error("Encryption of key file failed, skipping run")
                continue
            bk._clean_tmp_files([run.zip, run.key, run.flag])
            logger.info("Encryption of run {} is successfully done, removing zipped run file".format(run.name))
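
Whenever a step fails, encrypt_runs falls back to bk._clean_tmp_files() to remove partial outputs before skipping the run. That helper is not included in this excerpt; the stand-alone sketch below only captures the assumed behaviour (best-effort deletion of files that exist), and the actual instance method may do more.

import os

def _clean_tmp_files(files):
    # Assumed behaviour: remove temporary or partial files if present,
    # silently skipping any path that was never created.
    for fl in files:
        if os.path.exists(fl):
            os.remove(fl)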
def demultiplex_run(self):
    """Demultiplex a run:
        - Make sub-samplesheets based on sample classes
        - Decide correct bcl2fastq command parameters based on sample classes
        - Run bcl2fastq conversion
    """
    # Check sample types
    sample_type_list = []
    for lane, lane_contents in self.sample_table.items():
        for sample in lane_contents:
            sample_detail = sample[1]
            sample_type = sample_detail['sample_type']
            if sample_type not in sample_type_list:
                sample_type_list.append(sample_type)

    # Go through sample_table for demultiplexing
    bcl2fastq_cmd_counter = 0
    for sample_type in sorted(sample_type_list):
        # Look for lanes with multiple masks under the same sample type
        lane_table = dict()
        for lane, lane_contents in self.sample_table.items():
            for sample in lane_contents:
                sample_detail = sample[1]
                sample_type_t = sample_detail['sample_type']
                sample_index_length = sample_detail['index_length']
                if sample_type_t == sample_type:
                    if lane_table.get(lane):
                        if sample_index_length not in lane_table[lane]:
                            lane_table[lane].append(sample_index_length)
                    else:
                        lane_table.update({lane: [sample_index_length]})
        # Determine the number of demux rounds needed for the same sample type
        demux_number_with_the_same_sample_type = len(max([v for k, v in lane_table.items()], key=len))
        # Prepare sub-samplesheets, masks and commands
        for i in range(0, demux_number_with_the_same_sample_type):
            # Prepare sub-samplesheet
            # A dictionary with lane and sample IDs to include
            samples_to_include = dict()
            # A dictionary with lane and index length for generating masks
            mask_table = dict()
            for lane, lane_contents in self.sample_table.items():
                try:
                    index_length = lane_table[lane][i]
                    mask_table.update({lane: index_length})
                    for sample in lane_contents:
                        sample_name = sample[0]
                        sample_detail = sample[1]
                        sample_type_t = sample_detail['sample_type']
                        sample_index_length = sample_detail['index_length']
                        if sample_type_t == sample_type and sample_index_length == index_length:
                            if samples_to_include.get(lane):
                                samples_to_include[lane].append(sample_name)
                            else:
                                samples_to_include.update({lane: [sample_name]})
                except (KeyError, IndexError) as err:
                    logger.info('No corresponding mask in lane {}. Skip it.'.format(lane))
                    continue

            # Make sub-samplesheet
            with chdir(self.run_dir):
                samplesheet_dest = 'SampleSheet_{}.csv'.format(bcl2fastq_cmd_counter)
                with open(samplesheet_dest, 'w') as fcd:
                    fcd.write(_generate_samplesheet_subset(self.runParserObj.samplesheet, samples_to_include))

            # Prepare demultiplexing dir
            with chdir(self.run_dir):
                # Create Demultiplexing dir, this changes the status to IN_PROGRESS
                if not os.path.exists('Demultiplexing'):
                    os.makedirs('Demultiplexing')

            # Prepare and start the demultiplexing command
            with chdir(self.run_dir):
                cmd = self.generate_bcl_command(sample_type, mask_table, bcl2fastq_cmd_counter)
                misc.call_external_command_detached(cmd, with_log_files=True, prefix='demux_{}'.format(bcl2fastq_cmd_counter))
                logger.info('BCL to FASTQ conversion and demultiplexing '
                            'started for run {} on {}'.format(os.path.basename(self.id), datetime.now()))

            # Demultiplexing is started for one mask of this sample type; the script continues
            # with the next mask/type, so the command counter increases by 1
            bcl2fastq_cmd_counter += 1
    return True
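
The sub-samplesheet step relies on _generate_samplesheet_subset(), which operates on the flowcell parser's samplesheet object and is not shown in this excerpt. Purely to illustrate the filtering idea, the sketch below works on plain dict rows; the column names and the function name samplesheet_subset_sketch are assumptions, not the real samplesheet schema or API.

def samplesheet_subset_sketch(rows, samples_to_include):
    # Illustrative stand-in: keep only rows whose Sample_ID is listed for
    # that row's lane in samples_to_include ({lane: [sample names]}).
    columns = ["Lane", "Sample_ID", "index"]
    lines = [",".join(columns)]
    for row in rows:
        if row["Sample_ID"] in samples_to_include.get(row["Lane"], []):
            lines.append(",".join(str(row[c]) for c in columns))
    return "\n".join(lines) + "\n"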
def test_chdir(self):
    """Ensure start dir and end dir are the same after the chdir context exits."""
    initial_dir = os.getcwd()
    with filesystem.chdir(self.rootdir):
        pass
    final_dir = os.getcwd()
    self.assertEqual(initial_dir, final_dir)
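
For reference, the filesystem.chdir context manager exercised by this test is presumably a thin wrapper around os.chdir that always restores the original working directory on exit. A minimal sketch of that pattern, not necessarily the project's actual implementation, looks like this:

import contextlib
import os

@contextlib.contextmanager
def chdir(new_dir):
    # Change into new_dir for the duration of the with-block and restore the
    # previous working directory afterwards, even if an exception is raised.
    cur_dir = os.getcwd()
    os.chdir(new_dir)
    try:
        yield
    finally:
        os.chdir(cur_dir)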