def find_fastq_parent_dir(sequencer_output_path):
    """
    Locates the directory that holds the .fastq.gz files for a sequencing run.

    A search limited with ``level_limit = 2`` is run on ``sequencer_output_path``
    first; if it finds a match, ``sequencer_output_path`` itself is returned.
    Otherwise the search is repeated without a ``level_limit`` and the parent
    directory of the first match is returned.

    Parameters
    ----------
    sequencer_output_path: str
        path to directory to search

    Returns
    -------
    str
        path to the directory with .fastq files to use for the ``sns`` analysis

    Raises
    ------
    _e.AnalysisFileMissing
        if neither search finds a .fastq.gz file

    Notes
    -----
    Files matching ``*Undetermined*`` are excluded from both searches
    """
    # arguments shared by the shallow and the deep search
    common_args = dict(
        search_dir = sequencer_output_path,
        inclusion_patterns = ('*.fastq.gz',),
        exclusion_patterns = ('*Undetermined*',),
        search_type = 'file',
        num_limit = 1,
        match_mode = "any",
    )

    # first pass: only look near the top of the directory tree
    shallow_hits = find.find(level_limit = 2, **common_args)
    if shallow_hits:
        logger.debug('Found .fastq files near the top level of the sequencer_output_path, returning sequencer_output_path: {0}'.format(sequencer_output_path))
        return sequencer_output_path

    # second pass: no level limit, so this might take a while
    logger.debug('.fastq files were not found near the top level of the sequencer_output_path, searching deeper...')
    deep_hits = find.find(**common_args)
    if not deep_hits:
        raise _e.AnalysisFileMissing(message = 'sequencer_output_path does not contain .fastq.gz files: {0}'.format(sequencer_output_path), errors = '')

    fastq_dir = os.path.dirname(deep_hits[0])
    logger.debug('Found .fastq files in directory: {0}\nThis directory will be used for the sns analysis'.format(fastq_dir))
    return fastq_dir
def find_available_NextSeq_runs(sequencer_dir):
    """
    Builds a ``NextSeqRun`` for every directory found in ``sequencer_dir``
    and returns the ones that pass validation.

    Directories are located with a call equivalent to::

        find.find(search_dir = sequencer_dir, search_type = 'dir', level_limit = 0)

    e.g. with ``sequencer_dir = "/ifs/data/molecpathlab/quicksilver"``, minus
    the excluded test and debug directory names.

    Parameters
    ----------
    sequencer_dir: str
        path to the directory holding the sequencer run directories

    Returns
    -------
    list
        the ``NextSeqRun`` objects for which ``validate()`` returned a true value
    """
    # directory name patterns for test and debug dirs that the monitoring
    # program should not pick up
    skip_patterns = [
        "to_be_demultiplexed",
        "automatic_demultiplexing_logs",
        "ArcherRun",
        "run_index",
        "*_test*",
        "*_run_before_sequencing_done*",
    ]

    # key each directory by its basename, which is used as the run ID
    run_paths_by_id = {
        os.path.basename(run_path): run_path
        for run_path in find.find(search_dir=sequencer_dir,
                                  exclusion_patterns=skip_patterns,
                                  search_type='dir',
                                  level_limit=0)
    }

    # create every run object first; each one gets its own list of handlers
    candidate_runs = []
    for run_id in run_paths_by_id:
        candidate_runs.append(
            NextSeqRun(id=run_id,
                       config=configs,
                       extra_handlers=list(log.get_all_handlers(logger=logger))))

    # then keep only the runs that validate
    return [run for run in candidate_runs if run.validate()]
def get_output_files(self, analysis_step, pattern):
    """
    Finds the sample's files in the output directory of one analysis step.

    The directory is looked up in ``self.analysis_config['dirs']`` and searched
    for files using both ``pattern`` and ``self.search_pattern`` as inclusion
    patterns with ``match_mode='all'``.

    Parameters
    ----------
    analysis_step: str
        the name of a directory in the analysis output from which to search for a sample's output file
    pattern: str
        a filename pattern to use when searching for the file

    Returns
    -------
    list
        a list of files for the sample from an analysis step

    Raises
    ------
    AnalysisItemMissing
        if no directory is resolved for ``analysis_step``
    """
    # resolve the directory registered for this step; falsy when none is set
    step_dir = self.list_none(self.analysis_config['dirs'][analysis_step])
    required_patterns = [pattern, self.search_pattern]

    if not step_dir:
        raise AnalysisItemMissing(
            message="search_dir not found for {0}, dir: {1}".format(
                analysis_step, step_dir),
            errors='')

    return find.find(search_dir=step_dir,
                     inclusion_patterns=required_patterns,
                     search_type='file',
                     match_mode='all')
def get_qsub_logfiles(self, logdir=None):
    """
    Collects the files found in the analysis' qsub log directory.

    Parameters
    ----------
    logdir: str
        the path to the qsub log directory. If ``None`` (or otherwise falsy), the
        ``logs-qsub`` entry from ``self.get_dirs`` is used instead

    Returns
    -------
    list
        a list of file paths to qsub logs

    Raises
    ------
    AnalysisItemMissing
        if no log directory was passed and none could be resolved from the analysis
    """
    # fall back to the directory registered on the analysis itself
    if not logdir:
        logdir = self.list_none(self.get_dirs('logs-qsub'))

    if not logdir:
        raise AnalysisItemMissing(
            message='Qsub log dir not found for the analysis', errors='')

    # every file found under the log directory counts as a log file
    return list(find.find(logdir, search_type='file'))
def demo():
    """
    Runs a few example ``find.find`` searches on the current directory and
    logs the logger's "main" handler.
    """
    find.find(search_dir='.', inclusion_patterns='*.py', num_limit=3)

    main_handler = log.get_logger_handler(logger=logger, handler_name="main")
    logger.debug("Here is the file handler: {0}".format(main_handler))

    # NOTE(review): these calls pass ``pattern=`` while the first call uses
    # ``inclusion_patterns=`` -- confirm ``find.find`` accepts both keywords
    example_searches = (
        {'pattern': '*.py'},
        {'pattern': 't*', 'level_limit': 1},
        {'pattern': 't*', 'search_type': 'file', 'level_limit': 2},
    )
    for search_options in example_searches:
        find.find(search_dir='.', **search_options)
def _init_files(self):
    """
    Registers the files whose names are not consistent between analyses.

    Currently this is only the targets .bed file with the chromosome target
    regions, stored under the name ``targets_bed``. ``*.pad10.bed`` files are
    excluded from the search.
    """
    # the search result is stored as returned by ``find.find``
    targets_bed_matches = find.find(search_dir=self.dir,
                                    inclusion_patterns="*.bed",
                                    exclusion_patterns='*.pad10.bed',
                                    search_type='file',
                                    num_limit=1,
                                    level_limit=0)
    self.set_file(name='targets_bed', path=targets_bed_matches)
def find_completed_NGS580_runs(analysis_output_dir):
    """
    Finds the NGS580 runs that have been done already.

    Parameters
    ----------
    analysis_output_dir: str
        path to the directory to search for run directories

    Returns
    -------
    dict
        maps each found directory's basename to its path; directories matching
        ``targets`` are excluded
    """
    completed_dirs = find.find(search_dir=analysis_output_dir,
                               exclusion_patterns=["targets"],
                               search_type='dir',
                               level_limit=0)
    return {os.path.basename(run_dir): run_dir for run_dir in completed_dirs}
def find_samplesheets():
    """
    Searches the module-level ``samplesheet_source_dir`` for samplesheet files,
    representing potential runs to check for demultiplexing.

    Example match:
    /ifs/data/molecpathlab/quicksilver/to_be_demultiplexed/NGS580/170519_NB501073_0010_AHCLLMBGX2-SampleSheet.csv

    Returns
    -------
    list
        paths of the files matching ``*-SampleSheet.csv``
    """
    found_sheets = list(find.find(search_dir=samplesheet_source_dir,
                                  inclusion_patterns="*-SampleSheet.csv",
                                  search_type='file',
                                  level_limit=1))
    logger.debug("Samplesheets found: {0}".format(found_sheets))
    return found_sheets
def _init_dirs(self):
    """
    Registers a directory path for each entry in ``self.analysis_output_index``.

    Every key except ``_parent`` is used as a directory name pattern to search
    for in ``self.dir``, and the search result is stored with ``self.set_dir``
    under that key.

    Todo
    ----
    This is obtaining configs from the local config file; don't use these configs anymore,
    don't use these attributes, need to remove them. When using this module with ``snsxt``,
    the tasks should instead get the files explicitly from the tasks' ``input_dir``
    """
    for step_name, _ in self.analysis_output_index.items():
        # '_parent' is not an output step directory
        if step_name in ['_parent']:
            continue
        step_dir_matches = find.find(search_dir=self.dir,
                                     inclusion_patterns=step_name,
                                     search_type="dir",
                                     num_limit=1,
                                     level_limit=0)
        self.set_dir(name=step_name, path=step_dir_matches)
def fastq_present(search_dir):
    """
    Reports whether a directory contains any '.fastq.gz' file.

    Parameters
    ----------
    search_dir: str
        path to directory to search

    Returns
    -------
    bool
        ``True`` if the search returned a non-empty result, otherwise ``False``
    """
    # a single match is enough to answer the question
    return bool(find.find(search_dir = search_dir,
                          inclusion_patterns = ('*.fastq.gz',),
                          search_type = 'file',
                          num_limit = 1))
def search_for_samples_pairs_sheet(self, id, search_dir, sheet_pattern):
    """
    Searches for a tumor-normal samples pairs samplesheet to match the current sequencing run.

    Files are searched for with two inclusion patterns, ``<id>*`` and
    ``sheet_pattern``, using ``match_mode='all'``.

    Example arguments::

        search_dir = '/ifs/data/molecpathlab/quicksilver/to_be_demultiplexed/NGS580'
        id = '170824_NB501073_0020_AHHK37BGX3'
        sheet_pattern = '*-samples.pairs.csv'

    Parameters
    ----------
    id: str
        the run ID that the samplesheet file name must start with
    search_dir: str
        path to the directory to search
    sheet_pattern: str
        filename pattern for samples pairs sheets

    Returns
    -------
    list
        the search result from ``find.find``
    """
    run_prefix_pattern = str(id) + '*'
    pairs_sheets = find.find(search_dir=search_dir,
                             search_type='file',
                             inclusion_patterns=[run_prefix_pattern, sheet_pattern],
                             level_limit=0,
                             match_mode='all')
    self.logger.debug(
        'Found tumor-normal samplesheet for run {0}: {1}'.format(
            id, pairs_sheets))
    return pairs_sheets
def copy_sequencer_files(analysis_dir, sequencer_output_path, other_files = None):
    """
    Copies the files named in ``settings.sequencer_files`` from the
    ``sequencer_output_path`` directory into ``analysis_dir``.

    ``settings.sequencer_files`` is split on commas; for each entry the first
    matching file found in ``sequencer_output_path`` is copied. Entries with no
    match are skipped. Files are copied under their own basename.

    Parameters
    ----------
    analysis_dir: str
        path to the directory to copy analysis files to
    sequencer_output_path: str
        path to the directory containing the sequencer output to search in
    other_files: list
        a list of other files to copy over to the analysis_dir, or ``None``
    """
    # file basenames to look for, as configured in the settings
    wanted_names = settings.sequencer_files.split(',')

    # locate each wanted file in the sequencer output
    files_to_copy = []
    for wanted_name in wanted_names:
        logger.debug(wanted_name)
        hits = find.find(search_dir = sequencer_output_path,
                         inclusion_patterns = (wanted_name,),
                         search_type = 'file',
                         num_limit = 1,
                         match_mode = "any")
        logger.debug(hits)
        if hits:
            files_to_copy.append(hits[0])

    # add any extra files the caller asked for
    if other_files:
        files_to_copy.extend(other_files)

    # copy everything into the analysis directory
    for source_path in files_to_copy:
        destination = os.path.join(analysis_dir, os.path.basename(source_path))
        logger.debug('Copying file from:\n{0}\nto:\n{1}'.format(source_path, destination))
        shutil.copy2(source_path, destination)
""" import os import sys import csv from util import samplesheet from util import find os.environ.setdefault("DJANGO_SETTINGS_MODULE", "tuco.settings") import django django.setup() from lims.models import SequencingSampleSheet sheets_dir = os.path.realpath(sys.argv[1]) # find all the samplesheets in the dir for sheet_file in find.find(search_dir = sheets_dir, inclusion_patterns = ['SampleSheet.csv']): sheet = samplesheet.IEMFile(path = os.path.realpath(sheet_file)) seqtype_file = os.path.join(os.path.dirname(sheet_file), 'seqtype.txt') Run_ID = os.path.basename(os.path.dirname(sheet_file)) # load seqtype if os.path.exists(seqtype_file): with open(seqtype_file) as f: lines = f.readlines() Seq_Type = lines[0].strip() # get the samplesheet entry from database sheet_instance = SequencingSampleSheet.objects.get(md5 = sheet.meta['Sheet_md5']) # check if it already has seq_type.. if not sheet_instance.seq_type or sheet_instance.seq_type == '' and Seq_Type != '':
... """ import os import sys import datetime from util import find os.environ.setdefault("DJANGO_SETTINGS_MODULE", "tuco.settings") import django django.setup() from lims.models import SequencingRun seq_dir = os.path.realpath(sys.argv[1]) for run_dir in find.find(search_dir=seq_dir, search_type='dir', exclusion_patterns=['.*'], level_limit=0): Run_ID = os.path.basename(run_dir) Run_path = os.path.realpath(run_dir) seqtype_file = os.path.join(Run_path, 'seqtype.txt') parts = Run_ID.split('_') Date = '' Sequencer_Serial = '' Run_Num = '' Flowcell_ID = '' # check if Run_ID can be parsed if len(parts) == 4: # ['180711', 'NB501073', '0057', 'AHFLL2BGX7'] Date = datetime.datetime.strptime(parts[0], '%y%m%d') Sequencer_Serial = parts[1] Run_Num = parts[2]
# get the date for the run from the first 6 digits of the runID runDate = datetime.datetime.strptime(config['runID'][:6], '%y%m%d').strftime('%Y-%m-%d') # read list of sampleIDs from the deliverables sheet sampleIDs = [] with open(deliverables_sheet) as f: for line in f: sampleIDs.append(line.strip()) # find all matching .fastq.gz files in the output locations deliverableFiles = [] for sampleID in sampleIDs: for item in find.find( search_dir=outputDir, inclusion_patterns=['{0}*.fastq.gz'.format(sampleID)]): deliverableFiles.append(item) deliverableFiles = list(set(deliverableFiles)) # set up deliverables directories deliverableSubdir = os.path.join(deliverableDir, deliverableID, runDate, 'fastq') try: os.makedirs(deliverableSubdir) except OSError: if not os.path.isdir(deliverableSubdir): raise # symlink the files to the deliverables dir for item in deliverableFiles: