Exemplo n.º 1
0
def find_fastq_parent_dir(sequencer_output_path):
    """
    Locates the directory holding the .fastq.gz files for a sequencer output.

    A shallow search of ``sequencer_output_path`` (``level_limit = 2``) is tried first; when it finds a .fastq.gz file, ``sequencer_output_path`` itself is returned. Otherwise the search is repeated without a level limit and the directory containing the first match is returned.

    Parameters
    ----------
    sequencer_output_path: str
        path to directory to search

    Returns
    -------
    str
        ``sequencer_output_path`` if the shallow search finds a file, otherwise the parent directory of the first file found by the unrestricted search; this is the directory to use for the ``sns`` analysis

    Raises
    ------
    _e.AnalysisFileMissing
        if neither search finds a .fastq.gz file

    Notes
    -----
    Files matching '*Undetermined*' are excluded from both searches
    """
    # keyword arguments shared by the shallow and the deep search
    search_args = dict(search_dir = sequencer_output_path,
                       inclusion_patterns = ('*.fastq.gz',),
                       exclusion_patterns = ('*Undetermined*',),
                       search_type = 'file',
                       num_limit = 1,
                       match_mode = "any")

    # first pass: only look near the top of the tree, which is cheap
    shallow_hits = find.find(level_limit = 2, **search_args)
    if len(shallow_hits) > 0:
        logger.debug('Found .fastq files near the top level of the sequencer_output_path, returning sequencer_output_path: {0}'.format(sequencer_output_path))
        return sequencer_output_path

    # second pass: no level limit; this might take a while
    logger.debug('.fastq files were not found near the top level of the sequencer_output_path, searching deeper...')
    deep_hits = find.find(**search_args)
    if len(deep_hits) == 0:
        raise _e.AnalysisFileMissing(message = 'sequencer_output_path does not contain .fastq.gz files: {0}'.format(sequencer_output_path), errors = '')

    fastq_dir = os.path.dirname(deep_hits[0])
    logger.debug('Found .fastq files in directory: {0}\nThis directory will be used for the sns analysis'.format(fastq_dir))
    return fastq_dir
def find_available_NextSeq_runs(sequencer_dir):
    '''
    Build NextSeqRun objects for the directories found in sequencer_dir and
    keep the ones that validate

    ex:
    sequencer_dir = "/ifs/data/molecpathlab/quicksilver"
    import find
    find.find(search_dir = sequencer_dir, search_type = 'dir', level_limit = 0)

    return a list of the NextSeqRun objects whose validate() result is truthy
    '''
    # directory name patterns for test and debug dirs that the monitoring program should not pick up
    skip_patterns = [
        "to_be_demultiplexed", "automatic_demultiplexing_logs", "ArcherRun",
        "run_index", "*_test*", "*_run_before_sequencing_done*"
    ]
    found_dirs = find.find(search_dir=sequencer_dir,
                           exclusion_patterns=skip_patterns,
                           search_type='dir',
                           level_limit=0)

    # key each directory by its basename; the basename is used as the run id
    sequencer_dirs = {
        os.path.basename(dir_path): dir_path
        for dir_path in found_dirs
    }

    # build every run object first; each one gets its own list of handlers
    candidate_runs = []
    for run_id in sequencer_dirs:
        handlers = list(log.get_all_handlers(logger=logger))
        candidate_runs.append(
            NextSeqRun(id=run_id, config=configs, extra_handlers=handlers))

    # only runs that pass their own validation are reported
    return [run for run in candidate_runs if run.validate()]
Exemplo n.º 3
0
    def get_output_files(self, analysis_step, pattern):
        """
        Searches the directory registered for an analysis step for this sample's output files

        The search passes both ``pattern`` and the sample's ``search_pattern`` attribute as inclusion patterns, with ``match_mode='all'``

        Parameters
        ----------
        analysis_step: str
            the key under ``self.analysis_config['dirs']`` naming the analysis output directory to search
        pattern: str
            a filename pattern to use when searching for the file

        Returns
        -------
        list
            the result of the ``find.find`` file search for the analysis step

        Raises
        ------
        AnalysisItemMissing
            if no usable directory is resolved for ``analysis_step``
        """
        # resolve the step's directory entry; list_none is expected to give a falsy value when none is set
        step_dir = self.list_none(
            self.analysis_config['dirs'][analysis_step])
        inclusion = [pattern, self.search_pattern]

        # without a directory there is nothing to search
        if not step_dir:
            raise AnalysisItemMissing(
                message="search_dir not found for {0}, dir: {1}".format(
                    analysis_step, step_dir),
                errors='')

        return find.find(search_dir=step_dir,
                         inclusion_patterns=inclusion,
                         search_type='file',
                         match_mode='all')
Exemplo n.º 4
0
    def get_qsub_logfiles(self, logdir=None):
        """
        Lists the files found in the analysis' qsub log directory

        Parameters
        ----------
        logdir: str
            the path to the qsub log directory. If falsy (e.g. ``None``), the directory is looked up with ``self.get_dirs('logs-qsub')`` instead

        Returns
        -------
        list
            a list of file paths to qsub logs

        Raises
        ------
        AnalysisItemMissing
            if no log directory was passed and none could be looked up

        """
        # fall back to the directory registered on the analysis when none was passed
        qsub_dir = logdir if logdir else self.list_none(
            self.get_dirs('logs-qsub'))
        if not qsub_dir:
            raise AnalysisItemMissing(
                message='Qsub log dir not found for the analysis', errors='')

        # every file found under the log directory counts as a log file
        return list(find.find(qsub_dir, search_type='file'))
Exemplo n.º 5
0
def demo():
    '''
    Demo some functions of the program

    Runs a few sample find.find searches on the current directory and logs
    the logger's "main" handler at debug level
    '''
    find.find(search_dir='.', inclusion_patterns='*.py', num_limit=3)

    main_handler = log.get_logger_handler(logger=logger, handler_name="main")
    logger.debug("Here is the file handler: {0}".format(main_handler))

    # NOTE(review): these searches pass `pattern=` while the first call uses
    # `inclusion_patterns=` -- confirm that find.find accepts both keywords
    remaining_searches = (
        {'pattern': '*.py'},
        {'pattern': 't*', 'level_limit': 1},
        {'pattern': 't*', 'search_type': 'file', 'level_limit': 2},
    )
    for search_kwargs in remaining_searches:
        find.find(search_dir='.', **search_kwargs)
Exemplo n.º 6
0
    def _init_files(self):
        """
        Registers the paths to files whose names are not consistent between runs

        Currently this is only the targets .bed file with the chromosome target regions: a '*.bed' file searched for in ``self.dir`` with ``level_limit=0``, excluding '*.pad10.bed'. The search result is stored under the name 'targets_bed' exactly as ``find.find`` returns it
        """
        targets_bed_matches = find.find(search_dir=self.dir,
                                        inclusion_patterns="*.bed",
                                        exclusion_patterns='*.pad10.bed',
                                        search_type='file',
                                        num_limit=1,
                                        level_limit=0)
        self.set_file(name='targets_bed', path=targets_bed_matches)
def find_completed_NGS580_runs(analysis_output_dir):
    '''
    Find the NGS580 run directories present in analysis_output_dir

    Directories matching the "targets" pattern are left out of the search

    return a dict that maps each directory's basename to the path that was found
    '''
    completed_dirs = find.find(search_dir=analysis_output_dir,
                               exclusion_patterns=["targets"],
                               search_type='dir',
                               level_limit=0)
    # the directory basename doubles as the run id
    return {os.path.basename(run_dir): run_dir for run_dir in completed_dirs}
def find_samplesheets():
    '''
    Collect the samplesheet files, representing potential runs to check for demultiplexing

    Files matching "*-SampleSheet.csv" are searched for in the module-level
    samplesheet_source_dir with level_limit=1
    ex:
    /ifs/data/molecpathlab/quicksilver/to_be_demultiplexed/NGS580/170519_NB501073_0010_AHCLLMBGX2-SampleSheet.csv

    return a list of the matching file paths
    '''
    # samplesheet_source_dir is read from module scope
    samplesheet_files = list(
        find.find(search_dir=samplesheet_source_dir,
                  inclusion_patterns="*-SampleSheet.csv",
                  search_type='file',
                  level_limit=1))
    logger.debug("Samplesheets found: {0}".format(samplesheet_files))
    return samplesheet_files
Exemplo n.º 9
0
    def _init_dirs(self):
        """
        Registers a directory entry for each output step named in ``self.analysis_output_index``

        Each key of the index, except '_parent', is used as a directory name pattern for a ``level_limit=0`` search of ``self.dir``; the search result is stored under that key with ``self.set_dir``

        Todo
        ----
        This is obtaining configs from the local config file; don't use these configs anymore, dont use these attributes, need to remove them. When using this module with ``snsxt``, the tasks should instead get the files explicitly from the tasks' ``input_dir``
        """
        # only the keys of the index are needed here; the values are ignored
        for step_name, _ in self.analysis_output_index.items():
            if step_name == '_parent':
                continue
            step_dir_matches = find.find(search_dir=self.dir,
                                         inclusion_patterns=step_name,
                                         search_type="dir",
                                         num_limit=1,
                                         level_limit=0)
            self.set_dir(name=step_name, path=step_dir_matches)
Exemplo n.º 10
0
def fastq_present(search_dir):
    """
    Reports whether a search of a directory finds a '.fastq.gz' file

    Parameters
    ----------
    search_dir: str
        path to directory to search

    Returns
    -------
    bool
        ``True`` if the search result is truthy (a .fastq.gz file was found), otherwise ``False``

    """
    # a single match is enough, so the search is capped at one result
    return bool(find.find(search_dir = search_dir,
                          inclusion_patterns = ('*.fastq.gz',),
                          search_type = 'file',
                          num_limit = 1))
Exemplo n.º 11
0
    def search_for_samples_pairs_sheet(self, id, search_dir, sheet_pattern):
        '''
        Search for a tumor-normal samples pairs samplesheet to match the current sequencing run

        The search uses two inclusion patterns with match_mode='all': the run id
        followed by '*', and sheet_pattern. Only search_dir itself is searched
        (level_limit=0). The search result is logged and returned as-is.

        ex:
        from util import find
        search_dir = '/ifs/data/molecpathlab/quicksilver/to_be_demultiplexed/NGS580'
        id = '170824_NB501073_0020_AHHK37BGX3'
        sheet_pattern = '*-samples.pairs.csv'
        '''
        # file names must start with the run id and also match the sheet pattern
        run_prefix_pattern = str(id) + '*'
        found_sheets = find.find(search_dir=search_dir,
                                 search_type='file',
                                 inclusion_patterns=[run_prefix_pattern,
                                                     sheet_pattern],
                                 level_limit=0,
                                 match_mode='all')
        self.logger.debug(
            'Found tumor-normal samplesheet for run {0}: {1}'.format(
                id, found_sheets))
        return found_sheets
Exemplo n.º 12
0
def copy_sequencer_files(analysis_dir, sequencer_output_path, other_files = None):
    """
    Copies the sequencer files named in ``settings.sequencer_files``, plus any extra files passed, into the ``analysis_dir``

    ``settings.sequencer_files`` is split on ',' and each entry is searched for in ``sequencer_output_path``; the first match for each entry is copied. Entries with no match are skipped.

    Parameters
    ----------
    sequencer_output_path: str
        path to the directory containing the sequencer output to search in
    analysis_dir: str
        path to the directory to copy analysis files to
    other_files: list
        a list of other files to copy over to the analysis_dir, or ``None``
    """
    # collect the first match for each file name listed in the settings
    copy_files = []
    for sequencer_file in settings.sequencer_files.split(','):
        logger.debug(sequencer_file)
        matches = find.find(search_dir = sequencer_output_path,
                            inclusion_patterns = (sequencer_file,),
                            search_type = 'file',
                            num_limit = 1,
                            match_mode = "any")
        logger.debug(matches)
        if len(matches) > 0:
            copy_files.append(matches[0])

    # the extra files are copied after the ones found in the sequencer output
    if other_files:
        copy_files.extend(other_files)

    # every file lands directly in analysis_dir under its own basename
    for source_path in copy_files:
        destination_path = os.path.join(analysis_dir, os.path.basename(source_path))
        logger.debug('Copying file from:\n{0}\nto:\n{1}'.format(source_path, destination_path))
        shutil.copy2(source_path, destination_path)
Exemplo n.º 13
0
"""
import os
import sys
import csv
from util import samplesheet
from util import find
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "tuco.settings")
import django
django.setup()
from lims.models import SequencingSampleSheet


sheets_dir = os.path.realpath(sys.argv[1])

#  find all the samplesheets in the dir
for sheet_file in find.find(search_dir = sheets_dir, inclusion_patterns = ['SampleSheet.csv']):
    sheet = samplesheet.IEMFile(path = os.path.realpath(sheet_file))
    seqtype_file = os.path.join(os.path.dirname(sheet_file), 'seqtype.txt')
    Run_ID = os.path.basename(os.path.dirname(sheet_file))

    # load seqtype
    if os.path.exists(seqtype_file):
        with open(seqtype_file) as f:
            lines = f.readlines()
            Seq_Type = lines[0].strip()

            # get the samplesheet entry from database
            sheet_instance = SequencingSampleSheet.objects.get(md5 = sheet.meta['Sheet_md5'])

            # check if it already has seq_type..
            if not sheet_instance.seq_type or sheet_instance.seq_type == '' and Seq_Type != '':
Exemplo n.º 14
0
...

"""
import os
import sys
import datetime
from util import find
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "tuco.settings")
import django
django.setup()
from lims.models import SequencingRun

seq_dir = os.path.realpath(sys.argv[1])

for run_dir in find.find(search_dir=seq_dir,
                         search_type='dir',
                         exclusion_patterns=['.*'],
                         level_limit=0):
    Run_ID = os.path.basename(run_dir)
    Run_path = os.path.realpath(run_dir)
    seqtype_file = os.path.join(Run_path, 'seqtype.txt')
    parts = Run_ID.split('_')
    Date = ''
    Sequencer_Serial = ''
    Run_Num = ''
    Flowcell_ID = ''

    # check if Run_ID can be parsed
    if len(parts) == 4:  # ['180711', 'NB501073', '0057', 'AHFLL2BGX7']
        Date = datetime.datetime.strptime(parts[0], '%y%m%d')
        Sequencer_Serial = parts[1]
        Run_Num = parts[2]
Exemplo n.º 15
0
# get the date for the run from the first 6 characters of the runID:
# they are parsed as a yymmdd date and re-formatted as YYYY-MM-DD
runDate = datetime.datetime.strptime(config['runID'][:6],
                                     '%y%m%d').strftime('%Y-%m-%d')

# read list of sampleIDs from the deliverables sheet, one per line with
# surrounding whitespace stripped
# NOTE(review): a blank line in the sheet yields an empty sampleID, which makes
# the search pattern below '*.fastq.gz' -- confirm the sheet never has blank lines
sampleIDs = []
with open(deliverables_sheet) as f:
    for line in f:
        sampleIDs.append(line.strip())

# find all matching .fastq.gz files in the output locations; a file counts
# for a sample when its name matches '<sampleID>*.fastq.gz'
deliverableFiles = []
for sampleID in sampleIDs:
    for item in find.find(
            search_dir=outputDir,
            inclusion_patterns=['{0}*.fastq.gz'.format(sampleID)]):
        deliverableFiles.append(item)
# drop duplicate paths; going through a set does not keep the original order
deliverableFiles = list(set(deliverableFiles))

# set up deliverables directories:
# <deliverableDir>/<deliverableID>/<runDate>/fastq
deliverableSubdir = os.path.join(deliverableDir, deliverableID, runDate,
                                 'fastq')
try:
    os.makedirs(deliverableSubdir)
except OSError:
    # an already existing directory is fine; any other failure is re-raised
    if not os.path.isdir(deliverableSubdir):
        raise

# symlink the files to the deliverables dir
for item in deliverableFiles: