Exemplo n.º 1
0
    def __init__(self,fastq_strand_out):
        """
        Create a new Fastqstrand instance

        Reads the named fastq_strand output file and, for each
        genome listed in it, stores an AttributeDictionary with
        the following items:

        - forward: value from the '1st forward' column
        - reverse: value from the '2nd reverse' column
        - ratio: forward/reverse (infinity if reverse is zero
          but forward is not; 'None' if neither is positive)
        - strandedness: 'reverse' (ratio < 0.2), 'forward'
          (ratio > 5), 'unstranded?' (anything in between) or
          'undetermined' (no ratio could be calculated)

        Blank lines are skipped, and any content which appears
        ahead of the '#Genome' header line is ignored.

        Arguments:
          fastq_strand_out (str): path to the output file
            from fastq_strand

        Raises:
          Exception: if the file doesn't contain a '#Genome'
            header line.

        """
        self._fastq_strand_out = os.path.abspath(fastq_strand_out)
        self._version = None
        self._genomes = AttributeDictionary()
        # Read in data
        tabfile = None
        with open(self._fastq_strand_out,'r') as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    # Blank lines carry no data
                    continue
                if line.startswith('#fastq_strand version:'):
                    self._version = line.split()[2]
                elif line.startswith('#Genome'):
                    # Header line defines the column names
                    tabfile = TabFile(column_names=line[1:].split('\t'))
                elif tabfile is not None:
                    tabfile.append(tabdata=line)
                # Anything else precedes the header: there are no
                # column names to interpret it with, so ignore it
        # Check there is some data
        if tabfile is None:
            raise Exception("Unable to extract fastq_strand data from %s" %
                            self._fastq_strand_out)
        # Copy data to main object
        for line in tabfile:
            # Store the data
            data = AttributeDictionary()
            self._genomes[line['Genome']] = data
            data['forward'] = line['1st forward']
            data['reverse'] = line['2nd reverse']
            # Additional processing
            if data.reverse > 0.0:
                ratio = float(data.forward)/float(data.reverse)
            elif data.forward > 0.0:
                ratio = float("+inf")
            else:
                ratio = None
            # Classify the strandedness (an infinite ratio also
            # satisfies the 'ratio > 5' test)
            if ratio is None:
                strandedness = "undetermined"
            elif ratio < 0.2:
                strandedness = "reverse"
            elif ratio > 5:
                strandedness = "forward"
            else:
                strandedness = "unstranded?"
            data['ratio'] = ratio
            data['strandedness'] = strandedness
Exemplo n.º 2
0
    def get_organism_config(self, section=None, config=None):
        """
        Fetch the settings for an organism from the .ini data

        Looks up each of the organism-related options in the
        named section (e.g. 'organism:Human') and returns them
        in an AttributeDictionary. If either the section or the
        config object is not supplied then every option is set
        to 'None'.

        The options which are looked up are:

        - star_index (str, path to STAR index)
        - bowtie_index (str, path to Bowtie index)
        - cellranger_reference (str)
        - cellranger_premrna_reference (str)
        - cellranger_atac_reference (str)
        - cellranger_arc_reference (str)

        Arguments:
          section (str): name of the section to read the
            options from
          config (Config): Config object with the settings
            loaded

        Returns:
          AttributeDictionary: mapping of option names to
            values.
        """
        option_names = (
            'star_index',
            'bowtie_index',
            'cellranger_reference',
            'cellranger_premrna_reference',
            'cellranger_atac_reference',
            'cellranger_arc_reference',
        )
        # Values can only be looked up if both a section name
        # and a config object were supplied
        can_lookup = bool(section and config)
        organism = AttributeDictionary()
        for option in option_names:
            organism[option] = (config.get(section, option, None)
                                if can_lookup else None)
        return organism
def fetch_protocol_definition(name):
    """
    Look up a QC protocol and return its definition

    Arguments:
      name (str): name of the QC protocol

    Returns:
      Tuple: the definition as (reads,qc_modules), where
        'reads' is an AttributeDictionary with the elements
        'seq_data' (sequence data reads), 'index' (index
        reads) and 'qc' (sorted list combining both), and
        'qc_modules' is a list of the QC module definitions.

    Raises:
      KeyError: if 'name' is not a defined QC protocol.
      Exception: if the stored definition is missing one of
        the expected keys.
    """
    if name not in QC_PROTOCOLS:
        raise KeyError("%s: undefined QC protocol" % name)
    definition = QC_PROTOCOLS[name]
    reads = AttributeDictionary()
    try:
        read_spec = definition['reads']
        # Take copies so the stored definition isn't shared
        reads['seq_data'] = list(read_spec['seq_data'])
        reads['index'] = list(read_spec['index'])
        reads['qc'] = sorted(reads.seq_data + reads.index)
        qc_modules = list(definition['qc_modules'])
    except KeyError as ex:
        raise Exception("%s: exception loading QC protocol "
                        "definition: %s" % (name, ex))
    return (reads, qc_modules)
Exemplo n.º 4
0
    def add_section(self, section):
        """
        Register a new section (or subsection)

        An identifier of the form SECTION:SUBSECTION adds an
        empty subsection to SECTION, creating SECTION first if
        it hasn't been added already. Any other identifier is
        added as a top-level section under its full name.

        Arguments:
          section (str): an identifier of the form
            SECTION[:SUBSECTION] which specifies the
            section to add

        """
        parts = section.split(':')
        if len(parts) != 2:
            # Not of the form SECTION:SUBSECTION, so treat the
            # whole identifier as a top-level section name
            self._sections.append(section)
            setattr(self, section, AttributeDictionary())
            return
        parent, child = parts
        if parent not in self._sections:
            self.add_section(parent)
        getattr(self, parent)[child] = AttributeDictionary()
Exemplo n.º 5
0
    def get_bcl2fastq_config(self,section,config):
        """
        Fetch bcl2fastq settings from a section of the .ini data

        Given the name of a section (e.g. 'bcl2fastq',
        'platform:miseq'), looks up the bcl2fastq settings and
        returns them in an AttributeDictionary object.

        For the 'bcl2fastq' section the options are:

        - default_version (default 'None')
        - nprocessors (default 1)
        - no_lane_splitting (default 'False')
        - create_empty_fastqs (default 'True')

        For any other section the options are:

        - bcl2fastq
        - nprocessors
        - no_lane_splitting
        - create_empty_fastqs

        (all defaulting to 'None').

        Arguments:
          section (str): name of the section to retrieve the
            settings from
          config (Config): Config object with settings loaded

        Returns:
          AttributeDictionary: dictionary of option:value pairs.

        """
        settings = AttributeDictionary()
        if section == 'bcl2fastq':
            # Main section: holds the default version and
            # supplies concrete fallback values
            settings['default_version'] = config.get(section,
                                                     'default_version',
                                                     None)
            nprocessors_default = 1
            lane_splitting_default = False
            empty_fastqs_default = True
        else:
            # Other sections: unset options are left as 'None'
            settings['bcl2fastq'] = config.get(section,'bcl2fastq',None)
            nprocessors_default = None
            lane_splitting_default = None
            empty_fastqs_default = None
        settings['nprocessors'] = config.getint(section,
                                                'nprocessors',
                                                nprocessors_default)
        settings['no_lane_splitting'] = config.getboolean(
            section,
            'no_lane_splitting',
            lane_splitting_default)
        settings['create_empty_fastqs'] = config.getboolean(
            section,
            'create_empty_fastqs',
            empty_fastqs_default)
        return settings
Exemplo n.º 6
0
    def get_sequencer_config(self, section, config):
        """
        Retrieve 'sequencer' configuration options from .ini file

        Given the name of a section (e.g. 'sequencer:SN7001250'),
        fetch the data associated with the sequencer instrument
        and return in an AttributeDictionary object.

        The items that can be extracted are:

        - platform (compulsory, str)
        - model (str, default 'None')

        Pairs of enclosing quote characters are removed from the
        model name (so a model given as '""' becomes an empty
        string).

        Arguments:
          section (str): name of the section to retrieve the
            settings from
          config (Config): Config object with settings loaded

        Returns:
          AttributeDictionary: dictionary of option:value pairs.

        Raises:
          Exception: if the section doesn't define 'platform'.
        """
        values = AttributeDictionary()
        values['platform'] = config.get(section, 'platform', None)
        values['model'] = config.get(section, 'model', None)
        if values['platform'] is None:
            raise Exception("%s: missing required 'platform'" % section)
        model = values['model']
        if model:
            # Strip enclosing quotes: the length check stops the
            # loop once fewer than two characters remain, so the
            # indexing can't run off the end of an emptied string
            quotes = ('"', '\'')
            while (len(model) > 1 and
                   model[0] in quotes and
                   model[-1] in quotes):
                model = model[1:-1]
            values['model'] = model
        return values
Exemplo n.º 7
0
    def get_destination_config(self, section, config):
        """
        Fetch data transfer settings for a 'destination' section

        Given the name of a section (e.g. 'destination:webserver'),
        looks up the associated data transfer settings and returns
        them in an AttributeDictionary object.

        The options that are looked up are:

        - directory (str, default 'None')
        - subdir (str, default 'None')
        - readme_template (str, default 'None')
        - url (str, default 'None')
        - include_downloader (boolean, default 'False')
        - include_qc_report (boolean, default 'False')
        - hard_links (boolean, default 'False')

        Arguments:
          section (str): name of the section to retrieve the
            settings from
          config (Config): Config object with settings loaded

        Returns:
          AttributeDictionary: dictionary of option:value pairs.

        """
        string_options = ('directory', 'subdir', 'readme_template', 'url')
        boolean_options = ('include_downloader', 'include_qc_report',
                           'hard_links')
        destination = AttributeDictionary()
        # String-valued options default to None
        for option in string_options:
            destination[option] = config.get(section, option, None)
        # Boolean flags default to False
        for option in boolean_options:
            destination[option] = config.getboolean(section, option, False)
        return destination
    def analyse(self,lane=None,sample_sheet=None,cutoff=None,
                mismatches=0):
        """
        Analyse barcode frequencies

        Returns a dictionary with the following keys:

        - barcodes: list of barcodes (or reference barcodes,
          if mismatches > 0)
        - cutoff: the specified cutoff fraction
        - mismatches: the specified number of mismatches to
          allow
        - total_reads: the total number of reads for the
          specified lane (or all reads, if no lane was
          specified)
        - coverage: the number of reads after cutoffs have
          been applied
        - counts: dictionary with barcodes from the 'barcodes'
          list as keys; each key points to a dictionary with
          keys:
          * reads: number of reads associated with this barcode
            (or group, if mismatches > 0)
          * sample: name of the associated sample (if a sample
            sheet was supplied, otherwise 'None')
          * sequences: number of sequences in the group (always
            1 if mismatches == 0)

        Arguments:
          lane (integer): lane to restrict analysis to (None
            analyses all lanes)
          sample_sheet (str): sample sheet file to compare
            barcodes against (None skips comparison)
          cutoff (float): if mismatches == 0 then barcodes must
            have at least this fraction of reads to be included;
            (if mismatches > 0 then this condition is applied to
            groups instead)
          mismatches (integer): number of mismatches to allow
            when grouping barcodes (if zero then barcodes are
            not grouped)

        Returns:
          AttributeDictionary: results of the analysis, with
            the keys described above.

        """
        if sample_sheet is not None:
            sample_sheet = SampleSheetBarcodes(sample_sheet)
            sample_sheet_barcodes = sample_sheet.barcodes(lane)
        else:
            sample_sheet_barcodes = None
        if not mismatches:
            groups = None
            barcodes = self.filter_barcodes(cutoff=cutoff,lane=lane)
        else:
            groups = self.group(lane,mismatches=mismatches,
                                seed_barcodes=sample_sheet_barcodes,
                                cutoff=cutoff)
            barcodes = [grp.reference for grp in groups]
        analysis = AttributeDictionary(
            barcodes=barcodes,
            cutoff=cutoff,
            counts=dict(),
            total_reads=self.nreads(lane=lane),
            mismatches=mismatches
        )
        cum_reads = 0
        if groups:
            for group in groups:
                barcode = group.reference
                barcode_reads = group.counts
                cum_reads += barcode_reads
                if sample_sheet is None:
                    # No sample sheet
                    sample = None
                else:
                    try:
                        # Exact match
                        sample = sample_sheet.lookup_sample(barcode,lane)
                    except KeyError:
                        # Closest match(es)
                        matches = [sample_sheet.lookup_sample(seq,lane)
                                   for seq in sample_sheet.barcodes(lane)
                                   if group.match(seq,mismatches)]
                        sample = ','.join(matches) if matches else None
                analysis.counts[barcode] = AttributeDictionary(
                    reads=barcode_reads,
                    sample=sample,
                    sequences=len(group)
                )
        else:
            for barcode in barcodes:
                barcode_reads = self.counts(barcode,lane)
                cum_reads += barcode_reads
                if sample_sheet is None:
                    # No sample sheet
                    sample = None
                else:
                    try:
                        sample = sample_sheet.lookup_sample(barcode,lane)
                    except KeyError:
                        # Barcode isn't in the sample sheet
                        sample = None
                analysis.counts[barcode] = AttributeDictionary(
                    reads=barcode_reads,
                    sample=sample,
                    sequences=1
                )
        analysis['coverage'] = cum_reads
        return analysis
Exemplo n.º 9
0
    def __init__(self, settings_file=None):
        """
        Create new Settings instance

        If 'settings_file' is specified then this should be the
        full path to an appropriately formatted '.ini' file.

        Otherwise the class will attempt to locate an appropriate
        file to use: by default this will be a file called
        'auto_process.ini' which will exist somewhere in the
        search path defined by the 'locate_settings_file'
        function; if no file with this name can be found then
        the class will fallback to looking for a file with the
        older 'settings.ini' file name.

        If no settings file is found at all then the values
        are read from the 'auto_process.ini.sample' file in
        the directory returned by 'get_config_dir'.

        The settings are loaded into sections, each of which
        is added via 'add_section' and populated from the
        configuration. Settings found in deprecated sections
        are merged into their current equivalents, with a
        warning being logged.

        Arguments:
          settings_file (str): optional, path to the '.ini'
            file to load the settings from

        """
        # Initialise list of sections
        self._sections = []
        # Locate settings file
        if settings_file is None:
            # Look for default
            self.settings_file = locate_settings_file(name="auto_process.ini",
                                                      create_from_sample=False)
            if self.settings_file is None:
                # Fallback to old name
                self.settings_file = locate_settings_file(
                    name="settings.ini", create_from_sample=False)
        else:
            self.settings_file = os.path.abspath(settings_file)
        # Import site-specific settings from local version
        config = Config()
        if self.settings_file:
            config.read(self.settings_file)
        else:
            # Look for sample settings file
            config.read(
                os.path.join(get_config_dir(), 'auto_process.ini.sample'))
        # General parameters
        self.add_section('general')
        # Keep the plain 'default_runner' value: it is used further
        # down as the fallback when reading the 'runners' section
        default_runner = config.get('general', 'default_runner',
                                    'SimpleJobRunner')
        self.general['default_runner'] = config.getrunner(
            'general', 'default_runner', 'SimpleJobRunner')
        self.general['max_concurrent_jobs'] = config.getint(
            'general', 'max_concurrent_jobs', 12)
        self.general['max_cores'] = config.getint('general', 'max_cores')
        self.general['max_batches'] = config.getint('general', 'max_batches')
        self.general['poll_interval'] = config.getfloat(
            'general', 'poll_interval', 5)
        # modulefiles
        self.add_section('modulefiles')
        self.modulefiles['make_fastqs'] = config.get('modulefiles',
                                                     'make_fastqs')
        self.modulefiles['bcl2fastq'] = config.get('modulefiles', 'bcl2fastq')
        self.modulefiles['bcl_convert'] = config.get('modulefiles',
                                                     'bcl_convert')
        self.modulefiles['cellranger_mkfastq'] = config.get(
            'modulefiles', 'cellranger_mkfastq')
        self.modulefiles['cellranger_atac_mkfastq'] = config.get(
            'modulefiles', 'cellranger_atac_mkfastq')
        self.modulefiles['cellranger_arc_mkfastq'] = config.get(
            'modulefiles', 'cellranger_arc_mkfastq')
        self.modulefiles['spaceranger_mkfastq'] = config.get(
            'modulefiles', 'spaceranger_mkfastq')
        self.modulefiles['run_qc'] = config.get('modulefiles', 'run_qc')
        self.modulefiles['publish_qc'] = config.get('modulefiles',
                                                    'publish_qc')
        self.modulefiles['process_icell8'] = config.get(
            'modulefiles', 'process_icell8')
        self.modulefiles['fastqc'] = config.get('modulefiles', 'fastqc')
        self.modulefiles['fastq_screen'] = config.get('modulefiles',
                                                      'fastq_screen')
        self.modulefiles['fastq_strand'] = config.get('modulefiles',
                                                      'fastq_strand')
        self.modulefiles['cellranger'] = config.get('modulefiles',
                                                    'cellranger')
        self.modulefiles['report_qc'] = config.get('modulefiles', 'report_qc')
        self.modulefiles['cutadapt'] = config.get('modulefiles', 'cutadapt')
        # Handle legacy 'illumina_qc' modulefile: its value is only
        # used for 'fastqc' and 'fastq_screen' when these haven't
        # been set explicitly
        legacy_illumina_qc_modulefiles = config.get('modulefiles',
                                                    'illumina_qc')
        if legacy_illumina_qc_modulefiles:
            if not self.modulefiles['fastqc']:
                logger.warning("Setting 'fastqc' modulefile parameter "
                               "using deprecated 'illumina_qc' parameter")
                self.modulefiles['fastqc'] = legacy_illumina_qc_modulefiles
            if not self.modulefiles['fastq_screen']:
                logger.warning("Setting 'fastq_screen' modulefile parameter "
                               "using deprecated 'illumina_qc' parameter")
                self.modulefiles['fastq_screen'] = \
                                legacy_illumina_qc_modulefiles
        # conda
        self.add_section('conda')
        self.conda['enable_conda'] = config.getboolean('conda', 'enable_conda',
                                                       False)
        self.conda['env_dir'] = config.get('conda', 'env_dir', None)
        if self.conda['env_dir']:
            # Expand any environment variables in the path
            self.conda['env_dir'] = os.path.expandvars(self.conda.env_dir)
        # bcl_conversion
        self.add_section('bcl_conversion')
        # Add settings from legacy bcl2fastq section first
        # (this rebinds 'bcl_conversion' to the returned dictionary,
        # replacing the empty one created by 'add_section')
        self.bcl_conversion = self.get_bcl_converter_config(
            'bcl2fastq', config)
        # Update with settings from bcl_conversion section
        # (the return value is discarded, so this relies on the
        # dictionary passed as the final argument being updated
        # in place)
        self.get_bcl_converter_config('bcl_conversion', config,
                                      self.bcl_conversion)
        # qc
        self.add_section('qc')
        self.qc['nprocessors'] = config.getint('qc', 'nprocessors', None)
        self.qc['fastq_screens'] = config.get('qc', 'fastq_screens', None)
        self.qc['fastq_screen_subset'] = config.getint('qc',
                                                       'fastq_screen_subset',
                                                       100000)
        self.qc['use_legacy_screen_names'] = \
                                             config.getboolean(
                                                 'qc',
                                                 'use_legacy_screen_names',
                                                 False)
        # Fastq screens (one subsection for each 'screen:NAME'
        # section in the config)
        self.add_section('screens')
        for section in filter(lambda x: x.startswith('screen:'),
                              config.sections()):
            screen = section.split(':')[1]
            self.screens[screen] = AttributeDictionary(conf_file=None)
            self.screens[screen]['conf_file'] = config.get(
                section, 'conf_file', None)
        # Organisms (one subsection for each 'organism:NAME'
        # section in the config)
        self.add_section('organisms')
        for section in filter(lambda x: x.startswith('organism:'),
                              config.sections()):
            organism = section.split(':')[1]
            self.organisms[organism] = self.get_organism_config(
                section, config)
        # In each of the legacy blocks below, an organism which
        # hasn't been seen yet is first given an entry with all
        # values set to None (via 'get_organism_config' with no
        # arguments); the warning is only reached if the legacy
        # section exists (i.e. no NoSectionError was raised)
        # Handle legacy STAR index specifications (fastq_strand_indexes)
        try:
            for organism, index_file in config.items('fastq_strand_indexes'):
                if organism not in self.organisms:
                    self.organisms[organism] = self.get_organism_config()
                self['organisms'][organism]['star_index'] = index_file
            logger.warning("Added STAR index information from "
                           "deprecated 'fastq_strand_indexes' section (use "
                           "'organism:ORGANISM' sections instead)")
        except NoSectionError:
            pass
        # Legacy 10xgenomics transcriptome references
        try:
            for organism, reference in config.items(
                    '10xgenomics_transcriptomes'):
                if organism not in self.organisms:
                    self.organisms[organism] = self.get_organism_config()
                self['organisms'][organism]['cellranger_reference'] = reference
            logger.warning("Added cellranger references from deprecated "
                           "'10xgenomics_transcriptomes' section (use "
                           "'organism:ORGANISM' sections instead)")
        except NoSectionError:
            pass
        # Legacy 10xgenomics snRNA-seq pre-mRNA references
        try:
            for organism, reference in config.items(
                    '10xgenomics_premrna_references'):
                if organism not in self.organisms:
                    self.organisms[organism] = self.get_organism_config()
                self['organisms'][organism][
                    'cellranger_premrna_reference'] = reference
            logger.warning("Added cellranger pre-mRNA references from "
                           "deprecated '10xgenomics_premrna_references' "
                           "section (use 'organism:ORGANISM' sections "
                           "instead)")
        except NoSectionError:
            pass
        # Legacy 10xgenomics scATAC-seq genome references
        try:
            for organism, reference in config.items(
                    '10xgenomics_atac_genome_references'):
                if organism not in self.organisms:
                    self.organisms[organism] = self.get_organism_config()
                self['organisms'][organism][
                    'cellranger_atac_reference'] = reference
            logger.warning("Added cellranger-atac references from deprecated "
                           "'10xgenomics_atac_genome_references' section "
                           "(use 'organism:ORGANISM' sections instead)")
        except NoSectionError:
            pass
        # Legacy 10xGenomics cellranger ARC single cell multiome references
        try:
            for organism, reference in config.items(
                    '10xgenomics_multiome_references'):
                if organism not in self.organisms:
                    self.organisms[organism] = self.get_organism_config()
                self['organisms'][organism][
                    'cellranger_arc_reference'] = reference
            logger.warning("Added cellranger-arc references from deprecated "
                           "'10xgenomics_multiome_references' section "
                           "(use 'organism:ORGANISM' sections instead)")
        except NoSectionError:
            pass
        # Sequencers (one subsection for each 'sequencer:NAME'
        # section in the config)
        self.add_section('sequencers')
        for section in filter(lambda x: x.startswith('sequencer:'),
                              config.sections()):
            instrument = section.split(':')[1]
            self.sequencers[instrument] = self.get_sequencer_config(
                section, config)
        # Add any settings from the legacy 'sequencers' section
        # (a platform given here replaces the platform for an
        # instrument which already has an entry)
        try:
            for instrument, platform in config.items('sequencers'):
                if instrument not in self.sequencers:
                    self['sequencers'][instrument] = \
                        AttributeDictionary(platform=None,
                                            model=None)
                self['sequencers'][instrument]['platform'] = platform
            logger.warning("Added sequencer information from "
                           "deprecated 'sequencers' section (use "
                           "'sequencer:INSTRUMENT' sections "
                           "instead)")
        except NoSectionError:
            pass
        # Sequencing platform-specific defaults
        self.add_section('platform')
        for section in filter(lambda x: x.startswith('platform:'),
                              config.sections()):
            platform = section.split(':')[1]
            self.platform[platform] = self.get_bcl_converter_config(
                section, config)
        # Handle deprecated bcl2fastq settings
        for platform in ('hiseq', 'miseq', 'nextseq'):
            if config.has_option('bcl2fastq', platform):
                logger.warning("Deprecated setting in [bcl2fastq]: '%s'" %
                               platform)
            # The deprecated value is only used if the lookup in
            # the platform settings raises KeyError
            try:
                bcl2fastq = self.platform[platform]['bcl2fastq']
            except KeyError:
                bcl2fastq = config.get('bcl2fastq', platform)
                if bcl2fastq is None:
                    continue
                logger.warning(
                    "Setting 'bcl2fastq' in '[platform:%s]' to '%s'" %
                    (platform, bcl2fastq))
                if platform not in self.platform:
                    self.platform[platform] = AttributeDictionary()
                self.platform[platform]['bcl2fastq'] = bcl2fastq
        # Metadata defaults
        self.add_section('metadata')
        self.metadata['default_data_source'] = config.get(
            'metadata', 'default_data_source')
        # icell8
        self.add_section('icell8')
        self.icell8['aligner'] = config.get('icell8', 'aligner')
        self.icell8['batch_size'] = config.getint('icell8', 'batch_size',
                                                  5000000)
        self.icell8['mammalian_conf_file'] = config.get(
            'icell8', 'mammalian_conf_file')
        self.icell8['contaminants_conf_file'] = config.get(
            'icell8', 'contaminants_conf_file')
        self.icell8['nprocessors_contaminant_filter'] = config.getint(
            'icell8', 'nprocessors_contaminant_filter', None)
        self.icell8['nprocessors_statistics'] = config.getint(
            'icell8', 'nprocessors_statistics', None)
        # 10xgenomics
        self.add_section('10xgenomics')
        self['10xgenomics']['cellranger_jobmode'] = config.get(
            '10xgenomics', 'cellranger_jobmode', 'local')
        self['10xgenomics']['cellranger_maxjobs'] = config.getint(
            '10xgenomics', 'cellranger_maxjobs', 24)
        self['10xgenomics']['cellranger_mempercore'] = config.getint(
            '10xgenomics', 'cellranger_mempercore', 5)
        self['10xgenomics']['cellranger_jobinterval'] = config.getint(
            '10xgenomics', 'cellranger_jobinterval', 100)
        self['10xgenomics']['cellranger_localmem'] = config.getint(
            '10xgenomics', 'cellranger_localmem', 5)
        self['10xgenomics']['cellranger_localcores'] = config.getint(
            '10xgenomics', 'cellranger_localcores', None)
        # fastq_stats
        self.add_section('fastq_stats')
        self.fastq_stats['nprocessors'] = config.getint(
            'fastq_stats', 'nprocessors', None)
        # Define runners for specific jobs (these fall back to
        # the default runner from the 'general' section)
        self.add_section('runners')
        for name in (
                'bcl2fastq',
                'bcl_convert',
                'qc',
                'star',
                'stats',
                'rsync',
                'icell8',
                'icell8_contaminant_filter',
                'icell8_statistics',
                'icell8_report',
                'cellranger',
        ):
            self.runners[name] = config.getrunner('runners', name,
                                                  default_runner)
        # Handle new runners that default to the 'qc' runner
        # ('star' is also in the previous list, so the value which
        # was assigned to it there is replaced here)
        for name in (
                'fastqc',
                'fastq_screen',
                'star',
        ):
            self.runners[name] = config.getrunner('runners', name,
                                                  self.runners.qc)
        # Information for archiving analyses
        # dirn should be a directory in the form [[user@]host:]path]
        self.add_section('archive')
        self.archive['dirn'] = config.get('archive', 'dirn', None)
        self.archive['log'] = config.get('archive', 'log', None)
        self.archive['group'] = config.get('archive', 'group', None)
        self.archive['chmod'] = config.get('archive', 'chmod', None)
        # Information for uploading QC reports
        # dirn should be a directory in the form [[user@]host:]path]
        self.add_section('qc_web_server')
        self.qc_web_server['dirn'] = config.get('qc_web_server', 'dirn', None)
        self.qc_web_server['url'] = config.get('qc_web_server', 'url', None)
        self.qc_web_server['use_hierarchy'] = config.getboolean(
            'qc_web_server', 'use_hierarchy')
        self.qc_web_server['exclude_zip_files'] = config.getboolean(
            'qc_web_server', 'exclude_zip_files')
        # Templates for reporting project data
        self.add_section('reporting_templates')
        try:
            for template, fields in config.items('reporting_templates'):
                self['reporting_templates'][template] = fields
        except NoSectionError:
            logger.debug("No reporting templates defined")
        # Destinations for data transfer (one subsection for each
        # 'destination:NAME' section in the config)
        self.add_section('destination')
        for section in filter(lambda x: x.startswith('destination:'),
                              config.sections()):
            dest = section.split(':')[1]
            self.destination[dest] = self.get_destination_config(
                section, config)
Exemplo n.º 10
0
    def get_bcl_converter_config(self, section, config, attr_dict=None):
        """
        Retrieve BCL conversion configuration options from .ini file

        Given the name of a section (e.g. 'bcl_conversion',
        'platform:miseq'), fetch the BCL converter settings and return
        in an AttributeDictionary object.

        The options that can be extracted are:

        - bcl_converter
        - nprocessors
        - no_lane_splitting
        - create_empty_fastqs

        There are also some legacy options:

        - default_version
        - bcl2fastq

        The legacy options are used to set 'bcl_converter' (as
        'bcl2fastq' followed by the value of the option):
        'default_version' is read from the deprecated 'bcl2fastq'
        section, and 'bcl2fastq' is read from the other sections
        when 'bcl_converter' isn't set there.

        If an existing dictionary is supplied via 'attr_dict' then
        it is updated in place and returned; values that are
        already present are only replaced when the section
        supplies a new value.

        Arguments:
          section (str): name of the section to retrieve the
            settings from
          config (Config): Config object with settings loaded
          attr_dict (AttributeDictionary): optional, existing
            AttributeDictionary which will be added to (this
            can be empty)

        Returns:
          AttributeDictionary: dictionary of option:value pairs.

        """
        # Test against None rather than for truthiness, so that an
        # empty dictionary supplied by the caller is still the one
        # that gets updated
        if attr_dict is not None:
            values = attr_dict
        else:
            values = AttributeDictionary()
        if section == 'bcl2fastq':
            # Deprecated [bcl2fastq] section
            value = config.get(section, 'default_version', None)
            if value:
                values['bcl_converter'] = "bcl2fastq%s" % value
        else:
            # [bcl_conversion] and [platform:...] sections
            bcl2fastq = config.get(section, 'bcl2fastq', None)
            value = config.get(section, 'bcl_converter', None)
            if value:
                values['bcl_converter'] = value
            elif bcl2fastq is not None:
                values['bcl_converter'] = "bcl2fastq%s" % bcl2fastq
            elif 'bcl_converter' not in values:
                values['bcl_converter'] = None
        # Common settings: only replace an existing value if the
        # section provides a new one
        value = config.getint(section, 'nprocessors', None)
        if value or 'nprocessors' not in values:
            values['nprocessors'] = value
        value = config.getboolean(section, 'no_lane_splitting', None)
        if value is not None or 'no_lane_splitting' not in values:
            values['no_lane_splitting'] = value
        value = config.getboolean(section, 'create_empty_fastqs', None)
        if value is not None or 'create_empty_fastqs' not in values:
            values['create_empty_fastqs'] = value
        return values
    def analyse(self,
                lane=None,
                sample_sheet=None,
                cutoff=None,
                mismatches=0,
                minimum_read_fraction=0.000001):
        """
        Analyse barcode frequencies

        Returns a dictionary with the following keys:

        - barcodes: list of barcodes (or reference barcodes,
          if mismatches > 0)
        - cutoff: the specified cutoff fraction
        - mismatches: the specified number of mismatches to
          allow
        - total_reads: the total number of reads for the
          specified lane (or all reads, if no lane was
          specified)
        - coverage: the number of reads after cutoffs have
          been applied
        - counts: dictionary with barcodes from the 'barcodes'
          list as keys; each key points to a dictionary with
          keys:
          * reads: number of reads associated with this barcode
            (or group, if mismatches > 0)
          * sample: name of the associated sample (if a sample
            sheet was supplied, otherwise 'None'); when grouping
            and there is no exact match, a comma-separated list
            of the samples whose barcodes match the group
          * sequences: number of sequences in the group (always
            1 if mismatches == 0)

        Arguments:
          lane (integer): lane to restrict analysis to (None
            analyses all lanes)
          sample_sheet (str): sample sheet file to compare
            barcodes against (None skips comparison)
          cutoff (float): if mismatches == 0 then barcodes must
            have at least this fraction of reads to be included;
            (if mismatches > 0 then this condition is applied to
            groups instead)
          mismatches (integer): maximum number of mismatched
            bases allowed when matching barcodes (default is 0
            i.e. exact matches only)
          minimum_read_fraction: speed-up parameter, excludes
            barcodes with less than this fraction of associated
            reads (speeds up the grouping calculation at the
            cost of some precision)

        Returns:
          AttributeDictionary: the analysis results, with the
            keys described above.

        """
        # Load the sample sheet barcodes (if a sample sheet
        # was supplied)
        if sample_sheet is not None:
            sample_sheet = SampleSheetBarcodes(sample_sheet)
            sample_sheet_barcodes = sample_sheet.barcodes(lane)
        else:
            sample_sheet_barcodes = None
        # Get the barcodes, either individually or as groups
        if not mismatches:
            groups = None
            barcodes = self.filter_barcodes(cutoff=cutoff, lane=lane)
        else:
            groups = self.group(lane,
                                mismatches=mismatches,
                                seed_barcodes=sample_sheet_barcodes,
                                cutoff=cutoff,
                                minimum_read_fraction=minimum_read_fraction)
            barcodes = [grp.reference for grp in groups]
        analysis = AttributeDictionary(barcodes=barcodes,
                                       cutoff=cutoff,
                                       counts=dict(),
                                       total_reads=self.nreads(lane=lane),
                                       mismatches=mismatches)
        cum_reads = 0
        if groups:
            for group in groups:
                barcode = group.reference
                barcode_reads = group.counts
                cum_reads += barcode_reads
                if sample_sheet is None:
                    # No sample sheet: nothing to look up
                    sample = None
                else:
                    try:
                        # Exact match
                        sample = sample_sheet.lookup_sample(barcode, lane)
                    except KeyError:
                        # Closest match(es): any sample sheet
                        # barcode that matches the group
                        matched = [sample_sheet.lookup_sample(seq, lane)
                                   for seq in sample_sheet.barcodes(lane)
                                   if group.match(seq, mismatches)]
                        sample = ','.join(matched) if matched else None
                analysis.counts[barcode] = AttributeDictionary(
                    reads=barcode_reads, sample=sample, sequences=len(group))
        else:
            for barcode in barcodes:
                barcode_reads = self.counts(barcode, lane)
                cum_reads += barcode_reads
                if sample_sheet is None:
                    # No sample sheet: nothing to look up
                    sample = None
                else:
                    try:
                        sample = sample_sheet.lookup_sample(barcode, lane)
                    except KeyError:
                        # Barcode isn't in the sample sheet
                        sample = None
                analysis.counts[barcode] = AttributeDictionary(
                    reads=barcode_reads, sample=sample, sequences=1)
        analysis['coverage'] = cum_reads
        return analysis
Exemplo n.º 12
0
    def verify(self,fastqs,qc_protocol,fastq_screens=None,
               cellranger_version=None,cellranger_refdata=None,
               cellranger_use_multi_config=None):
        """
        Check the QC outputs for a set of Fastqs against a protocol

        Each QC module listed in the protocol definition is
        verified in turn via 'verify_qc_module', using a common
        set of default parameters which can be overridden by
        parameters attached to the module specification.

        A summary of the parameters and the status of each
        module is printed to stdout.

        Arguments:
          fastqs (list): list of Fastqs to verify outputs for
          qc_protocol (str): QC protocol to verify against
          fastq_screens (list): list of panel names to verify
            FastqScreen outputs against
          cellranger_version (str): specific version of 10x
            package to check for
          cellranger_refdata (str): specific 10x reference
            dataset to check for
          cellranger_use_multi_config (bool): if True then
            cellranger count verification will attempt to
            use data (GEX samples and reference dataset) from
            the '10x_multi_config.csv' file

        Returns:
          Boolean: True if all expected outputs are present,
            False otherwise.
        """
        # Resolve the protocol into reads and QC module specs
        reads,qc_modules = fetch_protocol_definition(qc_protocol)

        # Unique sample names in sorted order
        samples = sorted({self.fastq_attrs(fq).sample_name
                          for fq in fastqs})

        # Parameters passed to every module unless overridden
        default_params = {
            'fastqs': fastqs,
            'samples': samples,
            'seq_data_reads': reads.seq_data,
            'qc_reads': reads.qc,
            'fastq_screens': fastq_screens,
            'cellranger_version': cellranger_version,
            'cellranger_refdata': cellranger_refdata,
            'cellranger_use_multi_config': cellranger_use_multi_config,
        }

        # Run the verification for each module
        verified = {}
        params_for_module = {}
        for module_spec in qc_modules:
            # Split the spec into module name and its parameters
            module_name,overrides = parse_qc_module_spec(module_spec)
            # Keep a copy of the module parameters for the report
            params_for_module[module_name] = dict(**overrides)
            # Start from the defaults then apply the overrides
            params = AttributeDictionary(**default_params)
            for key in overrides:
                params[key] = overrides[key]
            verified[module_name] = self.verify_qc_module(module_name,
                                                          **params)

        # Templates for the report lines
        parameter_template_str = "{parameter:21s}: {value}"
        qc_module_template_str = "{name:21s}: {status:4s}{params}"

        def show_parameter(parameter,value):
            # Print a single 'parameter: value' line
            print(parameter_template_str.format(parameter=parameter,
                                                value=value))

        def show_parameter_list(parameter,items):
            # Print a list of values one per line, with the
            # parameter name on the first line only
            if not items:
                show_parameter(parameter,'')
                return
            show_parameter(parameter,items[0])
            for item in items[1:]:
                show_parameter('',item)

        # Report header
        separator = "-"*(10+len(self.qc_dir))
        print(separator)
        print("QC dir  : %s" % self.qc_dir)
        print("Protocol: %s" % qc_protocol)
        print(separator)

        # Report the default parameters
        print("Parameters:")
        for p in default_params:
            value = default_params[p]
            if p == 'fastqs':
                # Only show the Fastq file names
                show_parameter_list(p,['.../%s' % os.path.basename(fq)
                                       for fq in value])
            elif p == 'samples':
                show_parameter_list(p,value)
            elif p == 'cellranger_refdata':
                # Only show the final component of the path
                show_parameter(p,('.../%s' % os.path.basename(value)
                                  if value else value))
            else:
                show_parameter(p,value)

        # Report the status of each module
        print("-"*27)
        for name in verified:
            module_params = params_for_module[name]
            print(qc_module_template_str.format(
                name=name,
                status=('PASS' if verified[name] else 'FAIL'),
                params=(" %s" % module_params if module_params else '')))

        # Overall status: every module must have verified
        status = all(verified.values())
        print("-"*27)
        print(qc_module_template_str.format(
            name="QC STATUS",
            status=('PASS' if status else 'FAIL'),
            params=''))
        print("-"*27)

        return status
Exemplo n.º 13
0
    if 'Nreads_contaminant_filtered' in cols:
        contaminant_filtered = True
    else:
        logging.warning("No stats on contaminant filtering")
        contaminant_filtered = False

    # Rename the '#Barcodes' and '%reads_poly_g' columns
    df.rename(columns={
        '#Barcode': 'Barcode',
        '%reads_poly_g': 'percent_poly_g'
    },
              inplace=True)
    print df.head()

    # Gather the data
    data = AttributeDictionary()

    # Total reads
    data['total_reads'] = df['Nreads'].sum()

    # Total assigned reads
    df = df.drop(df[df['Barcode'] == 'Unassigned'].index)
    data['total_assigned_reads'] = df['Nreads'].sum()
    # Mean and median reads per barcode
    data['median_read_count'] = df['Nreads'].median()
    data['mean_read_count'] = df['Nreads'].mean()
    data['std_read_count'] = df['Nreads'].std()
    # Number of barcodes (total and assigned)
    data['total_barcodes'] = len(df)
    data['assigned_barcodes'] = len(df[df['Nreads'] > 0])
Exemplo n.º 14
0
    def __init__(self,settings_file=None):
        """
        Set up a Settings instance from an '.ini' file

        When 'settings_file' is given it is converted to an
        absolute path and read as the configuration file.

        When it is not given, 'locate_settings_file' is asked
        for a file to use; if that yields nothing then the
        'settings.ini.sample' file from the config directory
        is read instead.

        Arguments:
          settings_file (str): optional, path to the '.ini'
            file to load the settings from

        """
        # No sections registered yet
        self._sections = []
        # Decide which settings file to use
        if settings_file is not None:
            self.settings_file = os.path.abspath(settings_file)
        else:
            self.settings_file = locate_settings_file(create_from_sample=False)
        # Load the configuration (falling back to the sample file)
        config = Config()
        if not self.settings_file:
            config.read(os.path.join(get_config_dir(),'settings.ini.sample'))
        else:
            config.read(self.settings_file)
        # General parameters
        self.add_section('general')
        # Raw runner setting, reused below as the fallback for
        # the job-specific runners
        default_runner = config.get('general','default_runner',
                                    'SimpleJobRunner')
        self.general['default_runner'] = config.getrunner(
            'general','default_runner','SimpleJobRunner')
        self.general['max_concurrent_jobs'] = config.getint(
            'general','max_concurrent_jobs',12)
        # modulefiles
        self.add_section('modulefiles')
        for option in ('make_fastqs','run_qc','process_icell8'):
            self.modulefiles[option] = config.get('modulefiles',option)
        # bcl2fastq
        self.add_section('bcl2fastq')
        self.bcl2fastq = self.get_bcl2fastq_config('bcl2fastq',config)
        # qc
        self.add_section('qc')
        self.qc['nprocessors'] = config.getint('qc','nprocessors',1)
        self.qc['fastq_screen_subset'] = config.getint(
            'qc','fastq_screen_subset',100000)
        # Sequencing platform-specific defaults, taken from the
        # '[platform:NAME]' sections
        self.add_section('platform')
        for section_name in config.sections():
            if not section_name.startswith('platform:'):
                continue
            platform_name = section_name.split(':')[1]
            self.platform[platform_name] = self.get_bcl2fastq_config(
                section_name,config)
        # Handle deprecated bcl2fastq settings i.e. per-platform
        # options in the '[bcl2fastq]' section
        for platform_name in ('hiseq','miseq','nextseq'):
            if config.has_option('bcl2fastq',platform_name):
                logging.warning("Deprecated setting in [bcl2fastq]: '%s'"
                                % platform_name)
            try:
                # Probe for an existing platform-specific setting
                self.platform[platform_name]['bcl2fastq']
            except KeyError:
                # Nothing set: fall back to the deprecated option
                legacy_bcl2fastq = config.get('bcl2fastq',platform_name)
                if legacy_bcl2fastq is None:
                    continue
                logging.warning("Setting 'bcl2fastq' in '[platform:%s]' to '%s'"
                                % (platform_name,legacy_bcl2fastq))
                if platform_name not in self.platform:
                    self.platform[platform_name] = AttributeDictionary()
                self.platform[platform_name]['bcl2fastq'] = legacy_bcl2fastq
        # icell8
        self.add_section('icell8')
        self.icell8['aligner'] = config.get('icell8','aligner')
        self.icell8['batch_size'] = config.getint(
            'icell8','batch_size',5000000)
        for option in ('mammalian_conf_file','contaminants_conf_file'):
            self.icell8[option] = config.get('icell8',option)
        for option in ('nprocessors_contaminant_filter',
                       'nprocessors_statistics'):
            self.icell8[option] = config.getint('icell8',option,1)
        # 10xgenomics
        self.add_section('10xgenomics')
        self['10xgenomics']['cellranger_jobmode'] = config.get(
            '10xgenomics','cellranger_jobmode','sge')
        self['10xgenomics']['cellranger_mempercore'] = config.getint(
            '10xgenomics','cellranger_mempercore',5)
        self['10xgenomics']['cellranger_jobinterval'] = config.getint(
            '10xgenomics','cellranger_jobinterval',100)
        # fastq_stats
        self.add_section('fastq_stats')
        self.fastq_stats['nprocessors'] = config.getint(
            'fastq_stats','nprocessors',1)
        # Runners for specific jobs (defaulting to the general
        # runner setting)
        self.add_section('runners')
        job_names = ('bcl2fastq',
                     'qc',
                     'stats',
                     'rsync',
                     'icell8',
                     'icell8_contaminant_filter',
                     'icell8_statistics',)
        for job_name in job_names:
            self.runners[job_name] = config.getrunner('runners',job_name,
                                                      default_runner)
        # Information for archiving analyses
        # dirn should be a directory in the form [[user@]host:]path]
        self.add_section('archive')
        for option in ('dirn','log','group','chmod'):
            self.archive[option] = config.get('archive',option,None)
        # Information for uploading QC reports
        # dirn should be a directory in the form [[user@]host:]path]
        self.add_section('qc_web_server')
        for option in ('dirn','url'):
            self.qc_web_server[option] = config.get('qc_web_server',
                                                    option,None)
        for option in ('use_hierarchy','exclude_zip_files'):
            self.qc_web_server[option] = config.getboolean('qc_web_server',
                                                           option)
Exemplo n.º 15
0
 def args(self):
     """
     Return the parameters that the instance was supplied with

     Returns:
       AttributeDictionary: new dictionary built from the
         contents of the '_callargs' attribute.
     """
     call_args = self._callargs
     return AttributeDictionary(**call_args)