Exemplo n.º 1
0
def report_summary(ap):
    """Generate summary report suitable for bioinformaticians

    Generates a multi-line report which gives general information
    about the run, plus one-line summaries for each project, plus
    any additional information that has been recorded.

    The general information includes:

    - Platform
    - Run name
    - Run reference id
    - Processing software
    - Assay (i.e. sequencing kit)

    For each project:

    - Project subdirectory
    - Researcher (aka user)
    - PI
    - Application (aka library type)
    - Single cell prep platform (e.g. ICell8)
    - Organism
    - Number of samples

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be reported on

    Returns:
      String with the report text.
    """
    # Default items to report
    report_items = [
        'Run name',
        'Reference',
        'Platform',
        'Directory',
        'Endedness',
        'Bcl2fastq',
    ]
    # Gather information
    analysis_dir = analysis.AnalysisDir(ap.analysis_dir)
    # Components of the run name default to None so that they are
    # always defined, even if the run name can't be split
    datestamp = None
    instrument = None
    run_number = None
    run_name = ap.run_name
    try:
        datestamp, instrument, run_number = IlluminaData.split_run_name(
            run_name)
    except Exception as ex:
        # Failure to split the name is reported but is not fatal
        logger.warning("Unable to extract information from run name '%s'" \
                       % run_name)
        logger.warning("Exception: %s" % ex)
    # NOTE(review): this excerpt stops here; the code which assembles
    # and returns the report text is not present
Exemplo n.º 2
0
def run_reference_id(run_name, platform=None, facility_run_number=None):
    """Return a run reference id e.g. 'HISEQ_140701/242#22'

    The run reference code is a code that identifies the sequencing
    run, and has the general form:

    PLATFORM_DATESTAMP[/INSTRUMENT_RUN_NUMBER]#FACILITY_RUN_NUMBER

    - PLATFORM is always uppercased e.g. HISEQ, MISEQ, GA2X
    - DATESTAMP is the YYMMDD code e.g. 140701
    - INSTRUMENT_RUN_NUMBER is the run number that forms part of the
      run directory e.g. for '140701_SN0123_0045_000000000-A1BCD'
      it is '45'
    - FACILITY_RUN_NUMBER is the run number that has been assigned
      by the facility

    Note that the instrument run number is only used if it differs
    from the facility run number.

    If the platform isn't supplied then the instrument name is
    used instead, e.g.:

    'SN0123_140701/242#22'

    If the run name can't be split into components then the
    general form will be:

    [PLATFORM_]RUN_NAME[#FACILITY_RUN_NUMBER]

    depending on whether platform and/or facility run number have
    been supplied. For example for a run called 'rag_05_2017':

    'MISEQ_rag_05_2017#90'

    Arguments:
      run_name (str): the run name (can be a path)
      platform (str): the platform name (optional)
      facility_run_number (int): the run number assigned by the
        local facility (can be different from the instrument
        run number) (optional)
    """
    # Extract information from run name
    # (normalise first so that a trailing separator doesn't result
    # in an empty basename)
    run_name = os.path.basename(os.path.normpath(run_name))
    try:
        datestamp, instrument, run_number = IlluminaData.split_run_name(
            run_name)
    except Exception as ex:
        logger.warning("Unable to extract information from run name '%s'" \
                       % run_name)
        logger.warning("Exception: %s" % ex)
        # Reset all three components using the same names that are
        # bound on success, so none of them is left undefined
        instrument = None
        datestamp = None
        run_number = None
    # NOTE(review): this excerpt stops here; the code which builds
    # the reference id from these components is not present
Exemplo n.º 3
0
def report_projects(ap):
    """Generate one line reports suitable for pasting into spreadsheet

    Generate one-line report for each project with tab-separated
    data items, suitable for injection into a spreadsheet.

    Each line has the following information:

    - Run id e.g. HISEQ_140328
    - Run number
    - Source
    - Date
    - User
    - PI
    - Application
    - Single Cell Platform
    - Organism
    - Platform
    - #Samples
    - #Cells
    - PE (yes/no)
    - Samples

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be reported on

    Returns:
      String with the report text.
    """
    # Acquire data
    analysis_dir = utils.AnalysisDir(ap.analysis_dir)
    # General information
    run_name = ap.run_name
    try:
        datestamp, instrument, run_number = IlluminaData.split_run_name(
            run_name)
        # Drop the zero padding from the instrument run number
        run_number = run_number.lstrip('0')
    except Exception as ex:
        logger.warning("Unable to extract information from run name '%s'" \
                       % run_name)
        logger.warning("Exception: %s" % ex)
        # Fall back to empty values, using the same names that are
        # bound on success so 'datestamp' is never left undefined
        datestamp = ''
        run_number = ''
    # NOTE(review): this excerpt stops here; the code which builds
    # and returns the per-project lines is not present
def fetch_value(ap, project, field):
    """
    Look up the reporting value for a named field

    Maps a field name onto a value taken from the supplied
    AutoProcessor and AnalysisProject instances. Several
    fields can be requested using more than one name (for
    example 'PI' or 'pi'). Values which are unset (i.e.
    evaluate as false) are returned as empty strings for
    most fields.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be reported on
      project (AnalysisProject): project to report on
      field (str): field name to return value of

    Returns:
      String: value of supplied field.

    Raises:
      KeyError: if the field name is not recognised.
    """
    # Project metadata (None if 'project' has no 'info' attribute)
    info = getattr(project, 'info', None)
    # Run-level fields
    if field == 'datestamp':
        return IlluminaData.split_run_name(ap.run_name)[0]
    if field == 'run_id':
        return ap.run_reference_id
    if field == 'run_number':
        run_number = ap.metadata.run_number
        return str(run_number) if run_number else ''
    if field in ('source', 'data_source'):
        return ap.metadata.source or ''
    if field in ('analysis_dir', 'path'):
        return ap.params.analysis_dir
    # Project-level fields
    if field in ('project', 'project_name'):
        return project.name
    if field == 'user':
        return info.user or ''
    if field in ('PI', 'pi'):
        return info.PI or ''
    if field in ('application', 'library_type'):
        return info.library_type or ''
    if field == 'single_cell_platform':
        return info.single_cell_platform or ''
    if field == 'organism':
        return info.organism or ''
    # Sequencer information
    if field in ('sequencer_platform', 'platform'):
        platform = ap.metadata.platform
        return str(platform).upper() if platform else ''
    if field == 'sequencer_model':
        return ap.metadata.sequencer_model or ''
    # Sample and cell information
    if field in ('no_of_samples', '#samples'):
        return str(len(project.samples))
    if field in ('no_of_cells', '#cells'):
        n_cells = info.number_of_cells
        return str(n_cells) if n_cells else ''
    if field == 'paired_end':
        return 'yes' if ap.paired_end else 'no'
    if field in ('sample_names', 'samples'):
        return project.prettyPrintSamples()
    # Placeholder fields
    if field in ('null', ''):
        return ''
    # Anything else is an error
    raise KeyError("'%s': unrecognised field for reporting" % field)
def report_summary(ap):
    """Generate summary report suitable for bioinformaticians

    Builds a multi-line plain text report consisting of a
    title, a block of general information about the run, a
    one-line summary for each project, and any comments that
    have been recorded against the projects.

    The general information block reports:

    - Run name
    - Run reference id
    - Platform
    - Sequencer model
    - Analysis directory
    - Endedness (paired or single end)
    - Processing software (bcl2fastq, plus cellranger and
      cellranger-atac if they appear in the run metadata)

    Each project line reports:

    - Project name
    - Researcher (aka user)
    - Organism
    - Application (aka library type), followed by the single
      cell prep platform (e.g. ICell8) in brackets if set
    - Number of samples (and number of cells, if set)
    - PI

    If the analysis directory has no projects then the projects
    in the data returned by 'ap.load_illumina_data()' are listed
    instead.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be reported on

    Returns:
      String with the report text.
    """
    # Items for the general information block, in reporting order
    report_items = ['Run name', 'Reference', 'Platform', 'Sequencer',
                    'Directory', 'Endedness', 'Bcl2fastq']
    # Items which report a software package; the lowercased item
    # name is the key into the processing software dictionary
    software_items = ('Bcl2fastq', 'Cellranger', 'Cellranger-Atac')
    # Gather information
    analysis_dir = analysis.AnalysisDir(ap.analysis_dir)
    run_name = ap.run_name
    datestamp = instrument = run_number = None
    try:
        datestamp, instrument, run_number = \
            IlluminaData.split_run_name(run_name)
    except Exception as ex:
        logger.warning("Unable to extract information from run name '%s'" \
                       % run_name)
        logger.warning("Exception: %s" % ex)
    platform = ap.metadata.platform
    platform = platform.upper() if platform is not None else 'unknown'
    # Run number from the metadata takes precedence over the one
    # extracted from the run name
    metadata_run_number = ap.metadata.run_number
    if metadata_run_number is not None:
        run_number = metadata_run_number
    # Processing software information
    try:
        processing_software = ast.literal_eval(ap.metadata.processing_software)
    except ValueError:
        processing_software = {}
    if not processing_software:
        # Fallback to legacy metadata items
        legacy_items = (('bcl2fastq', 'bcl2fastq_software'),
                        ('cellranger', 'cellranger_software'))
        for pkg, legacy_item in legacy_items:
            try:
                processing_software[pkg] = ast.literal_eval(
                    getattr(ap.metadata, legacy_item))
            except ValueError:
                pass
    # Only report the cellranger packages if they were used
    report_items.extend(pkg.title()
                        for pkg in ('cellranger', 'cellranger-atac')
                        if pkg in processing_software)
    # Report header: title underlined with '=' characters
    if not (datestamp and instrument and run_number):
        title = "%s" % os.path.basename(ap.analysis_dir)
    else:
        title = "%s run #%s datestamped %s" % (platform, run_number, datestamp)
    report = ["%s\n%s" % (title, '=' * len(title))]
    # General information
    # Values for the non-software items are fetched lazily, so each
    # one is only evaluated when its item is actually reported
    simple_values = {
        'Run name': lambda: run_name,
        'Reference': lambda: ap.run_reference_id,
        'Platform': lambda: platform,
        'Sequencer': lambda: ap.metadata.sequencer_model,
        'Directory': lambda: ap.params.analysis_dir,
        'Endedness': lambda: ('Paired end' if analysis_dir.paired_end
                              else 'Single end'),
    }
    field_width = max(len(name) for name in report_items)
    for item in report_items:
        if item in simple_values:
            value = simple_values[item]()
        elif item in software_items:
            pkg = item.lower()
            if pkg in processing_software:
                value = "%s %s" % (processing_software[pkg][1],
                                   processing_software[pkg][2])
            else:
                value = 'Unknown'
        else:
            raise Exception("Unknown reporting item '%s'" % item)
        # Pad the item names so that the values line up
        report.append("%s: %s" % (item.ljust(field_width), value))
    report.append("")
    # Projects
    rows = []
    comments = bcf_utils.OrderedDictionary()
    n_projects = analysis_dir.n_projects
    if n_projects == 0:
        # No projects - try loading data from unaligned dir
        try:
            illumina_data = ap.load_illumina_data()
            report.append("No projects found; '%s' directory contains "
                          "the following data:\n" % ap.params.unaligned_dir)
            for project in illumina_data.projects:
                n_samples = len(project.samples)
                rows.append(("- '%s':" % project.name,
                             "%s sample%s" % (n_samples,
                                              's' if n_samples != 1 else '')))
            report.append(utils.pretty_print_rows(rows))
        except IlluminaData.IlluminaDataError as ex:
            report.append("No projects found")
    else:
        report.append("%d project%s:" % (n_projects,
                                         '' if n_projects == 1 else 's'))
        data_items = ('user', 'PI', 'library_type', 'single_cell_platform',
                      'number_of_cells', 'organism')
        for project in analysis_dir.projects:
            project_data = dict(project=project.name)
            for item in data_items:
                value = project.info[item]
                # '.' and '?' are reported as unspecified values
                if value in ('.', '?'):
                    value = '<unspecified %s>' % item.lower()
                project_data[item] = value
            # Library type, with single cell platform if set
            library = project_data['library_type']
            sc_platform = project_data['single_cell_platform']
            if sc_platform is not None:
                library += " (%s)" % sc_platform
            # Number of samples, with number of cells if set
            n_samples = len(project.samples)
            samples = "%d sample%s" % (n_samples,
                                       's' if n_samples != 1 else '')
            n_cells = project_data['number_of_cells']
            if n_cells is not None:
                n_cells = int(n_cells)
                samples += "/%d cell%s" % (n_cells,
                                           's' if n_cells != 1 else '')
            rows.append(("- '%s':" % project_data['project'],
                         project_data['user'],
                         project_data['organism'],
                         library,
                         samples,
                         "(PI %s)" % project_data['PI']))
            # Keep any comments for reporting after the projects
            if project.info.comments:
                comments[project.name] = project.info.comments
        report.append(utils.pretty_print_rows(rows))
    # Additional comments/notes
    if comments:
        width = max([len(name) for name in comments])
        report.append("")
        report.append("Additional notes/comments:")
        for name in comments:
            lines = bcf_utils.split_into_lines(comments[name], 70 - width)
            for idx, line in enumerate(lines):
                if idx == 0:
                    # First line carries the padded project name
                    report.append("- %s: %s" % (name.ljust(width), line))
                else:
                    # Continuation lines are indented to match
                    report.append("  %s  %s" % (' ' * width, line))
    return '\n'.join(report)
Exemplo n.º 6
0
    def __init__(self, analysis_dir):
        """Create a new AnalysisDir instance for a specified directory

        Loads the run metadata (falling back to the parameter file
        if the metadata file can't be loaded) and the projects
        metadata, determines the run name, and then classifies each
        subdirectory as sequencing data, a project directory, the
        'undetermined' directory, or an extra (unidentified)
        directory.

        Arguments:
          analysis_dir: name (and path) to analysis directory

        Raises:
          Exception: if neither the metadata file nor the
            parameter file could be loaded (the exception from
            loading the parameter file is re-raised).

        """
        # Store location
        self._analysis_dir = os.path.abspath(analysis_dir)
        self._name = os.path.basename(analysis_dir)
        self._bcl2fastq_dirs = []
        self._project_dirs = []
        self._extra_dirs = []
        self.sequencing_data = []
        self.projects = []
        self.undetermined = None
        # Metadata
        self.metadata = AnalysisDirMetadata()
        # Path is built outside the 'try' so that it is always
        # defined when the failure is logged
        metadata_file = os.path.join(self._analysis_dir, "metadata.info")
        try:
            self.metadata.load(metadata_file)
        except Exception as ex:
            logger.warning("Failed to load metadata file %s: %s" %
                           (metadata_file, ex))
            logger.warning("Attempting to load parameter file")
            try:
                params = AnalysisDirParameters()
                parameter_file = os.path.join(self._analysis_dir,
                                              "auto_process.info")
                params.load(parameter_file, strict=False)
                # Attempt to acquire values from parameters
                for param in ('platform', 'run_number', 'source', 'assay'):
                    if param not in params:
                        print("-- %s: missing" % param)
                        continue
                    print("-- %s: setting to '%s'" % (param, params[param]))
                    self.metadata[param] = params[param]
            except Exception as ex:
                # No parameter file either
                logger.warning("Failed to load parameters: %s" % ex)
                logger.warning("Perhaps this is not an auto_process project?")
                # Bare 'raise' keeps the original traceback
                raise
        # Projects metadata
        try:
            self.projects_metadata = ProjectMetadataFile(
                os.path.join(self._analysis_dir, "projects.info"))
        except Exception as ex:
            logger.warning("Failed to load projects metadata: %s" % ex)
            self.projects_metadata = None
        # Run name
        try:
            self.run_name = self.metadata.run
        except AttributeError:
            # No run in the metadata: use the directory path with
            # its last len('_analysis') characters removed (this is
            # done regardless of what those characters are)
            self.run_name = self._analysis_dir[0:-len('_analysis')]
        self.run_name = os.path.basename(self.run_name)
        # NOTE(review): an exception from split_run_name is not
        # handled here and would propagate to the caller
        self.date_stamp,\
            self.instrument_name,\
            self.instrument_run_number = IlluminaData.split_run_name(
                self.run_name)
        # Look for outputs from bclToFastq and analysis projects
        logger.debug("Examining subdirectories of %s" % self._analysis_dir)
        for dirn in bcf_utils.list_dirs(self._analysis_dir):
            # Look for sequencing data
            try:
                data = IlluminaData.IlluminaData(self._analysis_dir,
                                                 unaligned_dir=dirn)
                logger.debug("- %s: sequencing data" % dirn)
                self._bcl2fastq_dirs.append(dirn)
                self.sequencing_data.append(data)
                continue
            except IlluminaData.IlluminaDataError:
                # Not sequencing data: go on to check if it's a project
                pass
            except Exception as ex:
                logger.warning("Exception when attempting to load "
                               "subdir '%s' as CASAVA/bcl2fastq output "
                               "(ignored): %s" % (dirn, ex))
            # Look for analysis data
            data = AnalysisProject(dirn, os.path.join(self._analysis_dir,
                                                      dirn))
            if data.is_analysis_dir:
                if dirn == 'undetermined':
                    logger.debug("- %s: undetermined indexes" % dirn)
                    self.undetermined = data
                else:
                    # Check against projects.info, if possible
                    try:
                        if not self.projects_metadata.lookup('Project', dirn):
                            logger.debug("- %s: not in projects.info" % dirn)
                            self._extra_dirs.append(dirn)
                            continue
                    except AttributeError:
                        # Projects metadata is None when it couldn't be
                        # loaded, in which case the check is skipped
                        pass
                    logger.debug("- %s: project directory" % dirn)
                    self._project_dirs.append(dirn)
                    self.projects.append(data)
                continue
            else:
                # Unidentified contents
                self._extra_dirs.append(dirn)
                logger.debug("- %s: unknown" % dirn)