Exemplo n.º 1
0
    def test_set_cell_count_fails_for_project_with_no_metadata(self):
        """
        set_cell_count_for_project: raises exception for project with no metadata
        """
        # Mock project created with both metadata arguments set to None
        mock_project = self._make_mock_analysis_project(None, None)
        # Populate the cellranger count 'outs' directory with a
        # metrics_summary.csv file
        outs_dir = os.path.join(
            mock_project, "qc", "cellranger_count", "5.0.1",
            "refdata-gex-GRCh38-2020-A", "PJB1", "outs")
        mkdirs(outs_dir)
        with open(os.path.join(outs_dir, "metrics_summary.csv"), 'wt') as out:
            out.write(METRICS_SUMMARY)
        # Write the QC info file recording reference data and version
        qc_info_text = (
            "Cellranger reference datasets\t/data/refdata-gex-GRCh38-2020-A\n"
            "Cellranger version\t5.0.1\n")
        with open(os.path.join(mock_project, "qc", "qc.info"), 'wt') as out:
            out.write(qc_info_text)
        # No cell count should be stored before the update is attempted
        print("Checking number of cells")
        cells_before = AnalysisProject("PJB1", mock_project).info.number_of_cells
        self.assertEqual(cells_before, None)
        # The update is expected to fail with NotImplementedError
        with self.assertRaises(NotImplementedError):
            set_cell_count_for_project(mock_project)
        # The failed update must leave the stored cell count untouched
        cells_after = AnalysisProject("PJB1", mock_project).info.number_of_cells
        self.assertEqual(cells_after, None)
Exemplo n.º 2
0
    def test_set_cell_count_project_missing_library_type(self):
        """
        set_cell_count_for_project: test for scRNA-seq when library not set
        """
        # Mock project where the library type argument is None
        mock_project = self._make_mock_analysis_project(
            "10xGenomics Chromium 3'v3", None)
        # Populate the cellranger count 'outs' directory with a
        # metrics_summary.csv file
        outs_dir = os.path.join(
            mock_project, "qc", "cellranger_count", "5.0.1",
            "refdata-gex-GRCh38-2020-A", "PJB1", "outs")
        mkdirs(outs_dir)
        with open(os.path.join(outs_dir, "metrics_summary.csv"), 'w') as out:
            out.write(METRICS_SUMMARY)
        # Write the QC info file recording reference data and version
        qc_info_text = (
            "Cellranger reference datasets\t/data/refdata-gex-GRCh38-2020-A\n"
            "Cellranger version\t5.0.1\n")
        with open(os.path.join(mock_project, "qc", "qc.info"), 'wt') as out:
            out.write(qc_info_text)
        # No cell count should be stored before the update
        print("Checking number of cells")
        cells_before = AnalysisProject("PJB1", mock_project).info.number_of_cells
        self.assertEqual(cells_before, None)
        # Run the update
        print("Updating number of cells")
        set_cell_count_for_project(mock_project)
        # The stored cell count should now match the metrics summary
        cells_after = AnalysisProject("PJB1", mock_project).info.number_of_cells
        self.assertEqual(cells_after, 2272)
Exemplo n.º 3
0
    def test_set_cell_count_for_multiome_gex_project(self):
        """
        set_cell_count_for_project: test for single cell multiome GEX
        """
        # Mock single cell multiome project with GEX library type
        mock_project = self._make_mock_analysis_project(
            "10xGenomics Single Cell Multiome", "GEX")
        # Populate the 'outs' directory with a summary.csv file
        outs_dir = os.path.join(
            mock_project, "qc", "cellranger_count", "1.0.0",
            "refdata-cellranger-arc-GRCh38-2020-A", "PJB1", "outs")
        mkdirs(outs_dir)
        with open(os.path.join(outs_dir, "summary.csv"), 'w') as out:
            out.write(MULTIOME_SUMMARY)
        # Write the QC info file recording reference data and version
        qc_info_text = (
            "Cellranger reference datasets\t"
            "/data/refdata-cellranger-arc-GRCh38-2020-A\n"
            "Cellranger version\t1.0.0\n")
        with open(os.path.join(mock_project, "qc", "qc.info"), 'wt') as out:
            out.write(qc_info_text)
        # No cell count should be stored before the update
        print("Checking number of cells")
        cells_before = AnalysisProject("PJB1", mock_project).info.number_of_cells
        self.assertEqual(cells_before, None)
        # Run the update
        print("Updating number of cells")
        set_cell_count_for_project(mock_project)
        # The stored cell count should now match the summary file
        cells_after = AnalysisProject("PJB1", mock_project).info.number_of_cells
        self.assertEqual(cells_after, 744)
Exemplo n.º 4
0
    def test_set_cell_count_for_atac_project_2_0_0(self):
        """
        set_cell_count_for_project: test for scATAC-seq (Cellranger ATAC 2.0.0)
        """
        # Mock single cell ATAC project
        mock_project = self._make_mock_analysis_project(
            "10xGenomics Single Cell ATAC", "scATAC-seq")
        # Populate the 'outs' directory with a summary.csv file
        outs_path_parts = (
            "qc", "cellranger_count", "2.0.0",
            "refdata-cellranger-atac-GRCh38-2020-A-2.0.0", "PJB1", "outs")
        outs_dir = os.path.join(mock_project, *outs_path_parts)
        mkdirs(outs_dir)
        with open(os.path.join(outs_dir, "summary.csv"), 'w') as out:
            out.write(ATAC_SUMMARY_2_0_0)
        # Write the QC info file recording reference data and version
        qc_info_text = (
            "Cellranger reference datasets\t"
            "/data/refdata-cellranger-atac-GRCh38-2020-A-2.0.0\n"
            "Cellranger version\t2.0.0\n")
        with open(os.path.join(mock_project, "qc", "qc.info"), 'wt') as out:
            out.write(qc_info_text)
        # No cell count should be stored before the update
        print("Checking number of cells")
        cells_before = AnalysisProject("PJB1", mock_project).info.number_of_cells
        self.assertEqual(cells_before, None)
        # Run the update
        print("Updating number of cells")
        set_cell_count_for_project(mock_project)
        # The stored cell count should now match the summary file
        cells_after = AnalysisProject("PJB1", mock_project).info.number_of_cells
        self.assertEqual(cells_after, 3582)
Exemplo n.º 5
0
    def test_set_cell_count_for_cellplex_project(self):
        """
        set_cell_count_for_project: test for multiplexed data (CellPlex)
        """
        # Mock project with CellPlex library type
        mock_project = self._make_mock_analysis_project(
            "10xGenomics Chromium 3'v3", "CellPlex")
        # Mock 'cellranger multi' outputs with one subdirectory per
        # multiplexed sample
        multi_outs = os.path.join(
            mock_project, "qc", "cellranger_multi", "6.0.0",
            "refdata-cellranger-gex-GRCh38-2020-A", "outs")
        mkdirs(multi_outs)
        # Files written for each sample, as (name, content) pairs
        per_sample_files = (
            ("metrics_summary.csv", CELLPLEX_METRICS_SUMMARY),
            ("web_summary.html", "Placeholder for web_summary.html\n"),
        )
        for name in ("PBA", "PBB"):
            per_sample_dir = os.path.join(multi_outs, "per_sample_outs", name)
            mkdirs(per_sample_dir)
            for filename, content in per_sample_files:
                with open(os.path.join(per_sample_dir, filename), 'wt') as out:
                    out.write(content)
        # Write the QC info file recording reference data and version
        qc_info_text = (
            "Cellranger reference datasets\t"
            "/data/refdata-cellranger-gex-GRCh38-2020-A\n"
            "Cellranger version\t6.0.0\n")
        with open(os.path.join(mock_project, "qc", "qc.info"), 'wt') as out:
            out.write(qc_info_text)
        # No cell count should be stored before the update
        print("Checking number of cells")
        cells_before = AnalysisProject("PJB1", mock_project).info.number_of_cells
        self.assertEqual(cells_before, None)
        # Run the update
        print("Updating number of cells")
        set_cell_count_for_project(mock_project)
        # The stored cell count should now reflect the per-sample metrics
        cells_after = AnalysisProject("PJB1", mock_project).info.number_of_cells
        self.assertEqual(cells_after, 10350)
Exemplo n.º 6
0
 def test_set_cell_count_project_missing_library_type_no_subdirs(self):
     """
     set_cell_count_for_project: test for scRNA-seq when library not set (old-style output)
     """
     # Mock project where the library type argument is None
     mock_project = self._make_mock_analysis_project(
         "10xGenomics Chromium 3'v3", None)
     # Populate an 'outs' directory sitting directly under
     # 'cellranger_count/<sample>' (no version or reference
     # subdirectories) with a metrics_summary.csv file
     outs_dir = os.path.join(
         mock_project, "qc", "cellranger_count", "PJB1", "outs")
     mkdirs(outs_dir)
     with open(os.path.join(outs_dir, "metrics_summary.csv"), 'w') as out:
         out.write(METRICS_SUMMARY)
     # No cell count should be stored before the update
     print("Checking number of cells")
     cells_before = AnalysisProject("PJB1", mock_project).info.number_of_cells
     self.assertEqual(cells_before, None)
     # Run the update
     print("Updating number of cells")
     set_cell_count_for_project(mock_project)
     # The stored cell count should now match the metrics summary
     cells_after = AnalysisProject("PJB1", mock_project).info.number_of_cells
     self.assertEqual(cells_after, 2272)
def build_fastq_path_dir(project_dir):
    """
    Create directory mimicking output from cellranger mkfastq

    This function creates and populates a 'cellranger mkfastq'
    style 'fastq_path' directory from an autoprocess analysis
    project, which can then be used as input to 'cellranger
    count'.

    The new directory will be called 'cellranger_fastq_path'
    and will be created in the project directory. It contains
    empty 'Reports' and 'Stats' subdirectories, plus a
    subdirectory named after the project which is populated
    with relative symbolic links to the Fastq files in the
    project.

    Link names that already exist (including dangling
    symbolic links) are left untouched and a warning is
    logged for each one.

    Arguments:
      project_dir (str): path to the project directory in
        which to create the 'fastq_path' directory

    Returns:
      String: path to the 'cellranger_fastq_path' directory.
    """
    project = AnalysisProject(os.path.basename(project_dir.rstrip(os.sep)),
                              os.path.abspath(project_dir))
    fastq_path_dir = os.path.join(project.dirn,
                                  "cellranger_fastq_path")
    fq_dir = os.path.join(fastq_path_dir, project.name)
    # Build the directory skeleton
    for dirn in (fastq_path_dir,
                 os.path.join(fastq_path_dir, "Reports"),
                 os.path.join(fastq_path_dir, "Stats"),
                 fq_dir):
        mkdirs(dirn)
    for fastq in project.fastqs:
        # Single-argument parenthesised form behaves the same
        # under Python 2 and Python 3
        print(fastq)
        link_name = os.path.join(fq_dir, os.path.basename(fastq))
        # lexists() also detects dangling symlinks, which exists()
        # reports as missing and which would then make symlink()
        # fail with "file exists"
        if os.path.lexists(link_name):
            logger.warning("%s: already exists", link_name)
            continue
        # Link relative to the directory holding the link
        target = os.path.relpath(fastq, fq_dir)
        logger.debug("Linking: %s -> %s", link_name, target)
        os.symlink(target, link_name)
    return fastq_path_dir
def cellranger_mkfastq(samplesheet,
                       primary_data_dir,
                       output_dir,
                       lanes=None,
                       cellranger_jobmode='local',
                       cellranger_maxjobs=None,
                       cellranger_mempercore=None,
                       cellranger_jobinterval=None,
                       cellranger_localcores=None,
                       cellranger_localmem=None,
                       log_dir=None,
                       dry_run=False,
                       project_metadata_file='projects.info'):
    """
    Wrapper for running 'cellranger mkfastq'

    Delegates to 'run_cellranger_mkfastq' to generate Fastqs
    from bcl files for 10xGenomics Chromium single-cell data
    and, unless this is a dry run, calls
    'update_project_metadata' on the output directory
    afterwards.

    Arguments:
      samplesheet (str): path to input samplesheet with
        10xGenomics barcode indices
      primary_data_dir (str): path to the top-level
        directory holding the sequencing data
      output_dir (str): path to the output directory
      lanes (str): optional, specify the subset of lanes
        to process (default is to process all lanes
        in the run)
      cellranger_jobmode (str): specify the job mode to
        pass to cellranger (default: "local")
      cellranger_maxjobs (int): specify the maximum
        number of jobs to pass to cellranger (default:
        None)
      cellranger_mempercore (int): specify the memory
        per core (in Gb) to pass to cellranger (default:
        None)
      cellranger_jobinterval (int): specify the interval
        between launching jobs (in ms) to pass to
        cellranger (default: None)
      cellranger_localcores (int): maximum number of cores
        cellranger can request in jobmode 'local'
        (accepted but not currently forwarded, see the
        review note in the body)
      cellranger_localmem (int): maximum memory cellranger
        can request in jobmode 'local' (accepted but not
        currently forwarded, see the review note in the
        body)
      log_dir (str): path to the parent directory for
        logs (default: current working directory); logs
        are written to a subdirectory obtained from
        'get_numbered_subdir'. For dry runs the value is
        passed through unchanged
      dry_run (bool): if True then only report actions
        that would be performed but don't run anything
      project_metadata_file (str): name of project
        metadata file to create/update with information
        on projects generated by cellranger (default:
        projects.info)

    Returns:
      Integer: value returned by 'run_cellranger_mkfastq'
        (the exit code from the cellranger command).
    """
    # Real runs get their own log subdirectory; dry runs don't
    # create anything
    if not dry_run:
        log_parent = os.getcwd() if log_dir is None else log_dir
        log_dir = get_numbered_subdir("cellranger_mkfastq",
                                      parent_dir=log_parent,
                                      full_path=True)
        mkdirs(log_dir)
    # Options handed on to the underlying runner
    # NOTE(review): cellranger_localcores and cellranger_localmem
    # are not forwarded here - confirm whether
    # run_cellranger_mkfastq accepts them
    runner_options = dict(
        lanes=lanes,
        cellranger_jobmode=cellranger_jobmode,
        cellranger_maxjobs=cellranger_maxjobs,
        cellranger_mempercore=cellranger_mempercore,
        cellranger_jobinterval=cellranger_jobinterval,
        log_dir=log_dir,
        dry_run=dry_run)
    exit_status = run_cellranger_mkfastq(samplesheet,
                                         primary_data_dir,
                                         output_dir,
                                         **runner_options)
    if dry_run:
        return exit_status
    # Record the generated projects in the metadata file
    update_project_metadata(output_dir, project_metadata_file)
    return exit_status
Exemplo n.º 9
0
def run_cellranger_count(fastq_dir,
                         transcriptome,
                         cellranger_jobmode='sge',
                         cellranger_maxjobs=None,
                         cellranger_mempercore=None,
                         cellranger_jobinterval=None,
                         max_jobs=4,
                         log_dir=None,
                         dry_run=False,
                         summary_only=True):
    """
    Wrapper for running 'cellranger count'

    Runs the 10xGenomics 'cellranger count' command to
    perform single library analysis on Fastqs from
    Chromium single-cell samples.

    If the supplied 'fastq_dir' is a 'cellranger mkfastq'
    or 'bcl2fastq' output directory then the analysis
    will be run for each of the projects.

    For each sample a job is run in a working directory
    'tmp.cellranger_count.<project>.<sample>' under the
    current directory, and the outputs are then collected
    into '<project>/cellranger_count/<sample>/outs'.
    Samples for which that 'outs' directory already exists
    are skipped (no job is run and nothing is collected).
    Finally an HTML index and a zip archive of the web
    summaries are written into each project directory.

    Arguments:
      fastq_dir (str): path of the 'fastq_path' folder
        from 'cellranger mkfastq', or the output folder
        from 'bcl2fastq' (or with a similar structure),
        or any folder containing Fastq files
      transcriptome (str): path to the cellranger
        compatible transcriptome reference data
        directory
      cellranger_jobmode (str): specify the job mode to
        pass to cellranger (default: "sge")
      cellranger_maxjobs (int): specify the maximum
        number of jobs to pass to cellranger (default:
        None)
      cellranger_mempercore (int): specify the memory
        per core (in Gb) to pass to cellranger (default:
        None)
      cellranger_jobinterval (int): specify the interval
        between launching jobs (in ms) to pass to
        cellranger (default: None)
      max_jobs (int): maximum number of 'cellranger
        count' jobs the scheduler runs concurrently
        (default: 4)
      log_dir (str): path to a directory to write logs
        (default: current working directory)
      dry_run (bool): if True then only report actions
        that would be performed but don't run anything
      summary_only (bool): if True then only collect
        the output 'web_summary.html' and
        'metrics_summary.csv' files, otherwise
        copy all outputs (warning: this can be very
        large)

    Returns:
      Integer: zero on success (or for a dry run); 1 if
        the Fastq data couldn't be loaded or if expected
        summary files are missing; otherwise the sum of
        the exit codes of the cellranger jobs.
    """
    # Input data
    sample_names = {}
    try:
        illumina_data = IlluminaData(os.getcwd(), unaligned_dir=fastq_dir)
        for project in illumina_data.projects:
            sample_names[project.name] = [sample.name
                                          for sample in project.samples]
    except IlluminaDataError:
        logger.critical("Couldn't load data from '%s'", fastq_dir)
        return 1
    # NB single-argument parenthesised prints behave the same
    # under Python 2 and Python 3
    print("Samples: %s" % sample_names)
    projects = list(sample_names)

    # Set up a scheduler
    # NB the original code also built a reporter with custom
    # job_start/job_end templates, but immediately replaced it with
    # this default one; only the effective reporter is kept
    sched_reporter = SchedulerReporter()
    sched = SimpleScheduler(max_concurrent=max_jobs, reporter=sched_reporter)
    sched.start()

    # Make a log directory
    if not dry_run:
        if log_dir is None:
            log_dir = os.getcwd()
        log_dir = get_numbered_subdir("cellranger_count",
                                      parent_dir=log_dir,
                                      full_path=True)
        mkdirs(log_dir)

    # Submit the cellranger count jobs
    jobs = []
    # (project, sample) pairs for which a job was submitted in this
    # run; only these have fresh outputs to collect afterwards
    submitted = set()
    for project in projects:
        print("Project: %s" % project)
        for sample in sample_names[project]:
            print("Sample: %s" % sample)
            # Check if outputs already exist
            count_dir = os.path.abspath(
                os.path.join(project, "cellranger_count", sample, "outs"))
            if os.path.isdir(count_dir):
                print("-- %s: outputs exist, nothing to do" % sample)
                continue
            print("-- %s: setting up cellranger count" % sample)
            # Set up job for this sample
            work_dir = os.path.abspath("tmp.cellranger_count.%s.%s" %
                                       (project, sample))
            cmd = Command("cellranger", "count", "--id", sample, "--fastqs",
                          os.path.abspath(fastq_dir), "--sample", sample,
                          "--transcriptome", transcriptome)
            add_cellranger_args(cmd,
                                jobmode=cellranger_jobmode,
                                mempercore=cellranger_mempercore,
                                maxjobs=cellranger_maxjobs,
                                jobinterval=cellranger_jobinterval)
            print("Running: %s" % cmd)
            if not dry_run:
                # Only create the working directory for real runs so
                # that a dry run leaves the filesystem untouched
                mkdirs(work_dir)
                job = sched.submit(cmd,
                                   name="cellranger_count.%s.%s" %
                                   (project, sample),
                                   log_dir=log_dir,
                                   wd=work_dir)
                jobs.append(job)
                submitted.add((project, sample))
    sched.wait()
    sched.stop()

    # If dry run then stop here
    if dry_run:
        return 0

    # Finished, check the exit status
    retval = sum(job.exit_code for job in jobs)
    if retval != 0:
        logger.critical("One or more jobs finished with non-zero "
                        "exit code")
        return retval

    # Handle outputs
    for project in projects:
        print("Project: %s" % project)
        for sample in sample_names[project]:
            print("Sample: %s" % sample)
            if (project, sample) not in submitted:
                # Sample was skipped above because its outputs were
                # already in place; there is nothing new to collect
                print("-- %s: outputs already present, nothing to "
                      "collect" % sample)
                continue
            # Destination for count output
            sample_dir = os.path.abspath(
                os.path.join(project, "cellranger_count", sample))
            mkdirs(sample_dir)
            count_dir = os.path.join(sample_dir, "outs")
            # Location of the cellranger count outputs
            outs_dir = os.path.join(
                "tmp.cellranger_count.%s.%s" % (project, sample), sample,
                "outs")
            if not summary_only:
                # Collect all outputs
                # NB copytree creates the destination itself and fails
                # if it already exists, so 'outs' must not be made
                # beforehand
                print("Copying contents of %s to %s" % (outs_dir, count_dir))
                shutil.copytree(outs_dir, count_dir)
            else:
                # Only collect the web and csv summaries
                mkdirs(count_dir)
                for f in ("web_summary.html", "metrics_summary.csv"):
                    path = os.path.join(outs_dir, f)
                    if not os.path.exists(path):
                        logger.warning("%s: not found in %s", f, outs_dir)
                        retval = 1
                    else:
                        print("Copying %s from %s to %s" % (f, outs_dir,
                                                            count_dir))
                        shutil.copy(path, count_dir)
                # Stop if there was an error
                if retval != 0:
                    logger.critical("Some cellranger count outputs are "
                                    "missing")
                    return retval

    # Create a report and zip archive for each project
    pwd = os.getcwd()
    analysis_dir = os.path.basename(pwd)
    for project in projects:
        # Descend into project dir
        os.chdir(project)
        try:
            # Set up zip file
            report_name = "cellranger_count_report.%s.%s" % (project,
                                                             analysis_dir)
            zip_file = ZipArchive("%s.zip" % report_name,
                                  prefix=report_name)
            # Construct index page
            print("Making report for project %s" % project)
            count_report = Document("%s: cellranger count" % project)
            count_report.add_css_rule(css_rules.QC_REPORT_CSS_RULES)
            summaries = count_report.add_section()
            summaries.add("Reports from cellranger count for each sample:")
            summary_links = List()
            for sample in sample_names[project]:
                # Link to summary for sample
                web_summary = os.path.join("cellranger_count", sample,
                                           "outs", "web_summary.html")
                print("Adding web summary (%s) for %s" % (web_summary,
                                                          sample))
                summary_links.add_item(Link("%s" % sample, web_summary))
                # Add to the zip file
                zip_file.add_file(web_summary)
            summaries.add(summary_links)
            # Write the report and add to the zip file
            html_file = "cellranger_count_report.html"
            count_report.write(html_file)
            zip_file.add_file(html_file)
            # Finish
            zip_file.close()
        finally:
            # Always return to the starting directory, even if report
            # generation fails part way through
            os.chdir(pwd)
    # Done
    return retval
Exemplo n.º 10
0
def make_fastqs(ap,
                protocol='standard',
                platform=None,
                unaligned_dir=None,
                sample_sheet=None,
                lanes=None,
                ignore_missing_bcl=False,
                ignore_missing_stats=False,
                skip_rsync=False,
                remove_primary_data=False,
                nprocessors=None,
                require_bcl2fastq_version=None,
                bases_mask=None,
                no_lane_splitting=None,
                minimum_trimmed_read_length=None,
                mask_short_adapter_reads=None,
                generate_stats=True,
                stats_file=None,
                per_lane_stats_file=None,
                analyse_barcodes=True,
                barcode_analysis_dir=None,
                skip_fastq_generation=False,
                only_fetch_primary_data=False,
                create_empty_fastqs=None,
                runner=None,
                cellranger_jobmode=None,
                cellranger_mempercore=None,
                cellranger_maxjobs=None,
                cellranger_jobinterval=None,
                cellranger_localcores=None,
                cellranger_localmem=None,
                cellranger_ignore_dual_index=False):
    """Create and summarise FASTQ files

    Wrapper for operations related to FASTQ file generation and analysis.
    The operations are typically:
 
    - get primary data (BCL files)
    - run bcl-to-fastq conversion
    - generate statistics

    If the number of processors and the job runner are not explicitly
    specified then these are taken from the settings for the bcl2fastq
    and the statistics generation steps, which may differ from each other.
    However if either of these values are set explicitly then the same
    values will be used for both steps.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to create Fastqs for
      protocol (str): if set then specifies the protocol to use
        for fastq generation, otherwise use the 'standard' bcl2fastq
        protocol
      platform (str): if set then specifies the sequencing platform
        (otherwise platform will be determined from the primary data)
      unaligned_dir (str): if set then use this as the output directory
        for bcl-to-fastq conversion. Default is 'bcl2fastq' (unless
        an alternative is already specified in the config file)
      sample_sheet (str): if set then use this as the input samplesheet
      lanes (list): (optional) specify a list of lane numbers to
        use in the processing; lanes not in the list will be excluded
        (default is to include all lanes)
      nprocessors (int) : number of processors to run bclToFastq.py with
      ignore_missing_bcl (bool): if True then run bcl2fastq with
        --ignore-missing-bcl
      ignore_missing_stats (bool): if True then run bcl2fastq with
        --ignore-missing-stats
      skip_rsync (bool): if True then don't rsync primary data at the
        start of bcl2fastq conversion
      remove_primary_data (bool): if True then remove primary data at
        the end of bcl2fastq conversion (default is to keep it)
      generate_stats (bool): if True then (re)generate statistics file
        for fastqs
      analyse_barcodes (bool): if True then (re)analyse barcodes for
        fastqs
      require_bcl2fastq_version (str): (optional) specify bcl2fastq
        version to use. Should be a string of the form '1.8.4' or
        '>2.0'. Set to None to automatically determine required
        bcl2fastq version.
      bases_mask (str): if set then use this as an alternative bases
        mask setting
      no_lane_splitting (bool): if True then run bcl2fastq with
        --no-lane-splitting
      minimum_trimmed_read_length (int): if set then specify minimum
        length for reads after adapter trimming (shorter reads will
        be padded with Ns to make them long enough)
      mask_short_adapter_reads (int): if set then specify the minimum
        length of ACGT bases that must be present in a read after
        adapter trimming for it not to be masked completely
        with Ns.
      stats_file (str): if set then use this as the name of the output
        per-fastq stats file.
      per_lane_stats_file (str): if set then use this as the name of
        the output per-lane stats file.
      barcode_analysis_dir (str): if set then specifies path to the
        output directory for barcode analysis
      skip_fastq_generation (bool): if True then don't perform fastq
        generation
      only_fetch_primary_data (bool): if True then fetch primary data,
        don't do anything else
      create_empty_fastqs (bool): if True then create empty 'placeholder'
        fastq files for any missing fastqs after bcl2fastq
        (must have completed with zero exit status)
      runner (JobRunner): (optional) specify a non-default job runner
        to use for fastq generation
      cellranger_jobmode (str): (optional) job mode to run cellranger in
        (10xGenomics Chromium SC data only)
      cellranger_mempercore (int): (optional) memory assumed per core
        (in Gbs) (10xGenomics Chromium SC data only)
      cellranger_maxjobs (int): (optional) maxiumum number of concurrent
         jobs to run (10xGenomics Chromium SC data only)
      cellranger_jobinterval (int): (optional) how often jobs are
         submitted (in ms) (10xGenomics Chromium SC data only)
      cellranger_localcores (int): (optional) maximum number of cores
         cellranger can request in jobmode 'local' (10xGenomics Chromium
         SC data only)
      cellranger_localmem (int): (optional) maximum memory cellranger
         can request in jobmode 'local' (10xGenomics Chromium SC data
         only)
      cellranger_ignore_dual_index (bool): (optional) on a dual-indexed
         flowcell where the second index was not used for the 10x
         sample, ignore it (10xGenomics Chromium SC data only)
    """
    # Report protocol
    print "Protocol              : %s" % protocol
    if protocol not in MAKE_FASTQS_PROTOCOLS:
        raise Exception("Unknown protocol: '%s' (must be one of "
                        "%s)" % (protocol, ','.join([MAKE_FASTQS_PROTOCOLS])))
    # Unaligned dir
    if unaligned_dir is not None:
        ap.params['unaligned_dir'] = unaligned_dir
    elif ap.params['unaligned_dir'] is None:
        ap.params['unaligned_dir'] = 'bcl2fastq'
    print "Output dir            : %s" % ap.params.unaligned_dir
    # Sample sheet
    if sample_sheet is None:
        sample_sheet = ap.params.sample_sheet
    if not os.path.isabs(sample_sheet):
        sample_sheet = os.path.join(ap.analysis_dir, sample_sheet)
    if not os.path.isfile(sample_sheet):
        raise Exception("Missing sample sheet '%s'" % sample_sheet)
    ap.params['sample_sheet'] = sample_sheet
    print "Source sample sheet   : %s" % ap.params.sample_sheet
    # Check requested lanes are actually present
    print "Lanes                 : %s" % ('all' if lanes is None else ','.join(
        [str(l) for l in lanes]))
    if lanes is not None:
        s = IlluminaData.SampleSheet(ap.params.sample_sheet)
        if not s.has_lanes:
            raise Exception("Requested subset of lanes but "
                            "samplesheet doesn't contain any "
                            "lane information")
        samplesheet_lanes = list(set([l['Lane'] for l in s]))
        for l in lanes:
            if l not in samplesheet_lanes:
                raise Exception("Requested lane '%d' not present "
                                "in samplesheet" % l)
    # Make a temporary sample sheet
    if lanes:
        lanes_id = ".L%s" % ''.join([str(l) for l in lanes])
    else:
        lanes_id = ""
    sample_sheet = os.path.join(
        ap.tmp_dir,
        "SampleSheet%s.%s.csv" % (lanes_id, time.strftime("%Y%m%d%H%M%S")))
    make_custom_sample_sheet(ap.params.sample_sheet, sample_sheet, lanes=lanes)
    # Check the temporary sample sheet
    print "Checking temporary sample sheet"
    invalid_barcodes = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_barcodes()
    if invalid_barcodes:
        logger.error("Invalid barcodes detected")
        for line in invalid_barcodes:
            logger.critical("%s" % line)
    invalid_characters = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_characters()
    if invalid_characters:
        logger.critical("Invalid non-printing/non-ASCII characters "
                        "detected")
    if invalid_barcodes or invalid_characters:
        raise Exception("Errors detected in generated sample sheet")
    # Adjust verification settings for 10xGenomics Chromium SC
    # data if necessary
    verify_include_sample_dir = False
    if has_chromium_sc_indices(sample_sheet):
        if protocol in (
                '10x_chromium_sc',
                '10x_chromium_sc_atac',
        ):
            # Force inclusion of sample-name subdirectories
            # when verifying Chromium SC data
            print "Sample sheet includes Chromium SC indices"
            verify_include_sample_dir = True
        else:
            # Chromium SC indices detected but not using
            # 10x_chromium_sc protocol
            raise Exception("Detected 10xGenomics Chromium SC indices "
                            "in generated sample sheet but protocol "
                            "'%s' has been specified; use an "
                            "appropriate '10x_...' protocol for these "
                            "indices" % protocol)
    # Check for pre-existing Fastq outputs
    if verify_fastq_generation(ap,
                               unaligned_dir=ap.params.unaligned_dir,
                               lanes=lanes,
                               include_sample_dir=verify_include_sample_dir):
        print "Expected Fastq outputs already present"
        skip_rsync = True
        skip_fastq_generation = True
    # Check if there's anything to do
    if (skip_rsync and skip_fastq_generation) and \
       not (generate_stats or analyse_barcodes):
        print "Nothing to do"
        return
    # Log dir
    log_dir = 'make_fastqs'
    if protocol != 'standard':
        log_dir += "_%s" % protocol
    if lanes:
        log_dir += "_L%s" % ''.join([str(l) for l in sorted(lanes)])
    ap.set_log_dir(ap.get_log_subdir(log_dir))
    # Fetch primary data
    if not skip_rsync and not ap.params.acquired_primary_data:
        if get_primary_data(ap) != 0:
            logger.error("Failed to acquire primary data")
            raise Exception("Failed to acquire primary data")
        else:
            ap.params['acquired_primary_data'] = True
    if only_fetch_primary_data:
        return
    # Deal with platform information
    if not platform:
        platform = ap.metadata.platform
    # Do fastq generation using the specified protocol
    if not skip_fastq_generation:
        # Set primary data location and report info
        primary_data_dir = os.path.join(ap.params.primary_data_dir,
                                        os.path.basename(ap.params.data_dir))
        print "Primary data dir      : %s" % primary_data_dir
        try:
            illumina_run = IlluminaData.IlluminaRun(primary_data_dir,
                                                    platform=platform)
        except IlluminaData.IlluminaDataPlatformError as ex:
            logger.critical("Error loading primary data: %s" % ex)
            if platform is None:
                logger.critical("Try specifying platform using --platform?")
            else:
                logger.critical("Check specified platform is valid (or "
                                "omit --platform")
            raise Exception("Error determining sequencer platform")
        print "Platform              : %s" % illumina_run.platform
        print "Bcl format            : %s" % illumina_run.bcl_extension
        # Set platform in metadata
        ap.metadata['platform'] = illumina_run.platform
        # Bases mask
        if bases_mask is not None:
            ap.params['bases_mask'] = bases_mask
        bases_mask = ap.params.bases_mask
        print "Bases mask setting    : %s" % bases_mask
        if protocol not in (
                '10x_chromium_sc',
                '10x_chromium_sc_atac',
        ):
            if bases_mask == "auto":
                print "Determining bases mask from RunInfo.xml"
                bases_mask = get_bases_mask(illumina_run.runinfo_xml,
                                            sample_sheet)
                if not bases_mask_is_valid(bases_mask):
                    raise Exception("Invalid bases mask: '%s'" % bases_mask)
        # Do fastq generation according to protocol
        if protocol == 'icell8':
            # ICell8 data
            # Update bcl2fastq settings appropriately
            print "Updating read trimming and masking for ICell8"
            minimum_trimmed_read_length = 21
            mask_short_adapter_reads = 0
            # Reset the default bases mask
            bases_mask = IlluminaData.IlluminaRunInfo(
                illumina_run.runinfo_xml).bases_mask
            bases_mask = get_icell8_bases_mask(bases_mask,
                                               sample_sheet=sample_sheet)
            if not bases_mask_is_valid(bases_mask):
                raise Exception("Invalid bases mask: '%s'" % bases_mask)
            # Switch to standard protocol
            protocol = 'standard'
        if protocol == 'standard':
            # Standard protocol
            try:
                exit_code = bcl_to_fastq(
                    ap,
                    unaligned_dir=ap.params.unaligned_dir,
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    require_bcl2fastq=require_bcl2fastq_version,
                    bases_mask=bases_mask,
                    ignore_missing_bcl=ignore_missing_bcl,
                    ignore_missing_stats=ignore_missing_stats,
                    no_lane_splitting=no_lane_splitting,
                    minimum_trimmed_read_length=minimum_trimmed_read_length,
                    mask_short_adapter_reads=mask_short_adapter_reads,
                    nprocessors=nprocessors,
                    runner=runner)
            except Exception as ex:
                raise Exception("Bcl2fastq stage failed: '%s'" % ex)
        elif protocol == '10x_chromium_sc':
            # 10xGenomics Chromium SC
            if bases_mask == 'auto':
                bases_mask = None
            try:
                # Check we have cellranger
                cellranger = find_program('cellranger')
                if not cellranger:
                    raise Exception("No cellranger package found")
                cellranger_software_info = cellranger_info(cellranger)
                print "Using cellranger %s: %s" % \
                    (cellranger_software_info[-1],
                     cellranger)
                # Check we have bcl2fastq
                bcl2fastq = find_program('bcl2fastq')
                if not bcl2fastq:
                    raise Exception("No bcl2fastq package found")
                bcl2fastq = available_bcl2fastq_versions(
                    paths=(os.path.dirname(bcl2fastq), ), reqs='>=2.17')
                if not bcl2fastq:
                    raise Exception("No appropriate bcl2fastq software "
                                    "located")
                bcl2fastq = bcl2fastq[0]
                bcl2fastq_info = bcl_to_fastq_info(bcl2fastq)
                print "Using bcl2fastq %s: %s" % (bcl2fastq_info[-1],
                                                  bcl2fastq)
                # Store info on bcl2fastq package
                ap.metadata['bcl2fastq_software'] = bcl2fastq_info
                # Store info on cellranger package
                ap.metadata['cellranger_software'] = cellranger_software_info
                # Put a copy of sample sheet in the log directory
                shutil.copy(sample_sheet, ap.log_dir)
                # Determine output directory absolute path
                output_dir = ap.params.unaligned_dir
                if not os.path.isabs(output_dir):
                    output_dir = os.path.join(ap.analysis_dir, output_dir)
                # Run cellranger mkfastq
                exit_code = run_cellranger_mkfastq(
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    output_dir=output_dir,
                    lanes=(None if lanes is None else ','.join(
                        [str(l) for l in lanes])),
                    bases_mask=bases_mask,
                    cellranger_exe=cellranger,
                    cellranger_jobmode=cellranger_jobmode,
                    cellranger_maxjobs=cellranger_maxjobs,
                    cellranger_mempercore=cellranger_mempercore,
                    cellranger_jobinterval=cellranger_jobinterval,
                    cellranger_localcores=cellranger_localcores,
                    cellranger_localmem=cellranger_localmem,
                    working_dir=ap.analysis_dir,
                    log_dir=ap.log_dir)
            except Exception as ex:
                raise Exception("'cellranger mkfastq' stage failed: "
                                "'%s'" % ex)
            # Turn off barcode analysis
            analyse_barcodes = False
        elif protocol == '10x_chromium_sc_atac':
            # 10xGenomics Chromium scATAC-seq
            exit_code = bcl_to_fastq_10x_chromium_sc_atac(
                ap,
                output_dir=ap.params.unaligned_dir,
                sample_sheet=sample_sheet,
                primary_data_dir=primary_data_dir,
                lanes=lanes,
                bases_mask=bases_mask,
                cellranger_jobmode=cellranger_jobmode,
                cellranger_maxjobs=cellranger_maxjobs,
                cellranger_mempercore=cellranger_mempercore,
                cellranger_jobinterval=cellranger_jobinterval,
                cellranger_localcores=cellranger_localcores,
                cellranger_localmem=cellranger_localmem,
                log_dir=ap.log_dir)
            # Turn off barcode analysis
            analyse_barcodes = False
        else:
            # Unknown protocol
            raise Exception("Unknown protocol '%s'" % protocol)
        # Check the outputs
        if exit_code != 0:
            raise Exception("Fastq generation finished with error: "
                            "exit code %d" % exit_code)
        if not verify_fastq_generation(
                ap, lanes=lanes, include_sample_dir=verify_include_sample_dir):
            # Check failed
            logger.error("Failed to verify output Fastqs against "
                         "sample sheet")
            # Try to load the data from unaligned dir
            try:
                illumina_data = IlluminaData.IlluminaData(
                    ap.analysis_dir, unaligned_dir=ap.params.unaligned_dir)
            except IlluminaData.IlluminaDataError as ex:
                raise Exception("Unable to load data from %s: %s" %
                                (ap.params.unaligned_dir, ex))
            # Generate a list of missing Fastqs
            missing_fastqs = IlluminaData.list_missing_fastqs(
                illumina_data,
                sample_sheet,
                include_sample_dir=verify_include_sample_dir)
            assert (len(missing_fastqs) > 0)
            missing_fastqs_file = os.path.join(ap.log_dir,
                                               "missing_fastqs.log")
            print "Writing list of missing Fastq files to %s" % \
                missing_fastqs_file
            with open(missing_fastqs_file, 'w') as fp:
                for fq in missing_fastqs:
                    fp.write("%s\n" % fq)
            # Create empty FASTQs
            if create_empty_fastqs is None:
                try:
                    create_empty_fastqs = \
                        ap.settings.platform[ap.metadata.platform].\
                        create_empty_fastqs
                except (KeyError, AttributeError):
                    pass
            if create_empty_fastqs is None:
                create_empty_fastqs = \
                    ap.settings.bcl2fastq.create_empty_fastqs
            if create_empty_fastqs:
                logger.warning("Making 'empty' placeholder Fastqs")
                for fq in missing_fastqs:
                    fastq = os.path.join(ap.analysis_dir,
                                         ap.params.unaligned_dir, fq)
                    print "-- %s" % fastq
                    if not os.path.exists(os.path.dirname(fastq)):
                        mkdirs(os.path.dirname(fastq))
                    with gzip.GzipFile(filename=fastq, mode='wb') as fp:
                        fp.write('')
            else:
                raise Exception("Fastq generation failed to produce "
                                "expected outputs")
    # Generate statistics
    if generate_stats:
        fastq_statistics(ap,
                         stats_file=stats_file,
                         per_lane_stats_file=per_lane_stats_file,
                         unaligned_dir=ap.params.unaligned_dir,
                         nprocessors=nprocessors,
                         runner=runner)
    # Run barcode analysis
    if analyse_barcodes:
        # Determine output directory
        if barcode_analysis_dir is not None:
            ap.params['barcode_analysis_dir'] = barcode_analysis_dir
        elif ap.params.barcode_analysis_dir is None:
            ap.params['barcode_analysis_dir'] = 'barcode_analysis'
        barcode_analysis_dir = ap.params.barcode_analysis_dir
        if not os.path.isabs(barcode_analysis_dir):
            barcode_analysis_dir = os.path.join(ap.params.analysis_dir,
                                                barcode_analysis_dir)
        # Report title
        title = "Barcode analysis for %s" % ap.metadata.run_name
        # Log file
        log_file = os.path.join(ap.log_dir, "analyse_barcodes.log")
        # Set up runner
        if runner is None:
            runner = ap.settings.general.default_runner
        runner.set_log_dir(ap.log_dir)
        # Get scheduler parameters
        max_jobs = ap.settings.general.max_concurrent_jobs
        poll_interval = ap.settings.general.poll_interval
        # Create and run barcode analysis pipeline
        barcode_analysis = AnalyseBarcodes(
            os.path.join(ap.params.analysis_dir, ap.params.unaligned_dir))
        barcode_analysis.run(barcode_analysis_dir,
                             title=title,
                             lanes=lanes,
                             sample_sheet=sample_sheet,
                             log_file=log_file,
                             runner=runner,
                             max_jobs=max_jobs,
                             poll_interval=poll_interval,
                             verbose=False)
    # Make a 'projects.info' metadata file
    if lanes:
        ap.update_project_metadata_file()
    else:
        ap.make_project_metadata_file()
    # Remove primary data
    if remove_primary_data:
        remove_primary_data(ap)