Example #1
def load_illumina_data(self, unaligned_dir=None):
    # Load and return an IlluminaData object
    if unaligned_dir is None:
        unaligned_dir = self.params.unaligned_dir
    if unaligned_dir is None:
        logging.error(
            "Unaligned directory not specified, cannot load data")
        return None
    return IlluminaData.IlluminaData(self.analysis_dir,
                                     unaligned_dir=unaligned_dir)
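The pattern in Example #1 (an explicit argument overrides the stored parameter, with a final None check) is reusable on its own; here is a minimal standalone sketch, where SimpleNamespace is a hypothetical stand-in for the self.params object:

import logging
from types import SimpleNamespace

def resolve_unaligned_dir(params, unaligned_dir=None):
    # Explicit argument wins; otherwise fall back to the stored parameter
    if unaligned_dir is None:
        unaligned_dir = params.unaligned_dir
    if unaligned_dir is None:
        logging.error("Unaligned directory not specified, cannot load data")
        return None
    return unaligned_dir

params = SimpleNamespace(unaligned_dir="bcl2fastq")
print(resolve_unaligned_dir(params))            # -> bcl2fastq
print(resolve_unaligned_dir(params, "custom"))  # -> custom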
Example #2
def get_fastqs_from_dir(dirn, lane, unaligned_dir=None):
    """Automatically collect Fastq files for specified lane

    """
    try:
        illumina_data = IlluminaData.IlluminaData(dirn,
                                                  unaligned_dir=unaligned_dir)
    except Exception as ex:
        sys.stderr.write("Unable to read fastqs from %s: %s\n" % (dirn, ex))
        sys.exit(1)
Example #3
def verify_fastq_generation(ap,
                            unaligned_dir=None,
                            lanes=None,
                            include_sample_dir=False):
    """Check that generated Fastqs match sample sheet predictions

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to do Fastqs verification on
      unaligned_dir (str): explicitly specify the bcl2fastq output
        directory to check
      lanes (list): specify a list of lane numbers (integers) to
        check (others will be ignored)
      include_sample_dir (bool): if True then include a
        'sample_name' directory level when checking for
        bcl2fastq2 outputs, even if one shouldn't be present

    Returns:
      True if outputs match sample sheet, False otherwise.
    """
    if unaligned_dir is None:
        if ap.params.unaligned_dir is not None:
            unaligned_dir = ap.params.unaligned_dir
        else:
            raise Exception("Bcl2fastq output directory not defined")
    print "Checking bcl2fastq output directory '%s'" % unaligned_dir
    bcl_to_fastq_dir = os.path.join(ap.analysis_dir, unaligned_dir)
    if not os.path.isdir(bcl_to_fastq_dir):
        # Directory doesn't exist
        return False
    # Make a temporary sample sheet to verify against
    tmp_sample_sheet = os.path.join(
        ap.tmp_dir,
        "SampleSheet.verify.%s.csv" % time.strftime("%Y%m%d%H%M%S"))
    make_custom_sample_sheet(ap.params.sample_sheet,
                             tmp_sample_sheet,
                             lanes=lanes)
    # Try to create an IlluminaData object
    try:
        illumina_data = IlluminaData.IlluminaData(ap.analysis_dir,
                                                  unaligned_dir=unaligned_dir)
    except IlluminaData.IlluminaDataError as ex:
        # Failed to initialise
        logger.warning("Failed to get information from %s: %s" %
                       (bcl_to_fastq_dir, ex))
        return False
    # Do check
    return IlluminaData.verify_run_against_sample_sheet(
        illumina_data, tmp_sample_sheet, include_sample_dir=include_sample_dir)
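The temporary sample sheet above gets a timestamped name so repeated verification runs don't collide; the naming pattern in isolation (the directory below is a hypothetical stand-in for ap.tmp_dir, and no sheet is actually written):

import os
import time

tmp_dir = "/tmp"  # stand-in for ap.tmp_dir
tmp_sample_sheet = os.path.join(
    tmp_dir,
    "SampleSheet.verify.%s.csv" % time.strftime("%Y%m%d%H%M%S"))
print(tmp_sample_sheet)  # e.g. /tmp/SampleSheet.verify.20240101120000.csv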
Example #4
def get_fastqs_from_dir(dirn, lane, unaligned_dir=None):
    """
    Collect Fastq files for specified lane

    Arguments:
      dirn (str): path to directory to collect Fastq
        files from
      lane (int): lane Fastqs must have come from
      unaligned_dir (str): subdirectory of 'dirn' with
        outputs from bcl2fastq

    Returns:
      List: list of Fastqs (for single-ended data) or of
        Fastq pairs (for paired-end data).
    """
    try:
        illumina_data = IlluminaData.IlluminaData(dirn,
                                                  unaligned_dir=unaligned_dir)
    except Exception as ex:
        raise Exception("Unable to read fastqs from %s: %s\n" % (dirn, ex))
    paired_end = illumina_data.paired_end
    fastqs_r1 = []
    fastqs_r2 = []
    for project in illumina_data.projects:
        for sample in project.samples:
            for fastq in sample.fastq_subset(read_number=1, full_path=True):
                if IlluminaData.IlluminaFastq(fastq).lane_number == lane:
                    fastqs_r1.append(fastq)
            for fastq in sample.fastq_subset(read_number=2, full_path=True):
                if IlluminaData.IlluminaFastq(fastq).lane_number == lane:
                    fastqs_r2.append(fastq)
    if illumina_data.undetermined:
        for sample in illumina_data.undetermined.samples:
            for fastq in sample.fastq_subset(read_number=1, full_path=True):
                if IlluminaData.IlluminaFastq(fastq).lane_number == lane:
                    fastqs_r1.append(fastq)
            for fastq in sample.fastq_subset(read_number=2, full_path=True):
                if IlluminaData.IlluminaFastq(fastq).lane_number == lane:
                    fastqs_r2.append(fastq)
    if not paired_end:
        return fastqs_r1
    fastqs = []
    fastqs_r1.sort()
    fastqs_r2.sort()
    for fq1, fq2 in zip(fastqs_r1, fastqs_r2):
        fastqs.append("%s,%s" % (fq1, fq2))
    return fastqs
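For paired-end data the function relies on sorting the R1 and R2 lists into the same sample order and zipping them into 'R1,R2' strings; a minimal sketch of that pairing step with hypothetical file names:

fastqs_r1 = ["B_S2_L001_R1_001.fastq.gz", "A_S1_L001_R1_001.fastq.gz"]
fastqs_r2 = ["A_S1_L001_R2_001.fastq.gz", "B_S2_L001_R2_001.fastq.gz"]
fastqs_r1.sort()
fastqs_r2.sort()
# Sorting puts R1 and R2 into the same order, so zip pairs them up
fastqs = ["%s,%s" % (fq1, fq2) for fq1, fq2 in zip(fastqs_r1, fastqs_r2)]
print(fastqs)
# ['A_S1_L001_R1_001.fastq.gz,A_S1_L001_R2_001.fastq.gz',
#  'B_S2_L001_R1_001.fastq.gz,B_S2_L001_R2_001.fastq.gz']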
Example #5
def detect_unaligned_dir(self):
    # Attempt to detect an existing 'bcl2fastq' or 'Unaligned' directory
    # containing data from bcl2fastq
    for test_unaligned in ('bcl2fastq', 'Unaligned'):
        if os.path.isdir(os.path.join(self.analysis_dir, test_unaligned)):
            logging.debug(
                "Testing subdirectory '%s' to see if it has sequence data"
                % test_unaligned)
            try:
                IlluminaData.IlluminaData(self.analysis_dir,
                                          unaligned_dir=test_unaligned)
                print("Setting 'unaligned_dir' parameter to %s" %
                      test_unaligned)
                return test_unaligned
            except IlluminaData.IlluminaDataError as ex:
                logging.debug("Unable to load data from %s: %s" %
                              (test_unaligned, ex))
    # Unable to detect existing data directory
    return None
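Example #5's detection logic generalises to "probe candidate subdirectories with a validator and return the first that passes"; a stdlib-only sketch, with a hypothetical is_valid callable standing in for constructing an IlluminaData object:

import os

def detect_dir(parent, candidates, is_valid):
    # Return the first candidate subdirectory that exists and validates
    for name in candidates:
        path = os.path.join(parent, name)
        if os.path.isdir(path) and is_valid(path):
            return name
    return None

# e.g. detect_dir("/data/run_analysis", ("bcl2fastq", "Unaligned"),
#                 lambda p: os.path.isdir(os.path.join(p, "Stats")))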
                 "'NY_ChIP-seq'. Use multiple --expt=... to set the types for different "
                 "projects")
    p.add_option("--keep-names",action="store_true",dest="keep_names",default=False,
                 help="preserve the full names of the source fastq files when creating links")
    p.add_option("--merge-replicates",action="store_true",dest="merge_replicates",default=False,
                 help="create merged fastq files for each set of replicates detected")
    # Parse command line
    options,args = p.parse_args()

    # Get data directory name
    if len(args) != 1:
        p.error("expected one argument (location of Illumina analysis dir)")
    illumina_analysis_dir = os.path.abspath(args[0])

    # Populate Illumina data object
    illumina_data = IlluminaData.IlluminaData(illumina_analysis_dir,
                                              unaligned_dir=options.unaligned_dir)

    # Assign experiment types
    for expt in options.expt_type:
        name,type_ = expt.split(':')
        illumina_data.get_project(name).expt_type = type_

    # Create and populate per-project directory structure
    for project in illumina_data.projects:
        create_analysis_dir(project,
                            top_dir=illumina_analysis_dir,
                            merge_replicates=options.merge_replicates,
                            keep_names=options.keep_names,
                            dry_run=options.dry_run)

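Each --expt value in this script is expected to have the form 'name:type'; a small sketch of that parsing step, with a hypothetical guard added for malformed values:

def parse_expt(expt):
    # Split a 'project:type' assignment into its two parts
    try:
        name, type_ = expt.split(':')
    except ValueError:
        raise ValueError("bad --expt value (expected 'name:type'): %r" % expt)
    return name, type_

print(parse_expt("NY:ChIP-seq"))  # -> ('NY', 'ChIP-seq')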
Example #7
    def get_analysis_projects_from_dirs(self, pattern=None, strict=False):
        """
        Return a list of AnalysisProjects in the analysis directory

        Tests each of the subdirectories in the top-level of the
        analysis directory and rejects any that appear to be
        CASAVA/bcl2fastq outputs or which don't successfully load
        as AnalysisProject instances.

        Unlike the `get_analysis_projects` method, no checking
        against the project metadata (typically in 'projects.info')
        is performed.

        If the 'pattern' is not None then it should be a simple
        pattern used to match against available names to select
        a subset of projects (see bcf_utils.name_matches).

        Arguments:
          pattern (str): optional pattern to select a subset
            of projects (default: select all projects)
          strict (bool): if True then apply strict checks on
            each discovered project directory before adding it
            to the list (default: don't apply strict checks)

        Returns:
          List: list of AnalysisProject instances.
        """
        logging.debug("Testing subdirectories to determine analysis projects")
        projects = []
        if pattern is None:
            pattern = '*'
        # Try loading each subdirectory as a project
        for dirn in bcf_utils.list_dirs(self.analysis_dir):
            # Test for bcl2fastq output
            try:
                IlluminaData.IlluminaData(self.analysis_dir,
                                          unaligned_dir=dirn)
                logging.debug("* %s: rejected" % dirn)
                continue
            except IlluminaData.IlluminaDataError:
                pass
            except Exception as ex:
                logging.debug("Exception when attempting to load "
                              "subdir '%s' as CASAVA/bcl2fastq output "
                              "(ignored): %s" % (dirn, ex))
            # Try loading as a project
            test_project = AnalysisProject(
                dirn, os.path.join(self.analysis_dir, dirn))
            if strict:
                # Apply strict checks
                if not test_project.is_analysis_dir:
                    logging.debug("* %s: rejected (failed strict checks)" %
                                  dirn)
                    continue
            else:
                # Basic check: are there any samples?
                if not len(test_project.samples):
                    logging.debug("* %s: rejected (no samples)" % dirn)
                    continue
            # Passed checks
            logging.debug("* %s: analysis directory" % dirn)
            if bcf_utils.name_matches(test_project.name, pattern):
                projects.append(test_project)
        return projects
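The pattern filtering at the end of Example #7 delegates to bcf_utils.name_matches; the stdlib fnmatch module gives the same glob-style flavour of selection, sketched here (the exact semantics of name_matches may differ):

from fnmatch import fnmatch

names = ["PJB_control", "PJB_treated", "other_project"]
pattern = "PJB*"
print([n for n in names if fnmatch(n, pattern)])
# ['PJB_control', 'PJB_treated']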
Example #8
    def update_project_metadata_file(self,
                                     unaligned_dir=None,
                                     project_metadata_file='projects.info'):
        """
        Update project metadata file from bcl2fastq outputs

        Updates the contents of the project metadata file
        (default: "projects.info") from a bcl-to-fastq output
        directory, by adding new entries for projects in the
        bcl-to-fastq outputs which don't currently appear.

        Arguments:
          unaligned_dir (str): path to the bcl-to-fastq
            output directory relative to the analysis dir.
            Defaults to the unaligned dir stored in the
            analysis directory parameter file.
          project_metadata_file (str): optional, path to
            the project metadata file to update
        """
        if project_metadata_file is not None:
            self.params['project_metadata'] = project_metadata_file
        logging.debug("Project metadata file: %s" %
                      self.params.project_metadata)
        filen = os.path.join(self.analysis_dir, self.params.project_metadata)
        if unaligned_dir is not None:
            self.params['unaligned_dir'] = unaligned_dir
        logging.debug("Unaligned_dir: %s" % self.params.unaligned_dir)
        illumina_data = IlluminaData.IlluminaData(
            self.analysis_dir, unaligned_dir=self.params.unaligned_dir)
        if os.path.exists(filen):
            # Load data from existing file
            logging.debug("Loading project metadata from existing file: %s" %
                          filen)
            project_metadata = ProjectMetadataFile(filen)
        else:
            # New (empty) metadata file
            logging.debug("Creating new project metadata file: %s" % filen)
            project_metadata = ProjectMetadataFile()
        # Get projects and samples
        projects = {}
        for project in illumina_data.projects:
            projects[project.name] = sorted([s.name for s in project.samples])
        # Add data from metadata file
        for line in project_metadata:
            project_name = line['Project']
            project_is_commented = project_name.startswith('#')
            # Uncomment project line for now
            project_name = project_name.lstrip('#')
            # Add to the list if not found
            if project_name not in projects:
                if project_is_commented or \
                   not os.path.exists(os.path.join(self.analysis_dir,
                                                   project_name)):
                    # Comment out project not in latest list
                    # if already commented or if project directory
                    # doesn't exist
                    project_name = "#%s" % project_name
                projects[project_name] = line['Samples'].split(',')
        # Populate/update
        for project_name in projects:
            sample_names = projects[project_name]
            if project_name not in project_metadata:
                project_metadata.add_project(project_name, sample_names)
            else:
                project_metadata.update_project(project_name,
                                                sample_names=sample_names)
        # Save
        project_metadata.save(filen)
        print("Updated project metadata file '%s'" %
              self.params.project_metadata)
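The '#' handling above treats a leading hash as "commented out": the hash is stripped for lookups and re-added when a project should stay hidden. The round trip in isolation:

project_name = "#old_project"
project_is_commented = project_name.startswith('#')
name = project_name.lstrip('#')  # bare name used for lookups
if project_is_commented:
    name = "#%s" % name          # re-comment when writing back
print(name)  # '#old_project'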
Example #9
        else:
            lanes = []
            for line in sample_sheet:
                lane = int(line['Lane'])
                if lane not in lanes: lanes.append(lane)
        barcodes = get_barcodes_from_sample_sheet(sample_sheet,
                                                  lanes=lanes,
                                                  length=options.length)
        match_barcodes(counts,barcodes,
                       nseqs=options.n,
                       max_mismatches=options.mismatches,
                       cutoff=options.cutoff,
                       fp=fp)
elif len(args) == 1 and os.path.isdir(args[0]):
    # Dealing with a bclToFastq output dir
    illumina_data = IlluminaData.IlluminaData(os.path.dirname(args[0]),
                                              unaligned_dir=os.path.basename(args[0]))
    # Assign fastqs to lanes (R1 only)
    fastq_in_lane = dict()
    for p in illumina_data.projects:
        for s in p.samples:
            for f in s.fastq_subset(read_number=1,full_path=True):
                lane = IlluminaData.IlluminaFastq(f).lane_number
                if lane not in fastq_in_lane:
                    fastq_in_lane[lane] = []
                fastq_in_lane[lane].append(f)
    if illumina_data.undetermined:
        for s in illumina_data.undetermined.samples:
            for f in s.fastq_subset(read_number=1,full_path=True):
                lane = IlluminaData.IlluminaFastq(f).lane_number
                if lane not in fastq_in_lane:
                    fastq_in_lane[lane] = []
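Grouping Fastqs by lane needs only a dict keyed on the lane number parsed from the file name; a self-contained sketch using the standard Illumina '_L00N_' name component (a simplified stand-in for IlluminaData.IlluminaFastq):

import re

def lane_number(fastq):
    # Parse the lane from an Illumina-style name, e.g. 'A_S1_L002_R1_001.fastq.gz'
    m = re.search(r'_L(\d{3})_', fastq)
    return int(m.group(1)) if m else None

fastq_in_lane = {}
for f in ("A_S1_L001_R1_001.fastq.gz", "B_S2_L001_R1_001.fastq.gz",
          "A_S1_L002_R1_001.fastq.gz"):
    fastq_in_lane.setdefault(lane_number(f), []).append(f)
print(sorted(fastq_in_lane))  # [1, 2]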
Example #10
    def __init__(self, unaligned_dir=None):
        """
        Create a new AnalyseBarcodes pipeline instance

        Arguments:
          unaligned_dir (str): path to the directory
            with outputs from bcl2fastq
        """
        # Initialise the pipeline superclass
        Pipeline.__init__(self, name="Analyse Barcodes")

        # Define parameters
        self.add_param('barcode_analysis_dir', type=str)
        self.add_param('counts_dir', type=str)
        self.add_param('title', type=str)
        self.add_param('lanes', type=list)
        self.add_param('sample_sheet', type=str)
        self.add_param('bases_mask', type=str)
        self.add_param('mismatches', type=int)
        self.add_param('cutoff', type=float)
        self.add_param('force', type=bool, value=False)

        # Load data from bcl2fastq output
        if not os.path.exists(unaligned_dir):
            raise OSError("'%s': not found" % unaligned_dir)
        analysis_dir = os.path.abspath(os.path.dirname(unaligned_dir))
        unaligned_dir = os.path.basename(unaligned_dir)
        illumina_data = IlluminaData.IlluminaData(analysis_dir,
                                                  unaligned_dir=unaligned_dir)

        # Example Fastq file used for determining mismatches in
        # absence of bases mask
        example_fastq = illumina_data.projects[0].samples[0].fastq_subset(
            read_number=1, full_path=True)[0]

        ####################
        # Build the pipeline
        ####################

        # Setup barcode analysis and counts directories
        setup_barcode_analysis_dir = SetupBarcodeAnalysisDirs(
            "Setup barcode analysis directory",
            self.params.barcode_analysis_dir,
            self.params.counts_dir,
            force=self.params.force)
        self.add_task(setup_barcode_analysis_dir)

        # Generate counts for Fastqs in each project
        count_tasks = []
        for project in illumina_data.projects:
            count_barcodes = CountBarcodes("Count barcodes in '%s'" %
                                           project.name,
                                           project,
                                           self.params.counts_dir,
                                           lanes=self.params.lanes)
            self.add_task(count_barcodes,
                          requires=(setup_barcode_analysis_dir, ))
            count_tasks.append(count_barcodes)

        # Generate counts for undetermined Fastqs
        if illumina_data.undetermined is not None:
            count_barcodes = CountBarcodes("Count barcodes in 'undetermined'",
                                           illumina_data.undetermined,
                                           self.params.counts_dir,
                                           lanes=self.params.lanes,
                                           use_project_name="undetermined")
            self.add_task(count_barcodes,
                          requires=(setup_barcode_analysis_dir, ))
            count_tasks.append(count_barcodes)

        # List the counts files
        list_counts_files = ListBarcodeCountFiles(
            "Fetch the barcode counts files", self.params.counts_dir)
        self.add_task(list_counts_files, requires=count_tasks)

        # Analyse counts and report the results
        report_barcodes = ReportBarcodeAnalysis(
            "Report barcode analysis",
            list_counts_files.output.counts_files,
            self.params.barcode_analysis_dir,
            sample_sheet=self.params.sample_sheet,
            lanes=self.params.lanes,
            mismatches=self.params.mismatches,
            cutoff=self.params.cutoff,
            title=self.params.title)
        self.add_task(report_barcodes, requires=(list_counts_files, ))

        # Add final outputs to the pipeline
        self.add_output('report_file', report_barcodes.output.report_file)
        self.add_output('xls_file', report_barcodes.output.xls_file)
        self.add_output('html_file', report_barcodes.output.html_file)
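The constructor above accepts a single path to the bcl2fastq output directory and splits it into the parent analysis directory plus the subdirectory name that IlluminaData expects; that split on its own, with a hypothetical path:

import os

unaligned_dir = "/data/200101_RUN_analysis/bcl2fastq"  # hypothetical
analysis_dir = os.path.abspath(os.path.dirname(unaligned_dir))
unaligned_name = os.path.basename(unaligned_dir)
print(analysis_dir, unaligned_name)
# /data/200101_RUN_analysis bcl2fastq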
                   "when creating links")
    p.add_argument("--merge-replicates",action="store_true",
                   dest="merge_replicates",default=False,
                   help="create merged fastq files for each set of "
                   "replicates detected")
    p.add_argument('illumina_data_dir',
                   help="top-level directory containing the 'Unaligned' "
                   "directory with the fastq.gz files")
    # Parse command line
    args = p.parse_args()

    # Get data directory name
    illumina_analysis_dir = os.path.abspath(args.illumina_data_dir)

    # Populate Illumina data object
    illumina_data = IlluminaData.IlluminaData(illumina_analysis_dir,
                                              unaligned_dir=args.unaligned_dir)

    # Assign experiment types
    for expt in args.expt_type:
        name,type_ = expt.split(':')
        illumina_data.get_project(name).expt_type = type_

    # Create and populate per-project directory structure
    for project in illumina_data.projects:
        create_analysis_dir(project,
                            top_dir=illumina_analysis_dir,
                            merge_replicates=args.merge_replicates,
                            keep_names=args.keep_names,
                            dry_run=args.dry_run)

Example #12
            logging.fatal("No file '%s': cannot update" % existing_stats_file)
            sys.exit(1)
    else:
        existing_stats_file = None

    # Ignore 'force'
    if options.force:
        logger.warning("ignoring deprecated option '--force'")

    # Handle debugging output if requested
    if options.debug:
        logging.getLogger("auto_process_ngs").setLevel(logging.DEBUG)

    # Get the data from FASTQ files
    try:
        illumina_data = IlluminaData.IlluminaData(
            args[0], unaligned_dir=options.unaligned_dir)
    except IlluminaData.IlluminaDataError as ex:
        logger.critical("failed to get data from %s: %s" % (args[0], ex))
        sys.exit(1)
    # Generate statistics for fastq files
    stats = FastqStatistics(illumina_data,
                            n_processors=options.n,
                            add_to=existing_stats_file)
    stats.report_full_stats(options.full_stats_file)
    print("Full statistics written to %s" % options.full_stats_file)
    stats.report_basic_stats(options.stats_file)
    print("Basic statistics written to %s" % options.stats_file)
    stats.report_per_lane_sample_stats(options.per_lane_sample_stats_file)
    print("Per-lane sample statistics written to %s" %
          options.per_lane_sample_stats_file)
    stats.report_per_lane_summary_stats(options.per_lane_stats_file)
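The --debug switch in Example #12 raises the level on the package's named logger rather than on the root logger, so only that package's messages become verbose; a minimal stdlib demonstration:

import logging

logging.basicConfig()
pkg_logger = logging.getLogger("auto_process_ngs")
pkg_logger.setLevel(logging.DEBUG)
pkg_logger.debug("debugging enabled")  # now emitted; other loggers unaffected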
Example #13
def merge_fastq_dirs(ap,
                     primary_unaligned_dir,
                     output_dir=None,
                     dry_run=False):
    """
    Combine multiple 'unaligned' output directories into one
    
    This method combines the output from multiple runs of
    CASAVA/bcl2fastq into a single 'unaligned'-equivalent
    directory.

    Currently it operates in an automatic mode and should
    detect additional 'unaligned' dirs on its own.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the parent
        analysis directory
      primary_unaligned_dir (str): the 'unaligned' dir that
        data from all others will be put into (relative
        path), unless overridden by 'output_dir' argument
      output_dir (str): optional, new 'unaligned' dir that
        will be created to hold merged data (relative path,
        defaults to 'primary_unaligned_dir')
      dry_run (boolean): if True then just report operations
        that would have been performed.
    """
    if primary_unaligned_dir is None:
        raise Exception("Primary unaligned dir not defined")
    # Output directory
    if output_dir is None:
        output_dir = primary_unaligned_dir
    print("Fastqs will be merged into '%s'" % output_dir)
    # Collect unaligned dirs
    print("Collecting bcl2fastq directories")
    primary_illumina_data = None
    unaligned_dirs = {}
    for dirn in list_dirs(ap.analysis_dir):
        try:
            illumina_data = IlluminaData.IlluminaData(ap.analysis_dir,
                                                      unaligned_dir=dirn)
            if dirn == primary_unaligned_dir:
                print("* %s (primary dir)" % dirn)
                primary_illumina_data = illumina_data
            elif dirn.endswith(".bak") or dirn.startswith("save."):
                print("Ignoring %s" % dirn)
            else:
                print("* %s" % dirn)
                unaligned_dirs[dirn] = illumina_data
        except Exception as ex:
            logger.debug("Rejecting %s: %s" % (dirn, ex))
    # Check primary unaligned dir
    if primary_illumina_data is None:
        raise Exception("Primary dir '%s' doesn't exist, or doesn't "
                        "contain data?" % primary_unaligned_dir)
    # Is there anything to do?
    if not unaligned_dirs:
        print("No extra bcl2fastq output directories found, nothing to do")
        return 0
    # Make log directory and set up scheduler (if not dry run)
    if not dry_run:
        ap.set_log_dir(ap.get_log_subdir('merge_fastq_dirs'))
        runner = ap.settings.general.default_runner
        runner.set_log_dir(ap.log_dir)
        sched = SimpleScheduler(
            runner=runner,
            max_concurrent=ap.settings.general.max_concurrent_jobs,
            poll_interval=ap.settings.general.poll_interval)
        sched.start()
        jobs = []
    # Top-level for undetermined reads
    if primary_illumina_data.undetermined.dirn != \
       primary_illumina_data.unaligned_dir:
        undetermined_dir = os.path.basename(
            primary_illumina_data.undetermined.dirn)
    else:
        undetermined_dir = None
    # Do sanity checks before proceeding
    print("Checking primary data directory")
    fmt = primary_illumina_data.format
    paired_end = primary_illumina_data.paired_end
    no_lane_splitting = (len(primary_illumina_data.lanes) == 1) \
                        and (primary_illumina_data.lanes[0] is None)
    print("* Format: %s" % fmt)
    print("* no-lane-splitting: %s" % ('yes' if no_lane_splitting else 'no'))
    print("* paired-end: %s" % ('yes' if paired_end else 'no'))
    print("* undetermined dir: %s" % undetermined_dir)
    consistent_data = True
    for unaligned_dir in unaligned_dirs:
        illumina_data = unaligned_dirs[unaligned_dir]
        fmt0 = illumina_data.format
        no_lane_splitting0 = (len(illumina_data.lanes) == 1) \
                             and (illumina_data.lanes[0] is None)
        if (fmt0 != fmt) or (no_lane_splitting0 != no_lane_splitting):
            print("!!! %s: inconsistent format to primary data dir !!!" %
                  unaligned_dir)
            consistent_data = False
    if not consistent_data:
        raise Exception("Data directories not consistent with primary "
                        "dir '%s'" % primary_unaligned_dir)
    # Collect the projects from the extra directories
    projects = []
    undetermined = []
    for unaligned_dir in unaligned_dirs:
        print("Examining projects in %s:" % unaligned_dir)
        illumina_data = unaligned_dirs[unaligned_dir]
        for project in illumina_data.projects:
            if not list(filter(lambda p: p.name == project.name, projects)):
                print("- %s: will be merged in" % project.name)
                projects.append(project)
            else:
                raise Exception("collision: %s already exists" % project.name)
        # Deal with undetermined reads
        if illumina_data.undetermined is not None:
            print("Examining undetermined samples:")
            if no_lane_splitting:
                # No lane info: should merge undetermined fastqs
                for sample in illumina_data.undetermined.samples:
                    print("- %s: reads will be concatenated" % sample.name)
                    undetermined.append(sample)
            else:
                for sample in illumina_data.undetermined.samples:
                    if not list(
                            filter(lambda s: s.name == sample.name,
                                   undetermined)):
                        print("- %s: will be merged in" % sample.name)
                        undetermined.append(sample)
                    else:
                        raise Exception("collision: %s already exists" %
                                        sample.name)
        else:
            print("No undetermined samples")
    # Collect any remaining projects from the primary
    # unaligned directory
    print("Examining projects in primary dir %s:" % primary_unaligned_dir)
    for project in primary_illumina_data.projects:
        if not list(filter(lambda p: p.name == project.name, projects)):
            print("- %s: will be merged in" % project.name)
            projects.append(project)
        else:
            print("- %s: already exists, will be discarded" % project.name)
    # Sort out the undetermined reads
    print("Examining undetermined samples:")
    if no_lane_splitting:
        # No lane info: should merge undetermined fastqs
        for sample in primary_illumina_data.undetermined.samples:
            print("- %s: reads will be concatenated" % sample.name)
            undetermined.insert(0, sample)
    else:
        for sample in primary_illumina_data.undetermined.samples:
            if not list(filter(lambda s: s.name == sample.name, undetermined)):
                print("- %s: will be merged in" % sample.name)
                undetermined.insert(0, sample)
            else:
                print("- %s: already exists, will be discarded" % sample.name)
    # Make a new directory for the merging
    merge_dir = os.path.join(ap.analysis_dir, output_dir + ".new")
    if undetermined_dir is not None:
        merge_undetermined_dir = os.path.join(merge_dir, undetermined_dir)
    else:
        merge_undetermined_dir = merge_dir
    if not dry_run:
        print("Making temporary merge directory %s" % merge_dir)
        mkdir(merge_dir)
        if not os.path.exists(merge_undetermined_dir):
            print("Making directory for undetermined %s" %
                  merge_undetermined_dir)
            mkdir(merge_undetermined_dir)
    # Copy the projects
    print("Importing projects:")
    for project in projects:
        print("- %s" % project.name)
        project_dir = os.path.join(merge_dir, os.path.basename(project.dirn))
        cmd = copytree_command(project.dirn, project_dir)
        print("- Running %s" % cmd)
        if not dry_run:
            job = sched.submit(cmd,
                               name="copy_project.%s" % project.name,
                               wd=merge_dir)
            print("Job: %s" % job)
            jobs.append(job)
    # Handle the undetermined reads
    print("Dealing with undetermined reads:")
    if no_lane_splitting:
        # No lane info: merge undetermined fastqs
        if len(undetermined) == 1:
            # Only one undetermined sample - copy its Fastqs
            sample = undetermined[0]
            for read in (1, 2):
                if read == 2 and not paired_end:
                    break
                fastqs = sample.fastq_subset(read_number=read, full_path=True)
                for fq in fastqs:
                    cmd = copy_command(fq, merge_undetermined_dir)
                    print("- Running %s" % cmd)
                    if not dry_run:
                        job = sched.submit(cmd,
                                           name="copy_undetermined.R%s" % read,
                                           wd=merge_dir)
                        print("Job: %s" % job)
                        jobs.append(job)
        else:
            # Multiple undetermined samples - concat Fastqs
            for read in (1, 2):
                if read == 2 and not paired_end:
                    break
                cmd = Command('concat_fastqs.py')
                for sample in undetermined:
                    fastqs = sample.fastq_subset(read_number=read,
                                                 full_path=True)
                    cmd.add_args(*fastqs)
                cmd.add_args(
                    os.path.join(merge_undetermined_dir,
                                 "Undetermined_S0_R%s_001.fastq.gz" % read))
                print("- Running %s" % cmd)
                if not dry_run:
                    job = sched.submit(cmd,
                                       name="merge_undetermined.R%s" % read,
                                       wd=merge_dir)
                    print("Job: %s" % job)
                    jobs.append(job)
    else:
        for sample in undetermined:
            print("- %s" % sample.name)
            if fmt == "bcl2fastq2":
                # Hardlink copy fastqs directly
                sample_dir = merge_undetermined_dir
                if not dry_run:
                    for fq in sample.fastq:
                        src_fq = os.path.join(sample.dirn, fq)
                        dst_fq = os.path.join(sample_dir, fq)
                        os.link(src_fq, dst_fq)
            else:
                # Just copy directory tree wholesale
                sample_dir = os.path.join(merge_undetermined_dir,
                                          os.path.basename(sample.dirn))
                cmd = copytree_command(sample.dirn, sample_dir)
                print("- Running %s" % cmd)
                if not dry_run:
                    job = sched.submit(cmd,
                                       name="copy_sample_dir.%s" % sample.name,
                                       wd=merge_dir)
                    print("Job: %s" % job.name)
                    jobs.append(job)
    # Make expected subdirs for bcl2fastq2
    if not dry_run and fmt == "bcl2fastq2":
        for dirn in ('Reports', 'Stats'):
            mkdir(os.path.join(merge_dir, dirn))
            # Add a hidden placeholder to preserve these directories
            # on rsync -m (prune empty dirs)
            with open(os.path.join(merge_dir, dirn, '.placeholder'),
                      'w') as fp:
                fp.write("")
    # Wait for scheduler jobs to complete
    if not dry_run:
        sched.wait()
        sched.stop()
        # Check job exit status
        exit_status = 0
        for j in jobs:
            exit_status += j.exit_status
            if j.exit_status != 0:
                logger.warning("Job failed: %s" % j)
        if exit_status:
            logger.critical("One or more jobs failed (non-zero "
                            "exit status)")
            return exit_status
    # Move all the 'old' directories out of the way
    all_unaligned = [u for u in unaligned_dirs]
    all_unaligned.append(primary_unaligned_dir)
    for unaligned_dir in all_unaligned:
        unaligned_backup = os.path.join(ap.analysis_dir,
                                        "save.%s" % unaligned_dir)
        print("Moving %s to %s" % (unaligned_dir, unaligned_backup))
        if not dry_run:
            shutil.move(os.path.join(ap.analysis_dir, unaligned_dir),
                        unaligned_backup)
    # Rename the merged directory
    print("Renaming %s to %s" % (merge_dir, output_dir))
    if not dry_run:
        shutil.move(merge_dir, os.path.join(ap.analysis_dir, output_dir))
    # Reset the bcl2fastq dir
    if not dry_run:
        ap.params['unaligned_dir'] = output_dir
    # Make a new 'projects.info' metadata file
    project_metadata_file = os.path.join(ap.analysis_dir, 'projects.info')
    if os.path.exists(project_metadata_file):
        print("Moving existing projects.info file out of the way")
        if not dry_run:
            os.rename(project_metadata_file,
                      os.path.join(ap.analysis_dir, 'save.projects.info'))
    print("Creating new projects.info file")
    if not dry_run:
        ap.make_project_metadata_file()
    return 0
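The duplicate checks in merge_fastq_dirs use filter with a lambda to test for a name collision; the same test reads more directly with any(), sketched here with a minimal stand-in class (behaviour is identical):

class Project:
    def __init__(self, name):
        self.name = name

projects = [Project("AB"), Project("CD")]
project = Project("AB")
if any(p.name == project.name for p in projects):
    print("collision: %s already exists" % project.name)
else:
    projects.append(project)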
Example #14
    def __init__(self, analysis_dir):
        """Create a new AnalysisDir instance for a specified directory

        Arguments:
          analysis_dir: name (and path) to analysis directory

        """
        # Store location
        self._analysis_dir = os.path.abspath(analysis_dir)
        self._name = os.path.basename(analysis_dir)
        self._bcl2fastq_dirs = []
        self._project_dirs = []
        self._extra_dirs = []
        self.sequencing_data = []
        self.projects = []
        self.undetermined = None
        # Metadata
        self.metadata = AnalysisDirMetadata()
        try:
            metadata_file = os.path.join(self._analysis_dir, "metadata.info")
            self.metadata.load(metadata_file)
        except Exception as ex:
            logger.warning("Failed to load metadata file %s: %s" %
                           (metadata_file, ex))
            logger.warning("Attempting to load parameter file")
            try:
                params = AnalysisDirParameters()
                parameter_file = os.path.join(self._analysis_dir,
                                              "auto_process.info")
                params.load(parameter_file, strict=False)
                # Attempt to acquire values from parameters
                for param in ('platform', 'run_number', 'source', 'assay'):
                    if param not in params:
                        print "-- %s: missing" % param
                        continue
                    print "-- %s: setting to '%s'" % (param, params[param])
                    self.metadata[param] = params[param]
            except Exception as ex:
                # No parameter file either
                logger.warning("Failed to load parameters: %s" % ex)
                logger.warning("Perhaps this is not an auto_process project?")
                raise ex
        # Projects metadata
        try:
            self.projects_metadata = ProjectMetadataFile(
                os.path.join(self._analysis_dir, "projects.info"))
        except Exception as ex:
            logger.warning("Failed to load projects metadata: %s" % ex)
            self.projects_metadata = None
        # Run name
        try:
            self.run_name = self.metadata.run
        except AttributeError:
            self.run_name = self._analysis_dir[0:-len('_analysis')]
        self.run_name = os.path.basename(self.run_name)
        self.date_stamp,\
            self.instrument_name,\
            self.instrument_run_number = IlluminaData.split_run_name(
                self.run_name)
        # Look for outputs from bclToFastq and analysis projects
        logger.debug("Examining subdirectories of %s" % self._analysis_dir)
        for dirn in bcf_utils.list_dirs(self._analysis_dir):
            # Look for sequencing data
            try:
                data = IlluminaData.IlluminaData(self._analysis_dir,
                                                 unaligned_dir=dirn)
                logger.debug("- %s: sequencing data" % dirn)
                self._bcl2fastq_dirs.append(dirn)
                self.sequencing_data.append(data)
                continue
            except IlluminaData.IlluminaDataError:
                pass
            except Exception as ex:
                logger.warning("Exception when attempting to load "
                               "subdir '%s' as CASAVA/bcl2fastq output "
                               "(ignored): %s" % (dirn, ex))
            # Look for analysis data
            data = AnalysisProject(dirn, os.path.join(self._analysis_dir,
                                                      dirn))
            if data.is_analysis_dir:
                if dirn == 'undetermined':
                    logger.debug("- %s: undetermined indexes" % dirn)
                    self.undetermined = data
                else:
                    # Check against projects.info, if possible
                    try:
                        if not self.projects_metadata.lookup('Project', dirn):
                            logger.debug("- %s: not in projects.info" % dirn)
                            self._extra_dirs.append(dirn)
                            continue
                    except AttributeError:
                        pass
                    logger.debug("- %s: project directory" % dirn)
                    self._project_dirs.append(dirn)
                    self.projects.append(data)
                continue
            else:
                # Unidentified contents
                self._extra_dirs.append(dirn)
                logger.debug("- %s: unknown" % dirn)
Example #15
def fastq_statistics(ap,
                     stats_file=None,
                     per_lane_stats_file=None,
                     unaligned_dir=None,
                     sample_sheet=None,
                     add_data=False,
                     nprocessors=None,
                     runner=None):
    """Generate statistics for Fastq files

    Generates statistics for all Fastq files found in the
    'unaligned' directory, by running the 'fastq_statistics.py'
    program.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to create Fastqs for
      stats_file (str): path of a non-default file to write the
        statistics to (defaults to 'statistics.info' unless
        over-ridden by local settings)
      per_lane_stats_file (str): path for per-lane statistics
        output file (defaults to 'per_lane_statistics.info'
        unless over-ridden by local settings)
      unaligned_dir (str): output directory for bcl-to-fastq
        conversion
      sample_sheet (str): path to sample sheet file used in
        bcl-to-fastq conversion
      add_data (bool): if True then add stats to the existing
        stats files (default is to overwrite existing stats
        files)
      nprocessors (int): number of cores to use when running
        'fastq_statistics.py'
      runner (JobRunner): (optional) specify a non-default job
        runner to use for running 'fastq_statistics.py'
    """
    # Get file names for output files
    if stats_file is None:
        if ap.params['stats_file'] is not None:
            stats_file = ap.params['stats_file']
        else:
            stats_file = 'statistics.info'
    if per_lane_stats_file is None:
        if ap.params['per_lane_stats_file'] is not None:
            per_lane_stats_file = ap.params['per_lane_stats_file']
        else:
            per_lane_stats_file = 'per_lane_statistics.info'
    # Sort out unaligned_dir
    if unaligned_dir is None:
        if ap.params.unaligned_dir is None:
            ap.params['unaligned_dir'] = 'bcl2fastq'
        unaligned_dir = ap.params.unaligned_dir
    if not os.path.exists(os.path.join(ap.params.analysis_dir, unaligned_dir)):
        logger.error("Unaligned dir '%s' not found" % unaligned_dir)
    # Check for sample sheet
    if sample_sheet is None:
        sample_sheet = ap.params['sample_sheet']
    # Check if any Fastqs are newer than stats files
    newest_mtime = 0
    for f in (
            stats_file,
            per_lane_stats_file,
    ):
        try:
            newest_mtime = max(newest_mtime, os.path.getmtime(f))
        except OSError:
            # Missing file
            newest_mtime = 0
            break
    illumina_data = IlluminaData.IlluminaData(ap.params.analysis_dir,
                                              unaligned_dir)
    if newest_mtime > 0:
        regenerate_stats = False
        for project in illumina_data.projects:
            for sample in project.samples:
                for fq in sample.fastq:
                    if (os.path.getmtime(os.path.join(sample.dirn, fq)) >
                            newest_mtime):
                        regenerate_stats = True
                        break
        if regenerate_stats:
            logger.warning("Fastqs are newer than stats files")
        else:
            # Don't rerun the stats, just regenerate the report
            logger.warning("Stats files are newer than Fastqs")
            processing_qc_html = os.path.join(ap.analysis_dir,
                                              "processing_qc.html")
            report_processing_qc(ap, processing_qc_html)
            return
    # Set up runner
    if runner is None:
        runner = ap.settings.runners.stats
    runner.set_log_dir(ap.log_dir)
    # Number of cores
    if nprocessors is None:
        nprocessors = ap.settings.fastq_stats.nprocessors
    # Generate statistics
    fastq_statistics_cmd = Command(
        'fastq_statistics.py', '--unaligned', unaligned_dir, '--sample-sheet',
        sample_sheet, '--output',
        os.path.join(ap.params.analysis_dir, stats_file), '--per-lane-stats',
        os.path.join(ap.params.analysis_dir, per_lane_stats_file),
        ap.params.analysis_dir, '--nprocessors', nprocessors)
    if add_data:
        fastq_statistics_cmd.add_args('--update')
    print "Generating statistics: running %s" % fastq_statistics_cmd
    fastq_statistics_job = SchedulerJob(runner,
                                        fastq_statistics_cmd.command_line,
                                        name='fastq_statistics',
                                        working_dir=ap.analysis_dir)
    fastq_statistics_job.start()
    try:
        fastq_statistics_job.wait(
            poll_interval=ap.settings.general.poll_interval)
    except KeyboardInterrupt as ex:
        logger.warning("Keyboard interrupt, terminating fastq_statistics")
        fastq_statistics_job.terminate()
        raise ex
    exit_code = fastq_statistics_job.exit_code
    print "fastq_statistics completed: exit code %s" % exit_code
    if exit_code != 0:
        raise Exception("fastq_statistics exited with an error")
    ap.params['stats_file'] = stats_file
    ap.params['per_lane_stats_file'] = per_lane_stats_file
    print "Statistics generation completed: %s" % ap.params.stats_file
    print "Generating processing QC report"
    processing_qc_html = os.path.join(ap.analysis_dir, "processing_qc.html")
    report_processing_qc(ap, processing_qc_html)
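The staleness test in fastq_statistics takes the newest mtime across the stats files and treats any missing file as "must regenerate"; a compact sketch of that check:

import os

def newest_mtime(paths):
    # Returns 0 if any output is missing, forcing regeneration
    newest = 0
    for p in paths:
        try:
            newest = max(newest, os.path.getmtime(p))
        except OSError:
            return 0
    return newest

print(newest_mtime(["statistics.info", "per_lane_statistics.info"]))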
Example #16
def make_fastqs(ap,
                protocol='standard',
                platform=None,
                unaligned_dir=None,
                sample_sheet=None,
                lanes=None,
                ignore_missing_bcl=False,
                ignore_missing_stats=False,
                skip_rsync=False,
                remove_primary_data=False,
                nprocessors=None,
                require_bcl2fastq_version=None,
                bases_mask=None,
                no_lane_splitting=None,
                minimum_trimmed_read_length=None,
                mask_short_adapter_reads=None,
                generate_stats=True,
                stats_file=None,
                per_lane_stats_file=None,
                analyse_barcodes=True,
                barcode_analysis_dir=None,
                skip_fastq_generation=False,
                only_fetch_primary_data=False,
                create_empty_fastqs=None,
                runner=None,
                cellranger_jobmode=None,
                cellranger_mempercore=None,
                cellranger_maxjobs=None,
                cellranger_jobinterval=None,
                cellranger_localcores=None,
                cellranger_localmem=None,
                cellranger_ignore_dual_index=False):
    """Create and summarise FASTQ files

    Wrapper for operations related to FASTQ file generation and analysis.
    The operations are typically:
 
    - get primary data (BCL files)
    - run bcl-to-fastq conversion
    - generate statistics

    If the number of processors and the job runner are not explicitly
    specified then these are taken from the settings for the bcl2fastq
    and the statistics generation steps, which may differ from each other.
    However if either of these values are set explicitly then the same
    values will be used for both steps.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to create Fastqs for
      protocol (str): if set then specifies the protocol to use
        for fastq generation, otherwise use the 'standard' bcl2fastq
        protocol
      platform (str): if set then specifies the sequencing platform
        (otherwise platform will be determined from the primary data)
      unaligned_dir (str): if set then use this as the output directory
        for bcl-to-fastq conversion. Default is 'bcl2fastq' (unless
        an alternative is already specified in the config file)
      sample_sheet (str): if set then use this as the input samplesheet
      lanes (list): (optional) specify a list of lane numbers to
        use in the processing; lanes not in the list will be excluded
        (default is to include all lanes)
      nprocessors (int): number of processors to run bclToFastq.py with
      ignore_missing_bcl (bool): if True then run bcl2fastq with
        --ignore-missing-bcl
      ignore_missing_stats (bool): if True then run bcl2fastq with
        --ignore-missing-stats
      skip_rsync (bool): if True then don't rsync primary data at the
        start of bcl2fastq conversion
      remove_primary_data (bool): if True then remove primary data at
        the end of bcl2fastq conversion (default is to keep it)
      generate_stats (bool): if True then (re)generate statistics file
        for fastqs
      analyse_barcodes (bool): if True then (re)analyse barcodes for
        fastqs
      require_bcl2fastq_version (str): (optional) specify bcl2fastq
        version to use. Should be a string of the form '1.8.4' or
        '>2.0'. Set to None to automatically determine required
        bcl2fastq version.
      bases_mask (str): if set then use this as an alternative bases
        mask setting
      no_lane_splitting (bool): if True then run bcl2fastq with
        --no-lane-splitting
      minimum_trimmed_read_length (int): if set then specify minimum
        length for reads after adapter trimming (shorter reads will
        be padded with Ns to make them long enough)
      mask_short_adapter_reads (int): if set then specify the minimum
        length of ACGT bases that must be present in a read after
        adapter trimming for it not to be masked completely
        with Ns.
      stats_file (str): if set then use this as the name of the output
        per-fastq stats file.
      per_lane_stats_file (str): if set then use this as the name of
        the output per-lane stats file.
      barcode_analysis_dir (str): if set then specifies path to the
        output directory for barcode analysis
      skip_fastq_generation (bool): if True then don't perform fastq
        generation
      only_fetch_primary_data (bool): if True then fetch primary data,
        don't do anything else
      create_empty_fastqs (bool): if True then create empty 'placeholder'
        fastq files for any missing fastqs after bcl2fastq
        (must have completed with zero exit status)
      runner (JobRunner): (optional) specify a non-default job runner
        to use for fastq generation
      cellranger_jobmode (str): (optional) job mode to run cellranger in
        (10xGenomics Chromium SC data only)
      cellranger_mempercore (int): (optional) memory assumed per core
        (in Gbs) (10xGenomics Chromium SC data only)
      cellranger_maxjobs (int): (optional) maximum number of concurrent
         jobs to run (10xGenomics Chromium SC data only)
      cellranger_jobinterval (int): (optional) how often jobs are
         submitted (in ms) (10xGenomics Chromium SC data only)
      cellranger_localcores (int): (optional) maximum number of cores
         cellranger can request in jobmode 'local' (10xGenomics Chromium
         SC data only)
      cellranger_localmem (int): (optional) maximum memory cellranger
         can request in jobmode 'local' (10xGenomics Chromium SC data
         only)
      cellranger_ignore_dual_index (bool): (optional) on a dual-indexed
         flowcell where the second index was not used for the 10x
         sample, ignore it (10xGenomics Chromium SC data only)
    """
    # Report protocol
    print("Protocol              : %s" % protocol)
    if protocol not in MAKE_FASTQS_PROTOCOLS:
        raise Exception("Unknown protocol: '%s' (must be one of "
                        "%s)" % (protocol, ','.join(MAKE_FASTQS_PROTOCOLS)))
    # Unaligned dir
    if unaligned_dir is not None:
        ap.params['unaligned_dir'] = unaligned_dir
    elif ap.params['unaligned_dir'] is None:
        ap.params['unaligned_dir'] = 'bcl2fastq'
    print "Output dir            : %s" % ap.params.unaligned_dir
    # Sample sheet
    if sample_sheet is None:
        sample_sheet = ap.params.sample_sheet
    if not os.path.isabs(sample_sheet):
        sample_sheet = os.path.join(ap.analysis_dir, sample_sheet)
    if not os.path.isfile(sample_sheet):
        raise Exception("Missing sample sheet '%s'" % sample_sheet)
    ap.params['sample_sheet'] = sample_sheet
    print "Source sample sheet   : %s" % ap.params.sample_sheet
    # Check requested lanes are actually present
    print "Lanes                 : %s" % ('all' if lanes is None else ','.join(
        [str(l) for l in lanes]))
    if lanes is not None:
        s = IlluminaData.SampleSheet(ap.params.sample_sheet)
        if not s.has_lanes:
            raise Exception("Requested subset of lanes but "
                            "samplesheet doesn't contain any "
                            "lane information")
        samplesheet_lanes = list(set([l['Lane'] for l in s]))
        for l in lanes:
            if l not in samplesheet_lanes:
                raise Exception("Requested lane '%d' not present "
                                "in samplesheet" % l)
    # Make a temporary sample sheet
    if lanes:
        lanes_id = ".L%s" % ''.join([str(l) for l in lanes])
    else:
        lanes_id = ""
    sample_sheet = os.path.join(
        ap.tmp_dir,
        "SampleSheet%s.%s.csv" % (lanes_id, time.strftime("%Y%m%d%H%M%S")))
    make_custom_sample_sheet(ap.params.sample_sheet, sample_sheet, lanes=lanes)
    # Check the temporary sample sheet
    print "Checking temporary sample sheet"
    invalid_barcodes = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_barcodes()
    if invalid_barcodes:
        logger.error("Invalid barcodes detected")
        for line in invalid_barcodes:
            logger.critical("%s" % line)
    invalid_characters = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_characters()
    if invalid_characters:
        logger.critical("Invalid non-printing/non-ASCII characters "
                        "detected")
    if invalid_barcodes or invalid_characters:
        raise Exception("Errors detected in generated sample sheet")
    # Adjust verification settings for 10xGenomics Chromium SC
    # data if necessary
    verify_include_sample_dir = False
    if has_chromium_sc_indices(sample_sheet):
        if protocol in (
                '10x_chromium_sc',
                '10x_chromium_sc_atac',
        ):
            # Force inclusion of sample-name subdirectories
            # when verifying Chromium SC data
            print "Sample sheet includes Chromium SC indices"
            verify_include_sample_dir = True
        else:
            # Chromium SC indices detected but not using
            # 10x_chromium_sc protocol
            raise Exception("Detected 10xGenomics Chromium SC indices "
                            "in generated sample sheet but protocol "
                            "'%s' has been specified; use an "
                            "appropriate '10x_...' protocol for these "
                            "indices" % protocol)
    # Check for pre-existing Fastq outputs
    if verify_fastq_generation(ap,
                               unaligned_dir=ap.params.unaligned_dir,
                               lanes=lanes,
                               include_sample_dir=verify_include_sample_dir):
        print "Expected Fastq outputs already present"
        skip_rsync = True
        skip_fastq_generation = True
    # Check if there's anything to do
    if (skip_rsync and skip_fastq_generation) and \
       not (generate_stats or analyse_barcodes):
        print "Nothing to do"
        return
    # Log dir
    log_dir = 'make_fastqs'
    if protocol != 'standard':
        log_dir += "_%s" % protocol
    if lanes:
        log_dir += "_L%s" % ''.join([str(l) for l in sorted(lanes)])
    ap.set_log_dir(ap.get_log_subdir(log_dir))
    # Fetch primary data
    if not skip_rsync and not ap.params.acquired_primary_data:
        if get_primary_data(ap) != 0:
            logger.error("Failed to acquire primary data")
            raise Exception("Failed to acquire primary data")
        else:
            ap.params['acquired_primary_data'] = True
    if only_fetch_primary_data:
        return
    # Deal with platform information
    if not platform:
        platform = ap.metadata.platform
    # Do fastq generation using the specified protocol
    if not skip_fastq_generation:
        # Set primary data location and report info
        primary_data_dir = os.path.join(ap.params.primary_data_dir,
                                        os.path.basename(ap.params.data_dir))
        print "Primary data dir      : %s" % primary_data_dir
        try:
            illumina_run = IlluminaData.IlluminaRun(primary_data_dir,
                                                    platform=platform)
        except IlluminaData.IlluminaDataPlatformError as ex:
            logger.critical("Error loading primary data: %s" % ex)
            if platform is None:
                logger.critical("Try specifying platform using --platform?")
            else:
                logger.critical("Check specified platform is valid (or "
                                "omit --platform")
            raise Exception("Error determining sequencer platform")
        print "Platform              : %s" % illumina_run.platform
        print "Bcl format            : %s" % illumina_run.bcl_extension
        # Set platform in metadata
        ap.metadata['platform'] = illumina_run.platform
        # Bases mask
        if bases_mask is not None:
            ap.params['bases_mask'] = bases_mask
        bases_mask = ap.params.bases_mask
        print "Bases mask setting    : %s" % bases_mask
        if protocol not in (
                '10x_chromium_sc',
                '10x_chromium_sc_atac',
        ):
            if bases_mask == "auto":
                print "Determining bases mask from RunInfo.xml"
                bases_mask = get_bases_mask(illumina_run.runinfo_xml,
                                            sample_sheet)
                if not bases_mask_is_valid(bases_mask):
                    raise Exception("Invalid bases mask: '%s'" % bases_mask)
        # Do fastq generation according to protocol
        if protocol == 'icell8':
            # ICell8 data
            # Update bcl2fastq settings appropriately
            print "Updating read trimming and masking for ICell8"
            minimum_trimmed_read_length = 21
            mask_short_adapter_reads = 0
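            # (Assumed rationale: ICELL8 read 1 carries an inline
            # barcode plus UMI, so at least 21 bases must survive
            # trimming and masking of short adapter reads is disabled)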
            # Reset the default bases mask
            bases_mask = IlluminaData.IlluminaRunInfo(
                illumina_run.runinfo_xml).bases_mask
            bases_mask = get_icell8_bases_mask(bases_mask,
                                               sample_sheet=sample_sheet)
            if not bases_mask_is_valid(bases_mask):
                raise Exception("Invalid bases mask: '%s'" % bases_mask)
            # Switch to standard protocol
            protocol = 'standard'
        if protocol == 'standard':
            # Standard protocol
            try:
                exit_code = bcl_to_fastq(
                    ap,
                    unaligned_dir=ap.params.unaligned_dir,
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    require_bcl2fastq=require_bcl2fastq_version,
                    bases_mask=bases_mask,
                    ignore_missing_bcl=ignore_missing_bcl,
                    ignore_missing_stats=ignore_missing_stats,
                    no_lane_splitting=no_lane_splitting,
                    minimum_trimmed_read_length=minimum_trimmed_read_length,
                    mask_short_adapter_reads=mask_short_adapter_reads,
                    nprocessors=nprocessors,
                    runner=runner)
            except Exception as ex:
                raise Exception("Bcl2fastq stage failed: '%s'" % ex)
        elif protocol == '10x_chromium_sc':
            # 10xGenomics Chromium SC
            if bases_mask == 'auto':
                bases_mask = None
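            # ('auto' is translated to None here on the assumption
            # that cellranger mkfastq derives its own bases mask, so
            # none is computed from RunInfo.xml for this protocol)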
            try:
                # Check we have cellranger
                cellranger = find_program('cellranger')
                if not cellranger:
                    raise Exception("No cellranger package found")
                cellranger_software_info = cellranger_info(cellranger)
                print "Using cellranger %s: %s" % \
                    (cellranger_software_info[-1],
                     cellranger)
                # Check we have bcl2fastq
                bcl2fastq = find_program('bcl2fastq')
                if not bcl2fastq:
                    raise Exception("No bcl2fastq package found")
                bcl2fastq = available_bcl2fastq_versions(
                    paths=(os.path.dirname(bcl2fastq), ), reqs='>=2.17')
                if not bcl2fastq:
                    raise Exception("No appropriate bcl2fastq software "
                                    "located")
                bcl2fastq = bcl2fastq[0]
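                # (assumes available_bcl2fastq_versions returns the
                # matching executables in order of preference, so the
                # first entry is the best match)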
                bcl2fastq_info = bcl_to_fastq_info(bcl2fastq)
                print "Using bcl2fastq %s: %s" % (bcl2fastq_info[-1],
                                                  bcl2fastq)
                # Store info on bcl2fastq package
                ap.metadata['bcl2fastq_software'] = bcl2fastq_info
                # Store info on cellranger package
                ap.metadata['cellranger_software'] = cellranger_software_info
                # Put a copy of sample sheet in the log directory
                shutil.copy(sample_sheet, ap.log_dir)
                # Determine output directory absolute path
                output_dir = ap.params.unaligned_dir
                if not os.path.isabs(output_dir):
                    output_dir = os.path.join(ap.analysis_dir, output_dir)
                # Run cellranger mkfastq
                exit_code = run_cellranger_mkfastq(
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    output_dir=output_dir,
                    lanes=(None if lanes is None else ','.join(
                        [str(l) for l in lanes])),
                    bases_mask=bases_mask,
                    cellranger_exe=cellranger,
                    cellranger_jobmode=cellranger_jobmode,
                    cellranger_maxjobs=cellranger_maxjobs,
                    cellranger_mempercore=cellranger_mempercore,
                    cellranger_jobinterval=cellranger_jobinterval,
                    cellranger_localcores=cellranger_localcores,
                    cellranger_localmem=cellranger_localmem,
                    working_dir=ap.analysis_dir,
                    log_dir=ap.log_dir)
            except Exception as ex:
                raise Exception("'cellranger mkfastq' stage failed: "
                                "'%s'" % ex)
            # Turn off barcode analysis
            analyse_barcodes = False
        elif protocol == '10x_chromium_sc_atac':
            # 10xGenomics Chromium scATAC-seq
            exit_code = bcl_to_fastq_10x_chromium_sc_atac(
                ap,
                output_dir=ap.params.unaligned_dir,
                sample_sheet=sample_sheet,
                primary_data_dir=primary_data_dir,
                lanes=lanes,
                bases_mask=bases_mask,
                cellranger_jobmode=cellranger_jobmode,
                cellranger_maxjobs=cellranger_maxjobs,
                cellranger_mempercore=cellranger_mempercore,
                cellranger_jobinterval=cellranger_jobinterval,
                cellranger_localcores=cellranger_localcores,
                cellranger_localmem=cellranger_localmem,
                log_dir=ap.log_dir)
            # Turn off barcode analysis
            analyse_barcodes = False
        else:
            # Unknown protocol
            raise Exception("Unknown protocol '%s'" % protocol)
        # Check the outputs
        if exit_code != 0:
            raise Exception("Fastq generation finished with error: "
                            "exit code %d" % exit_code)
        if not verify_fastq_generation(
                ap, lanes=lanes, include_sample_dir=verify_include_sample_dir):
            # Check failed
            logger.error("Failed to verify output Fastqs against "
                         "sample sheet")
            # Try to load the data from unaligned dir
            try:
                illumina_data = IlluminaData.IlluminaData(
                    ap.analysis_dir, unaligned_dir=ap.params.unaligned_dir)
            except IlluminaData.IlluminaDataError as ex:
                raise Exception("Unable to load data from %s: %s" %
                                (ap.params.unaligned_dir, ex))
            # Generate a list of missing Fastqs
            missing_fastqs = IlluminaData.list_missing_fastqs(
                illumina_data,
                sample_sheet,
                include_sample_dir=verify_include_sample_dir)
            assert (len(missing_fastqs) > 0)
            missing_fastqs_file = os.path.join(ap.log_dir,
                                               "missing_fastqs.log")
            print "Writing list of missing Fastq files to %s" % \
                missing_fastqs_file
            with open(missing_fastqs_file, 'w') as fp:
                for fq in missing_fastqs:
                    fp.write("%s\n" % fq)
            # Create empty FASTQs
            if create_empty_fastqs is None:
                try:
                    create_empty_fastqs = \
                        ap.settings.platform[ap.metadata.platform].\
                        create_empty_fastqs
                except (KeyError, AttributeError):
                    pass
            if create_empty_fastqs is None:
                create_empty_fastqs = \
                    ap.settings.bcl2fastq.create_empty_fastqs
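            # (Settings cascade: a per-platform 'create_empty_fastqs'
            # setting, if defined, is assumed to take precedence over
            # the global bcl2fastq default looked up here)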
            if create_empty_fastqs:
                logger.warning("Making 'empty' placeholder Fastqs")
                for fq in missing_fastqs:
                    fastq = os.path.join(ap.analysis_dir,
                                         ap.params.unaligned_dir, fq)
                    print "-- %s" % fastq
                    if not os.path.exists(os.path.dirname(fastq)):
                        mkdirs(os.path.dirname(fastq))
                    with gzip.GzipFile(filename=fastq, mode='wb') as fp:
                        fp.write('')
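                    # (writing an empty string still yields a valid
                    # gzip file, so downstream tools see a readable,
                    # zero-read Fastq rather than a missing file)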
            else:
                raise Exception("Fastq generation failed to produce "
                                "expected outputs")
    # Generate statistics
    if generate_stats:
        fastq_statistics(ap,
                         stats_file=stats_file,
                         per_lane_stats_file=per_lane_stats_file,
                         unaligned_dir=ap.params.unaligned_dir,
                         nprocessors=nprocessors,
                         runner=runner)
    # Run barcode analysis
    if analyse_barcodes:
        # Determine output directory
        if barcode_analysis_dir is not None:
            ap.params['barcode_analysis_dir'] = barcode_analysis_dir
        elif ap.params.barcode_analysis_dir is None:
            ap.params['barcode_analysis_dir'] = 'barcode_analysis'
        barcode_analysis_dir = ap.params.barcode_analysis_dir
        if not os.path.isabs(barcode_analysis_dir):
            barcode_analysis_dir = os.path.join(ap.params.analysis_dir,
                                                barcode_analysis_dir)
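        # (so a relative name, e.g. the default 'barcode_analysis',
        # resolves against the analysis directory)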
        # Report title
        title = "Barcode analysis for %s" % ap.metadata.run_name
        # Log file
        log_file = os.path.join(ap.log_dir, "analyse_barcodes.log")
        # Set up runner
        if runner is None:
            runner = ap.settings.general.default_runner
        runner.set_log_dir(ap.log_dir)
        # Get scheduler parameters
        max_jobs = ap.settings.general.max_concurrent_jobs
        poll_interval = ap.settings.general.poll_interval
        # Create and run barcode analysis pipeline
        barcode_analysis = AnalyseBarcodes(
            os.path.join(ap.params.analysis_dir, ap.params.unaligned_dir))
        barcode_analysis.run(barcode_analysis_dir,
                             title=title,
                             lanes=lanes,
                             sample_sheet=sample_sheet,
                             log_file=log_file,
                             runner=runner,
                             max_jobs=max_jobs,
                             poll_interval=poll_interval,
                             verbose=False)
    # Make a 'projects.info' metadata file
    if lanes:
        ap.update_project_metadata_file()
    else:
        ap.make_project_metadata_file()
    # Remove primary data
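    # (NB as written the boolean 'remove_primary_data' argument shadows
    # the like-named helper inside this function, so the call below
    # would raise TypeError when the flag is set; assuming a module-
    # level remove_primary_data(ap) helper is intended, it would need
    # rebinding, e.g. via an import alias, before it can be called)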
    if remove_primary_data:
        remove_primary_data(ap)