def load_illumina_data(self, unaligned_dir=None):
    """
    Load and return an IlluminaData object

    Arguments:
      unaligned_dir (str): name of the bcl2fastq output
        subdirectory to load; if None then the value of the
        stored 'unaligned_dir' parameter is used instead

    Returns:
      IlluminaData: instance created for the analysis
        directory and the selected subdirectory, or None if
        no subdirectory could be determined (an error is
        logged in that case).
    """
    # Explicit argument takes precedence over the stored parameter
    target_dir = unaligned_dir
    if target_dir is None:
        target_dir = self.params.unaligned_dir
    if target_dir is not None:
        return IlluminaData.IlluminaData(self.analysis_dir,
                                         unaligned_dir=target_dir)
    # Nothing supplied and nothing stored
    logging.error("Unaligned directory not specified, cannot load data")
    return None
def get_fastqs_from_dir(dirn, lane, unaligned_dir=None):
    """Automatically collect Fastq files for specified lane

    Arguments:
      dirn (str): directory handed to IlluminaData as the
        location of the data
      lane: lane of interest (not referenced in the part of
        the function visible here)
      unaligned_dir (str): passed through to IlluminaData as
        the 'unaligned_dir' keyword

    If the data cannot be loaded then a message is written to
    stderr and the process exits with status 1.
    """
    # NOTE(review): the visible body ends once the data has been
    # loaded; confirm whether the remainder of this function lives
    # outside this excerpt
    try:
        illumina_data = IlluminaData.IlluminaData(
            dirn, unaligned_dir=unaligned_dir)
    except Exception as ex:
        # 'except ... as ...' is valid on Python 2.6+ and Python 3,
        # unlike the comma form
        sys.stderr.write("Unable to read fastqs from %s: %s\n" % (dirn, ex))
        sys.exit(1)
def verify_fastq_generation(ap, unaligned_dir=None, lanes=None,
                            include_sample_dir=False):
    """Check that generated Fastqs match sample sheet predictions

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to do Fastqs verification on
      unaligned_dir (str): explicitly specify the bcl2fastq output
        directory to check (defaults to the 'unaligned_dir'
        parameter of the autoprocessor)
      lanes (list): specify a list of lane numbers (integers) to
        check (others will be ignored)
      include_sample_dir (bool): if True then include a
        'sample_name' directory level when checking for
        bcl2fastq2 outputs, even if one shouldn't be present

    Returns:
      True if outputs match sample sheet, False otherwise
        (including when the output directory is missing or
        cannot be loaded).

    Raises:
      Exception: if no bcl2fastq output directory was supplied
        and none is stored in the autoprocessor parameters.
    """
    if unaligned_dir is None:
        if ap.params.unaligned_dir is not None:
            unaligned_dir = ap.params.unaligned_dir
        else:
            raise Exception("Bcl2fastq output directory not defined")
    # Function-call form of print works on both Python 2 and 3
    print("Checking bcl2fastq output directory '%s'" % unaligned_dir)
    bcl_to_fastq_dir = os.path.join(ap.analysis_dir, unaligned_dir)
    if not os.path.isdir(bcl_to_fastq_dir):
        # Directory doesn't exist
        return False
    # Make a temporary sample sheet to verify against; the
    # timestamp in the name keeps successive checks apart
    tmp_sample_sheet = os.path.join(
        ap.tmp_dir,
        "SampleSheet.verify.%s.csv" % time.strftime("%Y%m%d%H%M%S"))
    make_custom_sample_sheet(ap.params.sample_sheet,
                             tmp_sample_sheet,
                             lanes=lanes)
    # Try to create an IlluminaData object
    try:
        illumina_data = IlluminaData.IlluminaData(
            ap.analysis_dir,
            unaligned_dir=unaligned_dir)
    except IlluminaData.IlluminaDataError as ex:
        # Failed to initialise
        logger.warning("Failed to get information from %s: %s" %
                       (bcl_to_fastq_dir, ex))
        return False
    # Do check
    return IlluminaData.verify_run_against_sample_sheet(
        illumina_data,
        tmp_sample_sheet,
        include_sample_dir=include_sample_dir)
def get_fastqs_from_dir(dirn, lane, unaligned_dir=None):
    """
    Collect Fastq files for specified lane

    Fastqs are gathered from the samples in every project and
    then from the undetermined samples (if there are any),
    keeping only those whose lane number equals 'lane'.

    Arguments:
      dirn (str): path to directory to collect Fastq
        files from
      lane (int): lane Fastqs must have come from
      unaligned_dir (str): subdirectory of 'dirn' with
        outputs from bcl2fastq

    Returns:
      List: list of R1 Fastqs (for single ended data), or
        list of comma-joined "R1,R2" Fastq pairs built from
        the sorted R1 and R2 lists (for pair ended data).

    Raises:
      Exception: if the data cannot be loaded from 'dirn'.
    """
    try:
        illumina_data = IlluminaData.IlluminaData(
            dirn, unaligned_dir=unaligned_dir)
    except Exception as ex:
        raise Exception("Unable to read fastqs from %s: %s\n" %
                        (dirn, ex))
    paired_end = illumina_data.paired_end
    # Samples to examine: project samples first, then undetermined
    samples = [sample
               for project in illumina_data.projects
               for sample in project.samples]
    if illumina_data.undetermined:
        samples.extend(illumina_data.undetermined.samples)
    # Matching Fastqs, keyed by read number
    matched = {1: [], 2: []}
    for sample in samples:
        for read_number in (1, 2):
            for fastq in sample.fastq_subset(read_number=read_number,
                                             full_path=True):
                if IlluminaData.IlluminaFastq(fastq).lane_number == lane:
                    matched[read_number].append(fastq)
    if not paired_end:
        return matched[1]
    # Pair up R1 and R2 after sorting each list
    return ["%s,%s" % (fq1, fq2)
            for fq1, fq2 in zip(sorted(matched[1]), sorted(matched[2]))]
def detect_unaligned_dir(self):
    """
    Look for an existing directory of bcl2fastq outputs

    Tries the subdirectories 'bcl2fastq' and 'Unaligned' of the
    analysis directory, in that order, and returns the name of
    the first one which loads successfully as IlluminaData.

    Returns:
      String: name of the detected subdirectory, or None if
        neither candidate exists and loads.
    """
    for candidate in ('bcl2fastq', 'Unaligned'):
        if not os.path.isdir(os.path.join(self.analysis_dir, candidate)):
            # No such subdirectory, move on to the next name
            continue
        logging.debug(
            "Testing subdirectory '%s' to see if it has sequence data" %
            candidate)
        try:
            IlluminaData.IlluminaData(self.analysis_dir,
                                      unaligned_dir=candidate)
        except IlluminaData.IlluminaDataError:
            logging.debug("Unable to load data from %s" % candidate)
        else:
            print("Setting 'unaligned_dir' parameter to %s" % candidate)
            return candidate
    # Unable to detect existing data directory
    return None
"'NY_ChIP-seq'. Use multiple --expt=... to set the types for different " "projects") p.add_option("--keep-names",action="store_true",dest="keep_names",default=False, help="preserve the full names of the source fastq files when creating links") p.add_option("--merge-replicates",action="store_true",dest="merge_replicates",default=False, help="create merged fastq files for each set of replicates detected") # Parse command line options,args = p.parse_args() # Get data directory name if len(args) != 1: p.error("expected one argument (location of Illumina analysis dir)") illumina_analysis_dir = os.path.abspath(args[0]) # Populate Illumina data object illumina_data = IlluminaData.IlluminaData(illumina_analysis_dir, unaligned_dir=options.unaligned_dir) # Assign experiment types for expt in options.expt_type: name,type_ = expt.split(':') illumina_data.get_project(name).expt_type = type_ # Create and populate per-project directory structure for project in illumina_data.projects: create_analysis_dir(project, top_dir=illumina_analysis_dir, merge_replicates=options.merge_replicates, keep_names=options.keep_names, dry_run=options.dry_run)
def get_analysis_projects_from_dirs(self, pattern=None, strict=False):
    """
    List the AnalysisProjects found by scanning the analysis directory

    Each subdirectory in the top level of the analysis directory
    is examined in turn. Subdirectories which load successfully
    as IlluminaData (i.e. look like CASAVA/bcl2fastq outputs)
    are rejected; each remaining one is loaded as an
    AnalysisProject and kept only if it passes the checks
    selected by 'strict'.

    Unlike the `get_analysis_projects` method, no checking
    against the project metadata (typically in 'projects.info')
    is performed.

    If the 'pattern' is not None then it should be a simple
    pattern used to match against available names to select
    a subset of projects (see bcf_utils.name_matches).

    Arguments:
      pattern (str): optional pattern to select a subset
        of projects (default: select all projects)
      strict (bool): if True then each project must report
        itself as an analysis directory; otherwise it only
        needs to contain at least one sample (default)

    Returns:
      List: list of AnalysisProject instances.
    """
    logging.debug("Testing subdirectories to determine analysis projects")
    name_pattern = '*' if pattern is None else pattern
    matched = []
    for subdir in bcf_utils.list_dirs(self.analysis_dir):
        # Anything that loads as bcl2fastq output is not a project
        try:
            IlluminaData.IlluminaData(self.analysis_dir,
                                      unaligned_dir=subdir)
        except IlluminaData.IlluminaDataError:
            # Not bcl2fastq output: carry on and try it as a project
            pass
        except Exception as ex:
            # Unexpected failures are reported but don't stop the
            # subdirectory being tried as a project
            logging.debug("Exception when attempting to load "
                          "subdir '%s' as CASAVA/bcl2fastq output "
                          "(ignored): %s" % (subdir, ex))
        else:
            logging.debug("* %s: rejected" % subdir)
            continue
        candidate = AnalysisProject(
            subdir,
            os.path.join(self.analysis_dir, subdir))
        if strict:
            # Strict check: must be recognised as an analysis dir
            if not candidate.is_analysis_dir:
                logging.debug("* %s: rejected (failed strict checks)" %
                              subdir)
                continue
        elif len(candidate.samples) == 0:
            # Basic check: must have at least one sample
            logging.debug("* %s: rejected (no samples)" % subdir)
            continue
        logging.debug("* %s: analysis directory" % subdir)
        if bcf_utils.name_matches(candidate.name, name_pattern):
            matched.append(candidate)
    return matched
def update_project_metadata_file(self, unaligned_dir=None,
                                 project_metadata_file='projects.info'):
    """
    Update project metadata file from bcl2fastq outputs

    Merges the projects found in a bcl-to-fastq output
    directory with the entries of the project metadata file
    (default: "projects.info") and saves the result.

    Entries which are in the metadata file but not in the
    bcl-to-fastq outputs are retained; they are written with a
    leading '#' if they were already commented out, or if no
    directory with the project name exists in the analysis
    directory.

    Both arguments, when not None, are also stored in the
    corresponding parameters ('unaligned_dir' and
    'project_metadata').

    Arguments:
      unaligned_dir (str): path to the bcl-to-fastq
        output directory relative to the analysis dir.
        Defaults to the unaligned dir stored in the
        analysis directory parameter file.
      project_metadata_file (str): optional, path to the
        project metadata file to update
    """
    # Store any explicitly supplied settings
    if project_metadata_file is not None:
        self.params['project_metadata'] = project_metadata_file
    logging.debug("Project metadata file: %s" %
                  self.params.project_metadata)
    filen = os.path.join(self.analysis_dir,
                         self.params.project_metadata)
    if unaligned_dir is not None:
        self.params['unaligned_dir'] = unaligned_dir
    logging.debug("Unaligned_dir: %s" % self.params.unaligned_dir)
    illumina_data = IlluminaData.IlluminaData(
        self.analysis_dir,
        unaligned_dir=self.params.unaligned_dir)
    # Start from the existing metadata file if there is one
    if not os.path.exists(filen):
        logging.debug("Creating new project metadata file: %s" % filen)
        project_metadata = ProjectMetadataFile()
    else:
        logging.debug("Loading project metadata from existing file: %s" %
                      filen)
        project_metadata = ProjectMetadataFile(filen)
    # Projects and sorted sample names from the bcl2fastq outputs
    projects = {}
    for project in illumina_data.projects:
        projects[project.name] = sorted(
            sample.name for sample in project.samples)
    # Carry over entries which only exist in the metadata file
    for line in project_metadata:
        stored_name = line['Project']
        name = stored_name.lstrip('#')
        if name in projects:
            continue
        project_dir = os.path.join(self.analysis_dir, name)
        if stored_name.startswith('#') or not os.path.exists(project_dir):
            # Keep (or make) the entry commented out
            name = "#%s" % name
        projects[name] = line['Samples'].split(',')
    # Populate/update the metadata
    for name, sample_names in projects.items():
        if name in project_metadata:
            project_metadata.update_project(name,
                                            sample_names=sample_names)
        else:
            project_metadata.add_project(name, sample_names)
    # Save
    project_metadata.save(filen)
    print("Updated project metadata file '%s'" %
          self.params.project_metadata)
else: lanes = [] for line in sample_sheet: lane = int(line['Lane']) if lane not in lanes: lanes.append(lane) barcodes = get_barcodes_from_sample_sheet(sample_sheet, lanes=lanes, length=options.length) match_barcodes(counts,barcodes, nseqs=options.n, max_mismatches=options.mismatches, cutoff=options.cutoff, fp=fp) elif len(args) == 1 and os.path.isdir(args[0]): # Dealing with a bclToFastq output dir illumina_data = IlluminaData.IlluminaData(os.path.dirname(args[0]), unaligned_dir=os.path.basename(args[0])) # Assign fastqs to lanes (R1 only) fastq_in_lane = dict() for p in illumina_data.projects: for s in p.samples: for f in s.fastq_subset(read_number=1,full_path=True): lane = IlluminaData.IlluminaFastq(f).lane_number if lane not in fastq_in_lane: fastq_in_lane[lane] = [] fastq_in_lane[lane].append(f) if illumina_data.undetermined: for s in illumina_data.undetermined.samples: for f in s.fastq_subset(read_number=1,full_path=True): lane = IlluminaData.IlluminaFastq(f).lane_number if lane not in fastq_in_lane: fastq_in_lane[lane] = []
def __init__(self, unaligned_dir=None):
    """
    Create a new AnalyseBarcodes pipeline instance

    Declares the pipeline parameters, loads the bcl2fastq
    outputs, and builds the pipeline: set up the output
    directories, count barcodes for each project (and for the
    undetermined Fastqs, if present), list the counts files,
    then analyse and report.

    Arguments:
      unaligned_dir (str): path to the directory with
        outputs from bcl2fastq

    Raises:
      TypeError: if 'unaligned_dir' is None.
      OSError: if 'unaligned_dir' doesn't exist.
    """
    # Initialise the pipeline superclass
    Pipeline.__init__(self, name="Analyse Barcodes")

    # Define parameters
    self.add_param('barcode_analysis_dir', type=str)
    self.add_param('counts_dir', type=str)
    self.add_param('title', type=str)
    self.add_param('lanes', type=list)
    self.add_param('sample_sheet', type=str)
    self.add_param('bases_mask', type=str)
    self.add_param('mismatches', type=int)
    self.add_param('cutoff', type=float)
    self.add_param('force', type=bool, value=False)

    # Load data from bcl2fastq output
    if unaligned_dir is None:
        # Passing None on to os.path.exists would fail with an
        # unhelpful TypeError; keep the type but say what's wrong
        raise TypeError("'unaligned_dir' must be specified")
    if not os.path.exists(unaligned_dir):
        raise OSError("'%s': not found" % unaligned_dir)
    # Split into the parent (analysis) directory and the name of
    # the bcl2fastq output subdirectory
    analysis_dir = os.path.abspath(os.path.dirname(unaligned_dir))
    unaligned_dir = os.path.basename(unaligned_dir)
    illumina_data = IlluminaData.IlluminaData(analysis_dir,
                                              unaligned_dir=unaligned_dir)

    ####################
    # Build the pipeline
    ####################

    # Setup barcode analysis and counts directories
    setup_barcode_analysis_dir = SetupBarcodeAnalysisDirs(
        "Setup barcode analysis directory",
        self.params.barcode_analysis_dir,
        self.params.counts_dir,
        force=self.params.force)
    self.add_task(setup_barcode_analysis_dir)

    # Generate counts for Fastqs in each project
    count_tasks = []
    for project in illumina_data.projects:
        count_barcodes = CountBarcodes(
            "Count barcodes in '%s'" % project.name,
            project,
            self.params.counts_dir,
            lanes=self.params.lanes)
        self.add_task(count_barcodes,
                      requires=(setup_barcode_analysis_dir,))
        count_tasks.append(count_barcodes)

    # Generate counts for undetermined Fastqs
    if illumina_data.undetermined is not None:
        count_barcodes = CountBarcodes(
            "Count barcodes in 'undetermined'",
            illumina_data.undetermined,
            self.params.counts_dir,
            lanes=self.params.lanes,
            use_project_name="undetermined")
        self.add_task(count_barcodes,
                      requires=(setup_barcode_analysis_dir,))
        count_tasks.append(count_barcodes)

    # List the counts files (once all the counting is done)
    list_counts_files = ListBarcodeCountFiles(
        "Fetch the barcode counts files",
        self.params.counts_dir)
    self.add_task(list_counts_files,
                  requires=count_tasks)

    # Analyse counts and report the results
    report_barcodes = ReportBarcodeAnalysis(
        "Report barcode analysis",
        list_counts_files.output.counts_files,
        self.params.barcode_analysis_dir,
        sample_sheet=self.params.sample_sheet,
        lanes=self.params.lanes,
        mismatches=self.params.mismatches,
        cutoff=self.params.cutoff,
        title=self.params.title)
    self.add_task(report_barcodes,
                  requires=(list_counts_files,))

    # Add final outputs to the pipeline
    self.add_output('report_file', report_barcodes.output.report_file)
    self.add_output('xls_file', report_barcodes.output.xls_file)
    self.add_output('html_file', report_barcodes.output.html_file)
"when creating links") p.add_argument("--merge-replicates",action="store_true", dest="merge_replicates",default=False, help="create merged fastq files for each set of " "replicates detected") p.add_argument('illumina_data_dir', help="top-level directory containing the 'Unaligned' " "directory with the fastq.gz files") # Parse command line args = p.parse_args() # Get data directory name illumina_analysis_dir = os.path.abspath(args.illumina_data_dir) # Populate Illumina data object illumina_data = IlluminaData.IlluminaData(illumina_analysis_dir, unaligned_dir=args.unaligned_dir) # Assign experiment types for expt in args.expt_type: name,type_ = expt.split(':') illumina_data.get_project(name).expt_type = type_ # Create and populate per-project directory structure for project in illumina_data.projects: create_analysis_dir(project, top_dir=illumina_analysis_dir, merge_replicates=args.merge_replicates, keep_names=args.keep_names, dry_run=args.dry_run)
logging.fatal("No file '%s': cannot update" % existing_stats_file) sys.exit(1) else: existing_stats_file = None # Ignore 'force' if options.force: logger.warn("ignoring deprecated option '--force'") # Handle debugging output if requested if options.debug: logging.getLogger("auto_process_ngs").setLevel(logging.DEBUG) # Get the data from FASTQ files try: illumina_data = IlluminaData.IlluminaData( args[0], unaligned_dir=options.unaligned_dir) except IlluminaData.IlluminaDataError, ex: logger.critical("failed to get data from %s: %s" % (args[0], ex)) sys.exit(1) # Generate statistics for fastq files stats = FastqStatistics(illumina_data, n_processors=options.n, add_to=existing_stats_file) stats.report_full_stats(options.full_stats_file) print "Full statistics written to %s" % options.full_stats_file stats.report_basic_stats(options.stats_file) print "Basic statistics written to %s" % options.stats_file stats.report_per_lane_sample_stats(options.per_lane_sample_stats_file) print "Per-lane sample statistics written to %s" % \ options.per_lane_sample_stats_file stats.report_per_lane_summary_stats(options.per_lane_stats_file)
def merge_fastq_dirs(ap, primary_unaligned_dir, output_dir=None,
                     dry_run=False):
    """
    Combine multiple 'unaligned' output directories into one

    This method combines the output from multiple runs of
    CASAVA/bcl2fastq into a single 'unaligned'-equivalent
    directory.

    Currently it operates in an automatic mode and should
    detect additional 'unaligned' dirs on its own (subdirectories
    ending with '.bak' or starting with 'save.' are ignored).

    The merged data are assembled in '<output_dir>.new'; on
    success the source directories are renamed to 'save.<name>',
    the merged directory is renamed to 'output_dir', and a new
    'projects.info' file is generated.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the parent
        analysis directory
      primary_unaligned_dir (str): the 'unaligned' dir that
        data from from all others will be put into (relative
        path), unless overridden by 'output_dir' argument
      output_dir (str): optional, new 'unaligned' dir that
        will be created to hold merged data (relative path,
        defaults to 'primary_unaligned_dir')
      dry_run (boolean): if True then just report operations
        that would have been performed.

    Returns:
      Integer: 0 on success (or if there was nothing to merge),
        otherwise the sum of the exit statuses of the copy and
        merge jobs.

    Raises:
      Exception: if the primary dir is not specified or cannot
        be loaded, if the extra dirs are inconsistent with the
        primary dir, or if the same project (or undetermined
        sample) appears in more than one extra dir.
    """
    if primary_unaligned_dir is None:
        raise Exception("Primary unaligned dir not defined")
    # Output directory
    if output_dir is None:
        output_dir = primary_unaligned_dir
    print("Fastqs will be merged into '%s'" % output_dir)
    # Collect unaligned dirs
    print("Collecting bcl2fastq directories")
    primary_illumina_data = None
    unaligned_dirs = {}
    for dirn in list_dirs(ap.analysis_dir):
        try:
            illumina_data = IlluminaData.IlluminaData(ap.analysis_dir,
                                                      unaligned_dir=dirn)
            if dirn == primary_unaligned_dir:
                print("* %s (primary dir)" % dirn)
                primary_illumina_data = illumina_data
            elif dirn.endswith(".bak") or dirn.startswith("save."):
                print("Ignoring %s" % dirn)
            else:
                print("* %s" % dirn)
                unaligned_dirs[dirn] = illumina_data
        except Exception as ex:
            # Anything which fails to load isn't bcl2fastq output
            logger.debug("Rejecting %s: %s" % (dirn, ex))
    # Check primary unaligned dir
    if primary_illumina_data is None:
        raise Exception("Primary dir '%s' doesn't exist, or doesn't "
                        "contain data?" % primary_unaligned_dir)
    # Is there anything to do?
    if not unaligned_dirs:
        print("No extra bcl2fastq output directories found, nothing to do")
        return 0
    # Make log directory and set up scheduler (if not dry run)
    if not dry_run:
        ap.set_log_dir(ap.get_log_subdir('merge_fastq_dirs'))
        runner = ap.settings.general.default_runner
        runner.set_log_dir(ap.log_dir)
        sched = SimpleScheduler(
            runner=runner,
            max_concurrent=ap.settings.general.max_concurrent_jobs,
            poll_interval=ap.settings.general.poll_interval)
        sched.start()
        jobs = []
    # Top-level for undetermined reads
    # (the primary dir might not have any undetermined reads, in
    # which case there is nothing to dereference)
    primary_undetermined = primary_illumina_data.undetermined
    if primary_undetermined is not None and \
       primary_undetermined.dirn != primary_illumina_data.unaligned_dir:
        undetermined_dir = os.path.basename(primary_undetermined.dirn)
    else:
        undetermined_dir = None
    # Do sanity checks before proceeding
    print("Checking primary data directory")
    fmt = primary_illumina_data.format
    paired_end = primary_illumina_data.paired_end
    no_lane_splitting = (len(primary_illumina_data.lanes) == 1) \
                        and (primary_illumina_data.lanes[0] is None)
    print("* Format: %s" % fmt)
    print("* no-lane-splitting: %s" % ('yes' if no_lane_splitting
                                       else 'no'))
    print("* paired-end: %s" % ('yes' if paired_end else 'no'))
    print("* undetermined dir: %s" % undetermined_dir)
    consistent_data = True
    for unaligned_dir in unaligned_dirs:
        illumina_data = unaligned_dirs[unaligned_dir]
        fmt0 = illumina_data.format
        # Lane splitting must be judged from this directory's own
        # lanes, not from those of the primary directory
        no_lane_splitting0 = (len(illumina_data.lanes) == 1) \
                             and (illumina_data.lanes[0] is None)
        if (fmt0 != fmt) or (no_lane_splitting0 != no_lane_splitting):
            print("!!! %s: inconsistent format to primary data dir !!!" %
                  unaligned_dir)
            consistent_data = False
    if not consistent_data:
        raise Exception("Data directories not consistent with primary "
                        "dir '%s'" % primary_unaligned_dir)
    # Collect the projects from the extra directories
    projects = []
    undetermined = []
    for unaligned_dir in unaligned_dirs:
        print("Examining projects in %s:" % unaligned_dir)
        illumina_data = unaligned_dirs[unaligned_dir]
        for project in illumina_data.projects:
            if any(p.name == project.name for p in projects):
                raise Exception("collision: %s already exists" %
                                project.name)
            print("- %s: will be merged in" % project.name)
            projects.append(project)
        # Deal with undetermined reads
        if illumina_data.undetermined is not None:
            print("Examining undetermined samples:")
            if no_lane_splitting:
                # No lane info: should merge undetermined fastqs
                for sample in illumina_data.undetermined.samples:
                    print("- %s: reads will be concatenated" %
                          sample.name)
                    undetermined.append(sample)
            else:
                for sample in illumina_data.undetermined.samples:
                    if any(s.name == sample.name for s in undetermined):
                        raise Exception("collision: %s already exists" %
                                        sample.name)
                    print("- %s: will be merged in" % sample.name)
                    undetermined.append(sample)
        else:
            print("No undetermined samples")
    # Collect any remaining projects from the primary
    # unaligned directory
    print("Examining projects in primary dir %s:" %
          primary_unaligned_dir)
    for project in primary_illumina_data.projects:
        if any(p.name == project.name for p in projects):
            print("- %s: already exists, will be discarded" %
                  project.name)
        else:
            print("- %s: will be merged in" % project.name)
            projects.append(project)
    # Sort out the undetermined reads
    # (samples from the primary dir go to the front of the list)
    print("Examining undetermined samples:")
    if primary_undetermined is None:
        print("No undetermined samples")
    elif no_lane_splitting:
        # No lane info: should merge undetermined fastqs
        for sample in primary_undetermined.samples:
            print("- %s: reads will be concatenated" % sample.name)
            undetermined.insert(0, sample)
    else:
        for sample in primary_undetermined.samples:
            if any(s.name == sample.name for s in undetermined):
                print("- %s: already exists, will be discarded" %
                      sample.name)
            else:
                print("- %s: will be merged in" % sample.name)
                undetermined.insert(0, sample)
    # Make a new directory for the merging
    merge_dir = os.path.join(ap.analysis_dir, output_dir + ".new")
    if undetermined_dir is not None:
        merge_undetermined_dir = os.path.join(merge_dir, undetermined_dir)
    else:
        merge_undetermined_dir = merge_dir
    if not dry_run:
        print("Making temporary merge directory %s" % merge_dir)
        mkdir(merge_dir)
        if not os.path.exists(merge_undetermined_dir):
            print("Making directory for undetermined %s" %
                  merge_undetermined_dir)
            mkdir(merge_undetermined_dir)
    # Copy the projects
    print("Importing projects:")
    for project in projects:
        print("- %s" % project.name)
        project_dir = os.path.join(merge_dir,
                                   os.path.basename(project.dirn))
        cmd = copytree_command(project.dirn, project_dir)
        print("- Running %s" % cmd)
        if not dry_run:
            job = sched.submit(cmd,
                               name="copy_project.%s" % project.name,
                               wd=merge_dir)
            print("Job: %s" % job)
            jobs.append(job)
    # Handle the undetermined reads
    print("Dealing with undetermined reads:")
    if no_lane_splitting:
        # No lane info: merge undetermined fastqs
        if not undetermined:
            # Nothing was collected so there is nothing to copy or
            # concatenate
            print("- no undetermined Fastqs to merge")
        elif len(undetermined) == 1:
            # Only one undetermined sample - copy Fastqs
            # (take it from the list explicitly rather than relying
            # on whatever an earlier loop left in 'sample')
            sample = undetermined[0]
            for read in (1, 2):
                if read == 2 and not paired_end:
                    break
                fastqs = sample.fastq_subset(read_number=read,
                                             full_path=True)
                for fq in fastqs:
                    cmd = copy_command(fq, merge_undetermined_dir)
                    print("- Running %s" % cmd)
                    if not dry_run:
                        job = sched.submit(
                            cmd,
                            name="copy_undetermined.R%s" % read,
                            wd=merge_dir)
                        print("Job: %s" % job)
                        jobs.append(job)
        else:
            # Multiple undetermined samples - concat Fastqs
            for read in (1, 2):
                if read == 2 and not paired_end:
                    break
                cmd = Command('concat_fastqs.py')
                for sample in undetermined:
                    fastqs = sample.fastq_subset(read_number=read,
                                                 full_path=True)
                    cmd.add_args(*fastqs)
                # Output file is the final argument
                cmd.add_args(
                    os.path.join(merge_undetermined_dir,
                                 "Undetermined_S0_R%s_001.fastq.gz" % read))
                print("- Running %s" % cmd)
                if not dry_run:
                    job = sched.submit(cmd,
                                       name="merge_undetermined.R%s" % read,
                                       wd=merge_dir)
                    print("Job: %s" % job)
                    jobs.append(job)
    else:
        for sample in undetermined:
            print("- %s" % sample.name)
            if fmt == "bcl2fastq2":
                # Hardlink copy fastqs directly
                sample_dir = merge_undetermined_dir
                if not dry_run:
                    for fq in sample.fastq:
                        src_fq = os.path.join(sample.dirn, fq)
                        dst_fq = os.path.join(sample_dir, fq)
                        os.link(src_fq, dst_fq)
            else:
                # Just copy directory tree wholesale
                sample_dir = os.path.join(merge_undetermined_dir,
                                          os.path.basename(sample.dirn))
                cmd = copytree_command(sample.dirn, sample_dir)
                print("- Running %s" % cmd)
                if not dry_run:
                    job = sched.submit(cmd,
                                       name="copy_sample_dir.%s" %
                                       sample.name,
                                       wd=merge_dir)
                    print("Job: %s" % job.name)
                    jobs.append(job)
    # Make expected subdirs for bcl2fastq2
    if not dry_run and fmt == "bcl2fastq2":
        for dirn in ('Reports', 'Stats'):
            mkdir(os.path.join(merge_dir, dirn))
            # Add a hidden placeholder to preserve these directories
            # on rsync -m (prune empty dirs)
            with open(os.path.join(merge_dir, dirn, '.placeholder'),
                      'w') as fp:
                fp.write("")
    # Wait for scheduler jobs to complete
    if not dry_run:
        sched.wait()
        sched.stop()
        # Check job exit status
        exit_status = 0
        for j in jobs:
            exit_status += j.exit_status
            if j.exit_status != 0:
                logger.warning("Job failed: %s" % j)
        if exit_status:
            # Stop here so the source directories are left in place
            logger.critical("One or more jobs failed (non-zero "
                            "exit status)")
            return exit_status
    # Move all the 'old' directories out of the way
    all_unaligned = list(unaligned_dirs)
    all_unaligned.append(primary_unaligned_dir)
    for unaligned_dir in all_unaligned:
        unaligned_backup = os.path.join(ap.analysis_dir,
                                        "save.%s" % unaligned_dir)
        print("Moving %s to %s" % (unaligned_dir, unaligned_backup))
        if not dry_run:
            shutil.move(os.path.join(ap.analysis_dir, unaligned_dir),
                        unaligned_backup)
    # Rename the merged directory
    print("Renaming %s to %s" % (merge_dir, output_dir))
    if not dry_run:
        shutil.move(merge_dir,
                    os.path.join(ap.analysis_dir, output_dir))
    # Reset the bcl2fastq dir
    if not dry_run:
        ap.params['unaligned_dir'] = output_dir
    # Make a new 'projects.info' metadata file
    project_metadata_file = os.path.join(ap.analysis_dir,
                                         'projects.info')
    if os.path.exists(project_metadata_file):
        print("Moving existing projects.info file out of the way")
        if not dry_run:
            os.rename(project_metadata_file,
                      os.path.join(ap.analysis_dir,
                                   'save.projects.info'))
    print("Creating new projects.info file")
    if not dry_run:
        ap.make_project_metadata_file()
    return 0
def __init__(self, analysis_dir):
    """Create a new AnalysisDir instance for a specified directory

    Loads the metadata for the analysis directory (falling back
    to the 'auto_process.info' parameter file if 'metadata.info'
    cannot be loaded), then classifies each subdirectory as
    bcl2fastq output, an analysis project, the 'undetermined'
    project, or an 'extra' directory.

    Arguments:
      analysis_dir: name (and path) to analysis directory

    Raises:
      Exception: if neither the metadata file nor the
        parameter file can be loaded.
    """
    # Store location
    self._analysis_dir = os.path.abspath(analysis_dir)
    # Take the name from the normalised path, so that a trailing
    # slash on the supplied path doesn't result in an empty name
    self._name = os.path.basename(self._analysis_dir)
    self._bcl2fastq_dirs = []
    self._project_dirs = []
    self._extra_dirs = []
    self.sequencing_data = []
    self.projects = []
    self.undetermined = None
    # Metadata
    self.metadata = AnalysisDirMetadata()
    try:
        metadata_file = os.path.join(self._analysis_dir,
                                     "metadata.info")
        self.metadata.load(metadata_file)
    except Exception as ex:
        logger.warning("Failed to load metadata file %s: %s" %
                       (metadata_file, ex))
        logger.warning("Attempting to load parameter file")
        try:
            params = AnalysisDirParameters()
            parameter_file = os.path.join(self._analysis_dir,
                                          "auto_process.info")
            params.load(parameter_file, strict=False)
            # Attempt to acquire values from parameters
            for param in ('platform', 'run_number', 'source', 'assay'):
                if param not in params:
                    print("-- %s: missing" % param)
                    continue
                print("-- %s: setting to '%s'" % (param, params[param]))
                self.metadata[param] = params[param]
        except Exception as ex:
            # No parameter file either
            logger.warning("Failed to load parameters: %s" % ex)
            logger.warning("Perhaps this is not an auto_process project?")
            # Bare raise keeps the original traceback
            raise
    # Projects metadata
    try:
        self.projects_metadata = ProjectMetadataFile(
            os.path.join(self._analysis_dir, "projects.info"))
    except Exception as ex:
        logger.warning("Failed to load projects metadata: %s" % ex)
        self.projects_metadata = None
    # Run name
    try:
        self.run_name = self.metadata.run
    except AttributeError:
        # Derive from the directory name; only strip the
        # '_analysis' suffix if it's actually there
        self.run_name = self._analysis_dir
        if self.run_name.endswith('_analysis'):
            self.run_name = self.run_name[0:-len('_analysis')]
    self.run_name = os.path.basename(self.run_name)
    self.date_stamp,\
        self.instrument_name,\
        self.instrument_run_number = IlluminaData.split_run_name(
            self.run_name)
    # Look for outputs from bclToFastq and analysis projects
    logger.debug("Examining subdirectories of %s" % self._analysis_dir)
    for dirn in bcf_utils.list_dirs(self._analysis_dir):
        # Look for sequencing data
        try:
            data = IlluminaData.IlluminaData(self._analysis_dir,
                                             unaligned_dir=dirn)
            logger.debug("- %s: sequencing data" % dirn)
            self._bcl2fastq_dirs.append(dirn)
            self.sequencing_data.append(data)
            continue
        except IlluminaData.IlluminaDataError:
            # Not bcl2fastq output: try it as an analysis project
            pass
        except Exception as ex:
            logger.warning("Exception when attempting to load "
                           "subdir '%s' as CASAVA/bcl2fastq output "
                           "(ignored): %s" % (dirn, ex))
        # Look for analysis data
        data = AnalysisProject(dirn,
                               os.path.join(self._analysis_dir, dirn))
        if data.is_analysis_dir:
            if dirn == 'undetermined':
                logger.debug("- %s: undetermined indexes" % dirn)
                self.undetermined = data
            else:
                # Check against projects.info, if possible
                try:
                    if not self.projects_metadata.lookup('Project', dirn):
                        logger.debug("- %s: not in projects.info" % dirn)
                        self._extra_dirs.append(dirn)
                        continue
                except AttributeError:
                    # No projects metadata was loaded (it's None)
                    pass
                logger.debug("- %s: project directory" % dirn)
                self._project_dirs.append(dirn)
                self.projects.append(data)
            continue
        else:
            # Unidentified contents
            self._extra_dirs.append(dirn)
            logger.debug("- %s: unknown" % dirn)
def fastq_statistics(ap, stats_file=None, per_lane_stats_file=None,
                     unaligned_dir=None, sample_sheet=None, add_data=False,
                     nprocessors=None, runner=None):
    """Generate statistics for Fastq files

    Generates statistics for all Fastq files found in the
    'unaligned' directory, by running the 'fastq_statistics.py'
    program.

    If both statistics files already exist and none of the
    Fastqs in the projects are newer than them, then the
    statistics are not regenerated and only the processing QC
    report is rewritten.

    Arguments
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to create Fastqs for
      stats_file (str): path of a non-default file to write the
        statistics to (defaults to 'statistics.info' unless
        over-ridden by local settings)
      per_lane_stats_file (str): path for per-lane statistics
        output file (defaults to 'per_lane_statistics.info'
        unless over-ridden by local settings)
      unaligned_dir (str): output directory for bcl-to-fastq
        conversion
      sample_sheet (str): path to sample sheet file used in
        bcl-to-fastq conversion
      add_data (bool): if True then add stats to the existing
        stats files (default is to overwrite existing stats
        files)
      nprocessors (int): number of cores to use when running
        'fastq_statistics.py'
      runner (JobRunner): (optional) specify a non-default job
        runner to use for running 'fastq_statistics.py'

    Raises:
      Exception: if 'fastq_statistics.py' finishes with a
        non-zero exit code.
    """
    # Get file names for output files
    if stats_file is None:
        if ap.params['stats_file'] is not None:
            stats_file = ap.params['stats_file']
        else:
            stats_file = 'statistics.info'
    if per_lane_stats_file is None:
        if ap.params['per_lane_stats_file'] is not None:
            per_lane_stats_file = ap.params['per_lane_stats_file']
        else:
            per_lane_stats_file = 'per_lane_statistics.info'
    # Sort out unaligned_dir
    if unaligned_dir is None:
        if ap.params.unaligned_dir is None:
            ap.params['unaligned_dir'] = 'bcl2fastq'
        unaligned_dir = ap.params.unaligned_dir
    if not os.path.exists(os.path.join(ap.params.analysis_dir,
                                       unaligned_dir)):
        # NOTE(review): this is only logged and execution carries
        # on - confirm whether it should stop here instead
        logger.error("Unaligned dir '%s' not found" % unaligned_dir)
    # Check for sample sheet
    if sample_sheet is None:
        sample_sheet = ap.params['sample_sheet']
    # Check if any Fastqs are newer than stats files
    # (look for the stats files in the analysis directory, which is
    # where they're written to below; os.path.join leaves absolute
    # paths untouched)
    newest_mtime = 0
    for f in (stats_file, per_lane_stats_file,):
        try:
            newest_mtime = max(
                newest_mtime,
                os.path.getmtime(os.path.join(ap.params.analysis_dir, f)))
        except OSError:
            # Missing file
            newest_mtime = 0
            break
    illumina_data = IlluminaData.IlluminaData(ap.params.analysis_dir,
                                              unaligned_dir)
    if newest_mtime > 0:
        # Stops at the first Fastq found to be newer
        regenerate_stats = any(
            os.path.getmtime(os.path.join(sample.dirn, fq)) > newest_mtime
            for project in illumina_data.projects
            for sample in project.samples
            for fq in sample.fastq)
        if regenerate_stats:
            logger.warning("Fastqs are newer than stats files")
        else:
            # Don't rerun the stats, just regenerate the report
            logger.warning("Stats files are newer than Fastqs")
            processing_qc_html = os.path.join(ap.analysis_dir,
                                              "processing_qc.html")
            report_processing_qc(ap, processing_qc_html)
            return
    # Set up runner
    if runner is None:
        runner = ap.settings.runners.stats
    runner.set_log_dir(ap.log_dir)
    # Number of cores
    if nprocessors is None:
        nprocessors = ap.settings.fastq_stats.nprocessors
    # Generate statistics
    fastq_statistics_cmd = Command(
        'fastq_statistics.py',
        '--unaligned', unaligned_dir,
        '--sample-sheet', sample_sheet,
        '--output', os.path.join(ap.params.analysis_dir, stats_file),
        '--per-lane-stats', os.path.join(ap.params.analysis_dir,
                                         per_lane_stats_file),
        ap.params.analysis_dir,
        '--nprocessors', nprocessors)
    if add_data:
        fastq_statistics_cmd.add_args('--update')
    print("Generating statistics: running %s" % fastq_statistics_cmd)
    fastq_statistics_job = SchedulerJob(
        runner,
        fastq_statistics_cmd.command_line,
        name='fastq_statistics',
        working_dir=ap.analysis_dir)
    fastq_statistics_job.start()
    try:
        fastq_statistics_job.wait(
            poll_interval=ap.settings.general.poll_interval)
    except KeyboardInterrupt:
        # Make sure the job is stopped before passing the
        # interrupt on
        logger.warning("Keyboard interrupt, terminating fastq_statistics")
        fastq_statistics_job.terminate()
        raise
    exit_code = fastq_statistics_job.exit_code
    print("fastq_statistics completed: exit code %s" % exit_code)
    if exit_code != 0:
        raise Exception("fastq_statistics exited with an error")
    # Record the names of the stats files now they've been made
    ap.params['stats_file'] = stats_file
    ap.params['per_lane_stats_file'] = per_lane_stats_file
    print("Statistics generation completed: %s" % ap.params.stats_file)
    print("Generating processing QC report")
    processing_qc_html = os.path.join(ap.analysis_dir,
                                      "processing_qc.html")
    report_processing_qc(ap, processing_qc_html)
def make_fastqs(ap, protocol='standard', platform=None,
                unaligned_dir=None, sample_sheet=None, lanes=None,
                ignore_missing_bcl=False, ignore_missing_stats=False,
                skip_rsync=False, remove_primary_data=False,
                nprocessors=None, require_bcl2fastq_version=None,
                bases_mask=None, no_lane_splitting=None,
                minimum_trimmed_read_length=None,
                mask_short_adapter_reads=None,
                generate_stats=True, stats_file=None,
                per_lane_stats_file=None, analyse_barcodes=True,
                barcode_analysis_dir=None, skip_fastq_generation=False,
                only_fetch_primary_data=False, create_empty_fastqs=None,
                runner=None, cellranger_jobmode=None,
                cellranger_mempercore=None, cellranger_maxjobs=None,
                cellranger_jobinterval=None, cellranger_localcores=None,
                cellranger_localmem=None,
                cellranger_ignore_dual_index=False):
    """Create and summarise FASTQ files

    Wrapper for operations related to FASTQ file generation and
    analysis. The operations are typically:

    - get primary data (BCL files)
    - run bcl-to-fastq conversion
    - generate statistics

    If the number of processors and the job runner are not
    explicitly specified then these are taken from the settings
    for the bcl2fastq and the statistics generation steps, which
    may differ from each other. However if either of these values
    are set explicitly then the same values will be used for both
    steps.

    If the expected Fastqs are already present (as checked against
    the sample sheet) then both the primary data transfer and the
    Fastq generation steps are skipped automatically.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to create Fastqs for
      protocol (str): if set then specifies the protocol to use
        for fastq generation, otherwise use the 'standard'
        bcl2fastq protocol
      platform (str): if set then specifies the sequencing platform
        (otherwise platform will be determined from the primary
        data)
      unaligned_dir (str): if set then use this as the output
        directory for bcl-to-fastq conversion. Default is
        'bcl2fastq' (unless an alternative is already specified
        in the config file)
      sample_sheet (str): if set then use this as the input
        samplesheet
      lanes (list): (optional) specify a list of lane numbers to
        use in the processing; lanes not in the list will be
        excluded (default is to include all lanes)
      nprocessors (int) : number of processors to run
        bclToFastq.py with
      ignore_missing_bcl (bool): if True then run bcl2fastq with
        --ignore-missing-bcl
      ignore_missing_stats (bool): if True then run bcl2fastq with
        --ignore-missing-stats
      skip_rsync (bool): if True then don't rsync primary data at
        the start of bcl2fastq conversion
      remove_primary_data (bool): if True then remove primary data
        at the end of bcl2fastq conversion (default is to keep it)
      generate_stats (bool): if True then (re)generate statistics
        file for fastqs
      analyse_barcodes (bool): if True then (re)analyse barcodes
        for fastqs
      require_bcl2fastq_version (str): (optional) specify bcl2fastq
        version to use. Should be a string of the form '1.8.4' or
        '>2.0'. Set to None to automatically determine required
        bcl2fastq version.
      bases_mask (str): if set then use this as an alternative
        bases mask setting
      no_lane_splitting (bool): if True then run bcl2fastq with
        --no-lane-splitting
      minimum_trimmed_read_length (int): if set then specify
        minimum length for reads after adapter trimming (shorter
        reads will be padded with Ns to make them long enough)
      mask_short_adapter_reads (int): if set then specify the
        minimum length of ACGT bases that must be present in a
        read after adapter trimming for it not to be masked
        completely with Ns.
      stats_file (str): if set then use this as the name of the
        output per-fastq stats file.
      per_lane_stats_file (str): if set then use this as the name
        of the output per-lane stats file.
      barcode_analysis_dir (str): if set then specifies path to
        the output directory for barcode analysis
      skip_fastq_generation (bool): if True then don't perform
        fastq generation
      only_fetch_primary_data (bool): if True then fetch primary
        data, don't do anything else
      create_empty_fastqs (bool): if True then create empty
        'placeholder' fastq files for any missing fastqs after
        bcl2fastq (must have completed with zero exit status)
      runner (JobRunner): (optional) specify a non-default job
        runner to use for fastq generation
      cellranger_jobmode (str): (optional) job mode to run
        cellranger in (10xGenomics Chromium SC data only)
      cellranger_mempercore (int): (optional) memory assumed per
        core (in Gbs) (10xGenomics Chromium SC data only)
      cellranger_maxjobs (int): (optional) maximum number of
        concurrent jobs to run (10xGenomics Chromium SC data only)
      cellranger_jobinterval (int): (optional) how often jobs are
        submitted (in ms) (10xGenomics Chromium SC data only)
      cellranger_localcores (int): (optional) maximum number of
        cores cellranger can request in jobmode 'local'
        (10xGenomics Chromium SC data only)
      cellranger_localmem (int): (optional) maximum memory
        cellranger can request in jobmode 'local' (10xGenomics
        Chromium SC data only)
      cellranger_ignore_dual_index (bool): (optional) on a
        dual-indexed flowcell where the second index was not used
        for the 10x sample, ignore it (10xGenomics Chromium SC
        data only). NB this value is not referenced anywhere in
        the body of this function.

    Returns:
      None.

    Raises:
      Exception: for an unknown protocol, a missing or invalid
        sample sheet, requested lanes that are absent from the
        sample sheet, failure to acquire primary data, failure to
        determine the platform, an invalid bases mask, a failed
        Fastq generation step, or missing Fastq outputs when
        placeholder Fastqs are not enabled.
    """
    # NB single-argument 'print(...)' calls behave identically
    # under the Python 2 print statement and the Python 3 function
    # Report protocol
    print("Protocol : %s" % protocol)
    if protocol not in MAKE_FASTQS_PROTOCOLS:
        # Join the individual protocol names (joining a list that
        # wraps the whole collection fails instead of reporting)
        raise Exception("Unknown protocol: '%s' (must be one of "
                        "%s)" % (protocol,
                                 ','.join([str(p) for p in
                                           MAKE_FASTQS_PROTOCOLS])))
    # Unaligned dir
    if unaligned_dir is not None:
        ap.params['unaligned_dir'] = unaligned_dir
    elif ap.params['unaligned_dir'] is None:
        ap.params['unaligned_dir'] = 'bcl2fastq'
    print("Output dir : %s" % ap.params.unaligned_dir)
    # Sample sheet
    if sample_sheet is None:
        sample_sheet = ap.params.sample_sheet
    if not os.path.isabs(sample_sheet):
        # Relative paths are taken relative to the analysis dir
        sample_sheet = os.path.join(ap.analysis_dir, sample_sheet)
    if not os.path.isfile(sample_sheet):
        raise Exception("Missing sample sheet '%s'" % sample_sheet)
    ap.params['sample_sheet'] = sample_sheet
    print("Source sample sheet : %s" % ap.params.sample_sheet)
    # Check requested lanes are actually present
    print("Lanes : %s" % ('all' if lanes is None
                          else ','.join([str(l) for l in lanes])))
    if lanes is not None:
        s = IlluminaData.SampleSheet(ap.params.sample_sheet)
        if not s.has_lanes:
            raise Exception("Requested subset of lanes but "
                            "samplesheet doesn't contain any "
                            "lane information")
        samplesheet_lanes = list(set([l['Lane'] for l in s]))
        for l in lanes:
            if l not in samplesheet_lanes:
                raise Exception("Requested lane '%d' not present "
                                "in samplesheet" % l)
    # Make a temporary sample sheet
    # (from here on 'sample_sheet' refers to the temporary copy,
    # the source sample sheet is kept in 'ap.params.sample_sheet')
    if lanes:
        lanes_id = ".L%s" % ''.join([str(l) for l in lanes])
    else:
        lanes_id = ""
    sample_sheet = os.path.join(
        ap.tmp_dir,
        "SampleSheet%s.%s.csv" % (lanes_id,
                                  time.strftime("%Y%m%d%H%M%S")))
    make_custom_sample_sheet(ap.params.sample_sheet,
                             sample_sheet,
                             lanes=lanes)
    # Check the temporary sample sheet
    print("Checking temporary sample sheet")
    invalid_barcodes = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_barcodes()
    if invalid_barcodes:
        logger.error("Invalid barcodes detected")
        for line in invalid_barcodes:
            logger.critical("%s" % line)
    invalid_characters = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_characters()
    if invalid_characters:
        logger.critical("Invalid non-printing/non-ASCII characters "
                        "detected")
    if invalid_barcodes or invalid_characters:
        raise Exception("Errors detected in generated sample sheet")
    # Adjust verification settings for 10xGenomics Chromium SC
    # data if necessary
    verify_include_sample_dir = False
    if has_chromium_sc_indices(sample_sheet):
        if protocol in ('10x_chromium_sc',
                        '10x_chromium_sc_atac',):
            # Force inclusion of sample-name subdirectories
            # when verifying Chromium SC data
            print("Sample sheet includes Chromium SC indices")
            verify_include_sample_dir = True
        else:
            # Chromium SC indices detected but not using
            # 10x_chromium_sc protocol
            raise Exception("Detected 10xGenomics Chromium SC indices "
                            "in generated sample sheet but protocol "
                            "'%s' has been specified; use an "
                            "appropriate '10x_...' protocol for these "
                            "indices" % protocol)
    # Check for pre-existing Fastq outputs
    if verify_fastq_generation(
            ap,
            unaligned_dir=ap.params.unaligned_dir,
            lanes=lanes,
            include_sample_dir=verify_include_sample_dir):
        print("Expected Fastq outputs already present")
        skip_rsync = True
        skip_fastq_generation = True
    # Check if there's anything to do
    if (skip_rsync and skip_fastq_generation) and \
       not (generate_stats or analyse_barcodes):
        print("Nothing to do")
        return
    # Log dir
    log_dir = 'make_fastqs'
    if protocol != 'standard':
        log_dir += "_%s" % protocol
    if lanes:
        log_dir += "_L%s" % ''.join([str(l) for l in sorted(lanes)])
    ap.set_log_dir(ap.get_log_subdir(log_dir))
    # Fetch primary data
    if not skip_rsync and not ap.params.acquired_primary_data:
        if get_primary_data(ap) != 0:
            logger.error("Failed to acquire primary data")
            raise Exception("Failed to acquire primary data")
        else:
            ap.params['acquired_primary_data'] = True
    if only_fetch_primary_data:
        return
    # Deal with platform information
    if not platform:
        platform = ap.metadata.platform
    # Do fastq generation using the specified protocol
    if not skip_fastq_generation:
        # Set primary data location and report info
        primary_data_dir = os.path.join(
            ap.params.primary_data_dir,
            os.path.basename(ap.params.data_dir))
        print("Primary data dir : %s" % primary_data_dir)
        try:
            illumina_run = IlluminaData.IlluminaRun(primary_data_dir,
                                                    platform=platform)
        except IlluminaData.IlluminaDataPlatformError as ex:
            logger.critical("Error loading primary data: %s" % ex)
            if platform is None:
                logger.critical("Try specifying platform using "
                                "--platform?")
            else:
                logger.critical("Check specified platform is valid (or "
                                "omit --platform)")
            raise Exception("Error determining sequencer platform")
        print("Platform : %s" % illumina_run.platform)
        print("Bcl format : %s" % illumina_run.bcl_extension)
        # Set platform in metadata
        ap.metadata['platform'] = illumina_run.platform
        # Bases mask
        if bases_mask is not None:
            ap.params['bases_mask'] = bases_mask
        bases_mask = ap.params.bases_mask
        print("Bases mask setting : %s" % bases_mask)
        if protocol not in ('10x_chromium_sc',
                            '10x_chromium_sc_atac',):
            if bases_mask == "auto":
                print("Determining bases mask from RunInfo.xml")
                bases_mask = get_bases_mask(illumina_run.runinfo_xml,
                                            sample_sheet)
                if not bases_mask_is_valid(bases_mask):
                    raise Exception("Invalid bases mask: '%s'" %
                                    bases_mask)
        # Do fastq generation according to protocol
        if protocol == 'icell8':
            # ICell8 data
            # Update bcl2fastq settings appropriately
            print("Updating read trimming and masking for ICell8")
            minimum_trimmed_read_length = 21
            mask_short_adapter_reads = 0
            # Reset the default bases mask
            bases_mask = IlluminaData.IlluminaRunInfo(
                illumina_run.runinfo_xml).bases_mask
            bases_mask = get_icell8_bases_mask(bases_mask,
                                               sample_sheet=sample_sheet)
            if not bases_mask_is_valid(bases_mask):
                raise Exception("Invalid bases mask: '%s'" %
                                bases_mask)
            # Switch to standard protocol
            protocol = 'standard'
        if protocol == 'standard':
            # Standard protocol
            try:
                exit_code = bcl_to_fastq(
                    ap,
                    unaligned_dir=ap.params.unaligned_dir,
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    require_bcl2fastq=require_bcl2fastq_version,
                    bases_mask=bases_mask,
                    ignore_missing_bcl=ignore_missing_bcl,
                    ignore_missing_stats=ignore_missing_stats,
                    no_lane_splitting=no_lane_splitting,
                    minimum_trimmed_read_length=minimum_trimmed_read_length,
                    mask_short_adapter_reads=mask_short_adapter_reads,
                    nprocessors=nprocessors,
                    runner=runner)
            except Exception as ex:
                raise Exception("Bcl2fastq stage failed: '%s'" % ex)
        elif protocol == '10x_chromium_sc':
            # 10xGenomics Chromium SC
            if bases_mask == 'auto':
                # Pass no explicit bases mask to cellranger
                bases_mask = None
            try:
                # Check we have cellranger
                cellranger = find_program('cellranger')
                if not cellranger:
                    raise Exception("No cellranger package found")
                cellranger_software_info = cellranger_info(cellranger)
                print("Using cellranger %s: %s" %
                      (cellranger_software_info[-1], cellranger))
                # Check we have bcl2fastq
                bcl2fastq = find_program('bcl2fastq')
                if not bcl2fastq:
                    raise Exception("No bcl2fastq package found")
                bcl2fastq = available_bcl2fastq_versions(
                    paths=(os.path.dirname(bcl2fastq),),
                    reqs='>=2.17')
                if not bcl2fastq:
                    raise Exception("No appropriate bcl2fastq software "
                                    "located")
                bcl2fastq = bcl2fastq[0]
                bcl2fastq_info = bcl_to_fastq_info(bcl2fastq)
                print("Using bcl2fastq %s: %s" % (bcl2fastq_info[-1],
                                                  bcl2fastq))
                # Store info on bcl2fastq package
                ap.metadata['bcl2fastq_software'] = bcl2fastq_info
                # Store info on cellranger package
                ap.metadata['cellranger_software'] = \
                    cellranger_software_info
                # Put a copy of sample sheet in the log directory
                shutil.copy(sample_sheet, ap.log_dir)
                # Determine output directory absolute path
                output_dir = ap.params.unaligned_dir
                if not os.path.isabs(output_dir):
                    output_dir = os.path.join(ap.analysis_dir,
                                              output_dir)
                # Run cellranger mkfastq
                exit_code = run_cellranger_mkfastq(
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    output_dir=output_dir,
                    lanes=(None if lanes is None
                           else ','.join([str(l) for l in lanes])),
                    bases_mask=bases_mask,
                    cellranger_exe=cellranger,
                    cellranger_jobmode=cellranger_jobmode,
                    cellranger_maxjobs=cellranger_maxjobs,
                    cellranger_mempercore=cellranger_mempercore,
                    cellranger_jobinterval=cellranger_jobinterval,
                    cellranger_localcores=cellranger_localcores,
                    cellranger_localmem=cellranger_localmem,
                    working_dir=ap.analysis_dir,
                    log_dir=ap.log_dir)
            except Exception as ex:
                raise Exception("'cellranger mkfastq' stage failed: "
                                "'%s'" % ex)
            # Turn off barcode analysis
            analyse_barcodes = False
        elif protocol == '10x_chromium_sc_atac':
            # 10xGenomics Chromium scATAC-seq
            exit_code = bcl_to_fastq_10x_chromium_sc_atac(
                ap,
                output_dir=ap.params.unaligned_dir,
                sample_sheet=sample_sheet,
                primary_data_dir=primary_data_dir,
                lanes=lanes,
                bases_mask=bases_mask,
                cellranger_jobmode=cellranger_jobmode,
                cellranger_maxjobs=cellranger_maxjobs,
                cellranger_mempercore=cellranger_mempercore,
                cellranger_jobinterval=cellranger_jobinterval,
                cellranger_localcores=cellranger_localcores,
                cellranger_localmem=cellranger_localmem,
                log_dir=ap.log_dir)
            # Turn off barcode analysis
            analyse_barcodes = False
        else:
            # Unknown protocol
            raise Exception("Unknown protocol '%s'" % protocol)
        # Check the outputs
        if exit_code != 0:
            raise Exception("Fastq generation finished with error: "
                            "exit code %d" % exit_code)
        if not verify_fastq_generation(
                ap,
                lanes=lanes,
                include_sample_dir=verify_include_sample_dir):
            # Check failed
            logger.error("Failed to verify output Fastqs against "
                         "sample sheet")
            # Try to load the data from unaligned dir
            try:
                illumina_data = IlluminaData.IlluminaData(
                    ap.analysis_dir,
                    unaligned_dir=ap.params.unaligned_dir)
            except IlluminaData.IlluminaDataError as ex:
                raise Exception("Unable to load data from %s: %s"
                                % (ap.params.unaligned_dir, ex))
            # Generate a list of missing Fastqs
            missing_fastqs = IlluminaData.list_missing_fastqs(
                illumina_data,
                sample_sheet,
                include_sample_dir=verify_include_sample_dir)
            # Sanity check: verification failed so something
            # should be missing
            assert (len(missing_fastqs) > 0)
            missing_fastqs_file = os.path.join(ap.log_dir,
                                               "missing_fastqs.log")
            print("Writing list of missing Fastq files to %s" %
                  missing_fastqs_file)
            with open(missing_fastqs_file, 'w') as fp:
                for fq in missing_fastqs:
                    fp.write("%s\n" % fq)
            # Create empty FASTQs
            # Setting is resolved in order of precedence: explicit
            # argument, then platform-specific setting, then the
            # general bcl2fastq setting
            if create_empty_fastqs is None:
                try:
                    create_empty_fastqs = \
                        ap.settings.platform[ap.metadata.platform].\
                        create_empty_fastqs
                except (KeyError, AttributeError):
                    # No platform-specific setting available
                    pass
            if create_empty_fastqs is None:
                create_empty_fastqs = \
                    ap.settings.bcl2fastq.create_empty_fastqs
            if create_empty_fastqs:
                logger.warning("Making 'empty' placeholder Fastqs")
                for fq in missing_fastqs:
                    fastq = os.path.join(ap.analysis_dir,
                                         ap.params.unaligned_dir, fq)
                    print("-- %s" % fastq)
                    if not os.path.exists(os.path.dirname(fastq)):
                        mkdirs(os.path.dirname(fastq))
                    with gzip.GzipFile(filename=fastq, mode='wb') as fp:
                        # Bytes literal: same as '' on Python 2,
                        # and valid for binary mode on Python 3
                        fp.write(b'')
            else:
                raise Exception("Fastq generation failed to produce "
                                "expected outputs")
    # Generate statistics
    if generate_stats:
        fastq_statistics(ap,
                         stats_file=stats_file,
                         per_lane_stats_file=per_lane_stats_file,
                         unaligned_dir=ap.params.unaligned_dir,
                         nprocessors=nprocessors,
                         runner=runner)
    # Run barcode analysis
    if analyse_barcodes:
        # Determine output directory
        if barcode_analysis_dir is not None:
            ap.params['barcode_analysis_dir'] = barcode_analysis_dir
        elif ap.params.barcode_analysis_dir is None:
            ap.params['barcode_analysis_dir'] = 'barcode_analysis'
        barcode_analysis_dir = ap.params.barcode_analysis_dir
        if not os.path.isabs(barcode_analysis_dir):
            barcode_analysis_dir = os.path.join(ap.params.analysis_dir,
                                                barcode_analysis_dir)
        # Report title
        title = "Barcode analysis for %s" % ap.metadata.run_name
        # Log file
        log_file = os.path.join(ap.log_dir, "analyse_barcodes.log")
        # Set up runner
        if runner is None:
            runner = ap.settings.general.default_runner
        runner.set_log_dir(ap.log_dir)
        # Get scheduler parameters
        max_jobs = ap.settings.general.max_concurrent_jobs
        poll_interval = ap.settings.general.poll_interval
        # Create and run barcode analysis pipeline
        barcode_analysis = AnalyseBarcodes(
            os.path.join(ap.params.analysis_dir,
                         ap.params.unaligned_dir))
        barcode_analysis.run(barcode_analysis_dir,
                             title=title,
                             lanes=lanes,
                             sample_sheet=sample_sheet,
                             log_file=log_file,
                             runner=runner,
                             max_jobs=max_jobs,
                             poll_interval=poll_interval,
                             verbose=False)
    # Make a 'projects.info' metadata file
    if lanes:
        ap.update_project_metadata_file()
    else:
        ap.make_project_metadata_file()
    # Remove primary data
    if remove_primary_data:
        # The boolean 'remove_primary_data' argument shadows the
        # module-level helper of the same name inside this function,
        # so calling the bare name would try to call 'True'; fetch
        # the helper from the module namespace instead
        # NOTE(review): assumes a module-level 'remove_primary_data'
        # callable taking the autoprocessor exists - confirm
        globals()['remove_primary_data'](ap)