def setup(self):
    # Check the QC protocol
    qc_info = self.args.project.qc_info(self.args.qc_dir)
    stored_protocol = qc_info.protocol
    if stored_protocol is not None and \
       stored_protocol != self.args.qc_protocol:
        logger.warning("QC protocol mismatch for %s: "
                       "'%s' stored, '%s' specified" %
                       (self.args.project.name,
                        stored_protocol,
                        self.args.qc_protocol))
        logger.warning("Stored protocol will be ignored")
    # Set up QC dir
    if not os.path.exists(self.args.qc_dir):
        mkdir(self.args.qc_dir)
    # Set up log dir
    if self.args.log_dir is None:
        log_dir = os.path.join(self.args.qc_dir, 'logs')
    else:
        log_dir = self.args.log_dir
    if not os.path.exists(log_dir):
        mkdir(log_dir)
    # Store the QC protocol data
    qc_info['protocol'] = self.args.qc_protocol
    qc_info['fastq_dir'] = self.args.project.fastq_dir
    qc_info.save()

def mkdir(newdir):
    """
    Create a directory

    The new directory should be identified using a
    specifier of the form '[[USER@]HOST:]NEWDIR'.

    Arguments:
      newdir (str): location of the new directory (can
        be on a local or remote system)
    """
    newdir = Location(newdir)
    if not newdir.is_remote:
        # Local directory
        bcftbx_utils.mkdir(newdir.path)
    else:
        # Remote directory
        try:
            mkdir_cmd = applications.general.ssh_command(
                newdir.user,
                newdir.server,
                ('mkdir', newdir.path))
            print "Running %s" % mkdir_cmd
            mkdir_cmd.run_subprocess()
        except Exception as ex:
            raise Exception("Exception making remote directory %s: %s"
                            % (newdir, ex))

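# Usage sketch for mkdir() above. The paths and host below are
# illustrative only (not taken from the source): a plain path is
# created locally via bcftbx_utils.mkdir(), while a
# '[[USER@]HOST:]DIR' specifier is created over ssh.
mkdir("/data/runs/new_analysis")                     # local directory
mkdir("alice@fileserver.example.org:/data/archive")  # remote, via ssh
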
def create_directory(self, dirn):
    # Make the specified directory, and any leading directories
    # that don't already exist
    if not os.path.exists(dirn):
        dir_path = os.sep
        for sub_dir in dirn.split(os.sep):
            dir_path = os.path.join(dir_path, sub_dir)
            if not os.path.exists(dir_path):
                print("Making %s" % dir_path)
                bcf_utils.mkdir(dir_path)

def build_library_directory(analysis_dir, dest, projects=None):
    """
    Build and populate data library directory on server

    Arguments:
      analysis_dir (AnalysisDir): analysis directory to export
        files from
      dest (str): location of top-level data library directory
      projects (list): list of projects to export (default is
        to export all projects)
    """
    # Create and populate internal directory structure on server
    user, server, dirn = split_user_host_dir(dest)
    remote = (server is not None)
    if remote:
        logging.critical("Dealing with remote systems not implemented")
        raise NotImplementedError(
            "Cannot build library directory on remote system")
    run_path = os.path.join(dirn, analysis_dir.run_name)
    print "Creating %s" % run_path
    mkdir(run_path)
    for project in analysis_dir.get_projects(include_undetermined=False):
        if projects is not None and project.name not in projects:
            print "Ignoring project '%s'" % project.name
            continue
        project_path = os.path.join(run_path, project.name)
        print "Creating %s" % project_path
        mkdir(project_path)
        print "Populating with uncompressed Fastqs:"
        for sample in project.samples:
            for fq in sample.fastq:
                fqcp = os.path.join(project_path,
                                    os.path.basename(fq))
                if fqcp.endswith('.gz'):
                    fqcp = fqcp[0:-3]
                if os.path.exists(fqcp):
                    print "-- found: %s" % fqcp
                    continue
                print "-- %s" % fqcp
                with get_fastq_file_handle(fq) as fp:
                    with open(fqcp, 'wb') as fpcp:
                        while True:
                            data = fp.read(102400)
                            if not data:
                                break
                            fpcp.write(data)

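# Usage sketch for build_library_directory() above. The analysis
# directory path, destination and project names are hypothetical, and
# 'AnalysisDir' is assumed to wrap an existing analysis directory;
# only local destinations are supported.
analysis_dir = AnalysisDir("/data/200101_M00123_0001_AXYZ_analysis")
build_library_directory(analysis_dir,
                        "/mnt/data_library",
                        projects=["AB", "CD"])
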
# Check for underlying programs
required = ["fastq_screen"]
if args.aligner is not None:
    required.append(args.aligner)
else:
    logging.warning("Aligner not specified, cannot check")
for prog in required:
    if find_program(prog) is None:
        logging.critical("couldn't find '%s'" % prog)
        sys.exit(1)
# Make output dir
if args.out_dir is not None:
    out_dir = os.path.abspath(args.out_dir)
    mkdir(out_dir)
else:
    out_dir = os.getcwd()
# Screen against 'mammalian' genomes
tagged_fastq = fastq_screen_tag(mammalian_conf, fqr2,
                                aligner=args.aligner,
                                threads=args.threads,
                                out_dir=out_dir,
                                tempdir=out_dir)
mammalian_tagged_fq = strip_ext(tagged_fastq, '.fastq') + '.' + \
                      os.path.basename(
                          strip_ext(mammalian_conf, '.conf')) + \
                      '.fastq'
os.rename(tagged_fastq, mammalian_tagged_fq)

def create_analysis_dir(project,
                        top_dir=None,
                        merge_replicates=False,
                        keep_names=False,
                        dry_run=False):
    """Create and populate analysis directory for an IlluminaProject

    Creates a new directory and populates it either with links to
    FASTQ files, or with 'merged' FASTQ files created by concatenating
    multiple FASTQs for each sample (which can happen for multiplexed
    runs where samples are split across multiple lanes).

    Project directory names are made up of the project name followed
    by the experiment type, or just the project name if the experiment
    type is not set.

    Arguments:
      project   : populated IlluminaProject object
      top_dir   : parent directory to create the analysis subdirectory
                  under. Defaults to cwd if not explicitly specified
      merge_replicates: if True then create a single FASTQ file for
                  each sample by merging multiple FASTQs together
      keep_names: if True then links to FASTQ files will have the same
                  names as the original files; by default links use
                  the shortest unique name
      dry_run   : if True then report what would be done but don't
                  actually perform any action

    Returns:
      Name of the project directory.
    """
    # Default to the current directory if no parent dir was supplied
    if top_dir is None:
        top_dir = os.getcwd()
    project_dir = os.path.join(top_dir, project.full_name)
    print "Creating analysis directory for project '%s'..." % \
        project.full_name
    # Check for & create directory
    if os.path.exists(project_dir):
        print "-> %s already exists" % project_dir
    else:
        print "Making analysis directory for %s" % project.name
        if not dry_run:
            bcf_utils.mkdir(project_dir, mode=0775)
    # Make an empty ScriptCode directory
    scriptcode_dir = os.path.join(project_dir, "ScriptCode")
    if os.path.exists(scriptcode_dir):
        print "'ScriptCode' directory %s already exists" % scriptcode_dir
    else:
        print "Making 'ScriptCode' directory for %s" % project.name
        if not dry_run:
            bcf_utils.mkdir(scriptcode_dir, mode=0775)
    # Check for & create links to fastq files
    if not merge_replicates:
        for sample in project.samples:
            fastq_names = IlluminaData.get_unique_fastq_names(sample.fastq)
            for fastq in sample.fastq:
                fastq_file = os.path.join(sample.dirn, fastq)
                if keep_names:
                    fastq_ln = os.path.join(project_dir, fastq)
                else:
                    fastq_ln = os.path.join(project_dir, fastq_names[fastq])
                if os.path.exists(fastq_ln):
                    logging.error("Failed to link to %s: %s already exists" %
                                  (fastq_file, os.path.basename(fastq_ln)))
                else:
                    print "Linking to %s" % fastq
                    if not dry_run:
                        bcf_utils.mklink(fastq_file, fastq_ln, relative=True)
    else:
        # Merge files for replicates within each sample
        for sample in project.samples:
            replicates = {}
            # Gather replicates to be merged
            for fastq in sample.fastq:
                fastq_data = IlluminaData.IlluminaFastq(fastq)
                name = "%s_%s_R%d" % (fastq_data.sample_name,
                                      fastq_data.barcode_sequence,
                                      fastq_data.read_number)
                if name not in replicates:
                    replicates[name] = []
                replicates[name].append(os.path.join(sample.dirn, fastq))
                # Sort into order
                replicates[name].sort()
            # Report detected replicates
            print "Sample %s" % sample.name
            for name in replicates:
                print "\tReplicate '%s'" % name
                for fastq in replicates[name]:
                    print "\t\t%s" % fastq
            # Do the merge
            for name in replicates:
                merged_fastq = os.path.join(project_dir, name + '.fastq')
                bcf_utils.concatenate_fastq_files(merged_fastq,
                                                  replicates[name])
    # Return directory name
    return project_dir

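# Usage sketch for create_analysis_dir() above. The IlluminaData
# instance and target path are hypothetical; 'dry_run=True' just
# reports the links or merges that would be made.
for project in illumina_data.projects:  # assumed IlluminaData instance
    create_analysis_dir(project,
                        top_dir="/data/analyses",
                        merge_replicates=False,
                        dry_run=True)
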
def clone(ap, clone_dir, copy_fastqs=False, exclude_projects=False):
    """
    Make a 'clone' (i.e. copy) of an analysis directory

    Makes a functional copy of an existing analysis directory,
    including metadata and parameters, stats files, processing
    reports and project subdirectories.

    By default the 'unaligned' directory in the new directory is
    simply a symlink to the one in the original directory; set
    'copy_fastqs' to True to make copies instead.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the parent
        analysis directory
      clone_dir (str): path to the new directory to create as a
        clone (must not already exist)
      copy_fastqs (boolean): set to True to copy the Fastq files
        (otherwise the default behaviour is to make symlinks)
      exclude_projects (boolean): set to True to exclude any
        projects from the parent analysis directory
    """
    clone_dir = os.path.abspath(clone_dir)
    print "Cloning into %s" % clone_dir
    if os.path.exists(clone_dir):
        # Directory already exists
        logger.critical("Target directory '%s' already exists" % clone_dir)
        raise Exception("Clone failed: target directory '%s' "
                        "already exists" % clone_dir)
    bcf_utils.mkdir(clone_dir)
    # Copy metadata and parameters
    for f in (ap.metadata_file, ap.parameter_file):
        if os.path.exists(f):
            shutil.copy(f, os.path.join(clone_dir, os.path.basename(f)))
    # Primary data directory
    if ap.params.primary_data_dir:
        primary_data_dir = os.path.join(ap.analysis_dir,
                                        ap.params.primary_data_dir)
        if os.path.isdir(primary_data_dir):
            clone_primary_data_dir = os.path.join(
                clone_dir, os.path.basename(primary_data_dir))
            print "[Primary data] making %s" % clone_primary_data_dir
            bcf_utils.mkdir(clone_primary_data_dir)
            data_dir = os.path.basename(ap.params.data_dir)
            if os.path.exists(os.path.join(primary_data_dir, data_dir)):
                clone_data_dir = os.path.join(clone_primary_data_dir,
                                              data_dir)
                print "[Primary data] symlinking %s" % clone_data_dir
                os.symlink(os.path.join(primary_data_dir, data_dir),
                           clone_data_dir)
    # Link to or copy fastqs
    if not ap.params.unaligned_dir:
        for d in ('Unaligned', 'bcl2fastq',):
            unaligned_dir = os.path.join(ap.analysis_dir, d)
            if os.path.isdir(unaligned_dir):
                break
            unaligned_dir = None
    else:
        unaligned_dir = os.path.join(ap.analysis_dir,
                                     ap.params.unaligned_dir)
    if os.path.isdir(unaligned_dir):
        clone_unaligned_dir = os.path.join(clone_dir,
                                           os.path.basename(unaligned_dir))
        if not copy_fastqs:
            # Link to unaligned dir
            print "[Unaligned] symlinking %s" % clone_unaligned_dir
            os.symlink(unaligned_dir, clone_unaligned_dir)
        else:
            # Copy unaligned dir
            print "[Unaligned] copying %s" % clone_unaligned_dir
            shutil.copytree(unaligned_dir, clone_unaligned_dir)
    else:
        print "[Unaligned] no 'unaligned' dir found"
    # Duplicate project directories
    projects = ap.get_analysis_projects()
    if projects and not exclude_projects:
        for project in ap.get_analysis_projects():
            print "[Projects] duplicating project '%s'" % project.name
            fastqs = project.fastqs
            new_project = AnalysisProject(
                project.name,
                os.path.join(clone_dir, project.name),
                user=project.info.user,
                PI=project.info.PI,
                library_type=project.info.library_type,
                single_cell_platform=project.info.single_cell_platform,
                organism=project.info.organism,
                run=project.info.run,
                comments=project.info.comments,
                platform=project.info.platform)
            new_project.create_directory(fastqs=fastqs,
                                         link_to_fastqs=(not copy_fastqs))
    # Copy additional files, if found
    for f in ("SampleSheet.orig.csv",
              ("custom_SampleSheet.csv"
               if not ap.params.sample_sheet
               else ap.params.sample_sheet),
              ("projects.info"
               if not ap.params.project_metadata
               else ap.params.project_metadata),
              ("statistics.info"
               if not ap.params.stats_file
               else ap.params.stats_file),
              ("per_lane_statistics.info"
               if not ap.params.per_lane_stats_file
               else ap.params.per_lane_stats_file),
              "statistics_full.info",
              "per_lane_sample_stats.info",
              "processing_qc.html",):
        if not f:
            continue
        srcpath = os.path.join(ap.analysis_dir, f)
        if os.path.exists(srcpath):
            print "[Files] copying %s" % f
            shutil.copy(srcpath, clone_dir)
    # Create the basic set of subdirectories
    for subdir in ('logs', 'ScriptCode',):
        print "[Subdirectories] making %s" % subdir
        bcf_utils.mkdir(os.path.join(clone_dir, subdir))
    # Update the settings
    parameter_file = os.path.join(clone_dir,
                                  os.path.basename(ap.parameter_file))
    params = AnalysisDirParameters(filen=parameter_file)
    for p in ("sample_sheet", "primary_data_dir"):
        if not params[p]:
            continue
        print "[Parameters] updating '%s'" % p
        params[p] = os.path.join(clone_dir,
                                 os.path.relpath(params[p],
                                                 ap.analysis_dir))
    params.save()

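# Usage sketch for clone() above. 'ap' is assumed to be an existing
# AutoProcessor instance and the target path is hypothetical; Fastqs
# are symlinked unless 'copy_fastqs=True'.
clone(ap,
      "/data/200101_M00123_0001_AXYZ_analysis_copy",
      copy_fastqs=False,
      exclude_projects=True)
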
    sys.exit(1)
# Make top-level output dirs
icell8_dir = os.path.abspath(outdir)
if os.path.exists(icell8_dir) and args.project is None:
    if not args.force:
        logger.fatal("Output destination '%s': already exists "
                     "(remove or use --force to overwrite)" % icell8_dir)
        sys.exit(1)
    logger.warning("Removing existing output destination '%s'" % icell8_dir)
    shutil.rmtree(icell8_dir)
log_dir = os.path.join(icell8_dir, "logs")
scripts_dir = os.path.join(icell8_dir, "scripts")
for dirn in (icell8_dir, log_dir, scripts_dir):
    mkdir(dirn)
# Copy well list file into output directory
shutil.copy(well_list, outdir)
well_list = os.path.join(outdir, os.path.basename(well_list))
if analysis_project is not None:
    analysis_project.info['icell8_well_list'] = os.path.basename(well_list)
    analysis_project.info.save()
# Set up pipelines
pipelines = []
# ICELL8 QC and filtering
print "Setting up a pipeline for ICELL8 processing"
pipelines.append(
    ICell8QCFilter(outdir,

def main():
    # Handle the command line
    p = argparse.ArgumentParser()
    p.add_argument("fastqs", nargs='*', metavar="FASTQ_R1 FASTQ_R2",
                   help="FASTQ file pairs")
    p.add_argument("-w", "--well-list",
                   dest="well_list_file", default=None,
                   help="iCell8 'well list' file")
    p.add_argument("-m", "--mode",
                   dest="splitting_mode", default="barcodes",
                   choices=["barcodes", "batch", "none"],
                   help="how to split the input FASTQs: 'barcodes' "
                   "(one FASTQ pair per barcode), 'batch' (one or "
                   "more FASTQ pairs with a fixed number of reads not "
                   "exceeding BATCH_SIZE), or 'none' (output all "
                   "reads to a single FASTQ pair) (default: "
                   "'barcodes')")
    p.add_argument("-s", "--size",
                   type=int, dest="batch_size", default=DEFAULT_BATCH_SIZE,
                   help="number of reads per batch in 'batch' mode "
                   "(default: %d)" % DEFAULT_BATCH_SIZE)
    p.add_argument("-b", "--basename",
                   default="icell8",
                   help="basename for output FASTQ files (default: "
                   "'icell8')")
    p.add_argument("-o", "--outdir",
                   dest="out_dir", default=None,
                   help="directory to write output FASTQ files to "
                   "(default: current directory)")
    p.add_argument("-d", "--discard-unknown-barcodes",
                   dest='discard_unknown_barcodes', action='store_true',
                   help="discard reads with barcodes which don't "
                   "match any of those in the WELL_LIST_FILE "
                   "(default: keep all reads)")
    p.add_argument("-q", "--quality-filter",
                   dest='quality_filter', action='store_true',
                   help="filter reads by barcode and UMI quality "
                   "(default: don't filter reads on quality)")
    p.add_argument("-c", "--compress",
                   action='store_true',
                   help="output compressed .gz FASTQ files")
    args = p.parse_args()
    # Convert quality cutoffs to character encoding
    barcode_quality_cutoff = chr(INLINE_BARCODE_QUALITY_CUTOFF + 33)
    umi_quality_cutoff = chr(UMI_QUALITY_CUTOFF + 33)
    # Get well list and expected barcodes
    well_list_file = args.well_list_file
    if well_list_file is not None:
        well_list_file = os.path.abspath(args.well_list_file)
        well_list = ICell8WellList(well_list_file)
        expected_barcodes = set(well_list.barcodes())
        print "%d expected barcodes" % len(expected_barcodes)
    # Filtering on barcode
    do_check_barcodes = args.discard_unknown_barcodes
    if do_check_barcodes and well_list_file is None:
        logging.fatal("-d/--discard-unknown-barcodes: need to supply a "
                      "well list file")
        sys.exit(1)
    # Filter on barcode and UMI quality
    do_quality_filter = args.quality_filter
    # Splitting mode
    splitting_mode = args.splitting_mode
    batch_size = args.batch_size
    # Count barcodes and rejections
    assigned = 0
    unassigned = 0
    filtered = 0
    barcode_list = set()
    filtered_counts = {}
    # Input Fastqs
    fastqs = pair_fastqs([fq for fq in args.fastqs])[0]
    # Output Fastqs
    output_fqs = BufferedOutputFiles(base_dir=args.out_dir)
    if args.out_dir is not None:
        out_dir = os.path.abspath(args.out_dir)
        mkdir(out_dir)
    else:
        out_dir = os.getcwd()
    basename = args.basename
    # Compress outputs?
    if args.compress:
        fastq_ext = "fastq.gz"
    else:
        fastq_ext = "fastq"
    # Iterate over pairs of Fastqs
    for fastq_pair in fastqs:
        # Iterate over read pairs from the Fastqs
        print "-- %s\n   %s" % fastq_pair
        print "   Starting at %s" % time.ctime()
        start_time = time.time()
        for i, read_pair in enumerate(ICell8FastqIterator(*fastq_pair),
                                      start=1):
            # Deal with read pair
            if (i % 100000) == 0:
                print "   Examining read pair #%d (%s)" % \
                    (i, time.ctime())
            inline_barcode = read_pair.barcode
            barcode_list.add(inline_barcode)
            # Initial assignment
            assign_to = inline_barcode
            # Apply quality filtering
            if do_quality_filter:
                if not pass_quality_filter(read_pair.barcode_quality,
                                           barcode_quality_cutoff):
                    assign_to = "failed_barcode"
                elif not pass_quality_filter(read_pair.umi_quality,
                                             umi_quality_cutoff):
                    assign_to = "failed_umi"
                else:
                    filtered += 1
            # Check barcode is valid
            if do_check_barcodes:
                if inline_barcode not in expected_barcodes:
                    assign_to = "unassigned"
                    unassigned += 1
                else:
                    assigned += 1
            logging.debug("%s" % '\t'.join([assign_to,
                                            inline_barcode,
                                            read_pair.umi,
                                            read_pair.min_barcode_quality,
                                            read_pair.min_umi_quality]))
            # Post filtering counts
            if assign_to == inline_barcode:
                try:
                    filtered_counts[inline_barcode] += 1
                except KeyError:
                    filtered_counts[inline_barcode] = 1
            # Reassign read pair to appropriate output files
            if splitting_mode == "batch":
                # Output to a batch-specific file pair
                batch_number = filtered / batch_size
                assign_to = "B%03d" % batch_number
            elif splitting_mode == "none":
                # Output to a single file pair
                assign_to = "filtered"
            # Write read pair
            fq_r1 = "%s_R1" % assign_to
            fq_r2 = "%s_R2" % assign_to
            if fq_r1 not in output_fqs:
                try:
                    # Try to reopen file and append
                    output_fqs.open(fq_r1, append=True)
                except KeyError:
                    # Open new file
                    output_fqs.open(fq_r1,
                                    "%s.%s.r1.%s" % (basename,
                                                     assign_to,
                                                     fastq_ext))
            output_fqs.write(fq_r1, "%s" % read_pair.r1)
            if fq_r2 not in output_fqs:
                try:
                    # Try to reopen file and append
                    output_fqs.open(fq_r2, append=True)
                except KeyError:
                    # Open new file
                    output_fqs.open(fq_r2,
                                    "%s.%s.r2.%s" % (basename,
                                                     assign_to,
                                                     fastq_ext))
            output_fqs.write(fq_r2, "%s" % read_pair.r2)
        print "   Finished at %s" % time.ctime()
        print "   (Took %.0fs)" % (time.time() - start_time)
    # Close output files
    output_fqs.close()
    # Summary output to screen
    total_reads = assigned + unassigned
    print "Summary:"
    print "--------"
    print "Number of barcodes         : %d" % len(barcode_list)
    if do_check_barcodes:
        print "Number of expected barcodes: %d/%d" % \
            (len(filtered_counts.keys()), len(expected_barcodes))
    print "Total reads                : %d" % total_reads
    if do_quality_filter:
        print "Total reads (filtered)     : %d" % filtered
    if do_check_barcodes:
        print "Total reads (assigned)     : %d" % assigned
        print "Unassigned reads           : %d" % unassigned

if args.sample_pattern is not None:
    samples = project.get_samples(args.sample_pattern)
else:
    samples = project.samples
if not samples:
    logger.warning("No samples specified for QC, quitting")
    sys.exit()
print "%d samples matched" % len(samples)
for sample in samples:
    print "-- %s" % sample.name
# Set up QC dir
qc_dir = project.setup_qc_dir(qc_dir=args.qc_dir)
print "QC output dir: %s" % qc_dir
log_dir = os.path.join(qc_dir, 'logs')
mkdir(log_dir)
qc_base = os.path.basename(qc_dir)
# Output file name
if args.filename is None:
    out_file = '%s_report.html' % qc_base
else:
    out_file = args.filename
if not os.path.isabs(out_file):
    out_file = os.path.join(project.dirn, out_file)
print "QC report: %s" % out_file
# Run the QC
announce("Running QC")
max_jobs = __settings.general.max_concurrent_jobs
sched = SimpleScheduler(runner=qc_runner,

def merge_fastq_dirs(ap, primary_unaligned_dir, output_dir=None,
                     dry_run=False):
    """
    Combine multiple 'unaligned' output directories into one

    This method combines the output from multiple runs of
    CASAVA/bcl2fastq into a single 'unaligned'-equivalent directory.

    Currently it operates in an automatic mode and should detect
    additional 'unaligned' dirs on its own.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the parent
        analysis directory
      primary_unaligned_dir (str): the 'unaligned' dir that data
        from all the others will be put into (relative path),
        unless overridden by the 'output_dir' argument
      output_dir (str): optional, new 'unaligned' dir that will be
        created to hold the merged data (relative path, defaults to
        'primary_unaligned_dir')
      dry_run (boolean): if True then just report the operations
        that would have been performed.
    """
    if primary_unaligned_dir is None:
        raise Exception("Primary unaligned dir not defined")
    # Output directory
    if output_dir is None:
        output_dir = primary_unaligned_dir
    print("Fastqs will be merged into '%s'" % output_dir)
    # Collect unaligned dirs
    print("Collecting bcl2fastq directories")
    primary_illumina_data = None
    unaligned_dirs = {}
    for dirn in list_dirs(ap.analysis_dir):
        try:
            illumina_data = IlluminaData.IlluminaData(ap.analysis_dir,
                                                      unaligned_dir=dirn)
            if dirn == primary_unaligned_dir:
                print("* %s (primary dir)" % dirn)
                primary_illumina_data = illumina_data
            elif dirn.endswith(".bak") or dirn.startswith("save."):
                print("Ignoring %s" % dirn)
            else:
                print("* %s" % dirn)
                unaligned_dirs[dirn] = illumina_data
        except Exception as ex:
            logger.debug("Rejecting %s: %s" % (dirn, ex))
    # Check primary unaligned dir
    if primary_illumina_data is None:
        raise Exception("Primary dir '%s' doesn't exist, or doesn't "
                        "contain data?" % primary_unaligned_dir)
    # Is there anything to do?
    if not unaligned_dirs:
        print("No extra bcl2fastq output directories found, nothing to do")
        return 0
    # Make log directory and set up scheduler (if not dry run)
    if not dry_run:
        ap.set_log_dir(ap.get_log_subdir('merge_fastq_dirs'))
        runner = ap.settings.general.default_runner
        runner.set_log_dir(ap.log_dir)
        sched = SimpleScheduler(
            runner=runner,
            max_concurrent=ap.settings.general.max_concurrent_jobs,
            poll_interval=ap.settings.general.poll_interval)
        sched.start()
        jobs = []
    # Top-level dir for undetermined reads
    if primary_illumina_data.undetermined.dirn != \
       primary_illumina_data.unaligned_dir:
        undetermined_dir = os.path.basename(
            primary_illumina_data.undetermined.dirn)
    else:
        undetermined_dir = None
    # Do sanity checks before proceeding
    print("Checking primary data directory")
    fmt = primary_illumina_data.format
    paired_end = primary_illumina_data.paired_end
    no_lane_splitting = (len(primary_illumina_data.lanes) == 1) \
                        and (primary_illumina_data.lanes[0] is None)
    print("* Format: %s" % fmt)
    print("* no-lane-splitting: %s" % ('yes' if no_lane_splitting else 'no'))
    print("* paired-end: %s" % ('yes' if paired_end else 'no'))
    print("* undetermined dir: %s" % undetermined_dir)
    consistent_data = True
    for unaligned_dir in unaligned_dirs:
        illumina_data = unaligned_dirs[unaligned_dir]
        fmt0 = illumina_data.format
        no_lane_splitting0 = (len(illumina_data.lanes) == 1) \
                             and (illumina_data.lanes[0] is None)
        if (fmt0 != fmt) or (no_lane_splitting0 != no_lane_splitting):
            print("!!! %s: inconsistent format to primary data dir !!!"
                  % unaligned_dir)
            consistent_data = False
    if not consistent_data:
        raise Exception("Data directories not consistent with primary "
                        "dir '%s'" % primary_unaligned_dir)
    # Collect the projects from the extra directories
    projects = []
    undetermined = []
    for unaligned_dir in unaligned_dirs:
        print("Examining projects in %s:" % unaligned_dir)
        illumina_data = unaligned_dirs[unaligned_dir]
        for project in illumina_data.projects:
            if not list(filter(lambda p: p.name == project.name, projects)):
                print("- %s: will be merged in" % project.name)
                projects.append(project)
            else:
                raise Exception("collision: %s already exists"
                                % project.name)
        # Deal with undetermined reads
        if illumina_data.undetermined is not None:
            print("Examining undetermined samples:")
            if no_lane_splitting:
                # No lane info: should merge undetermined fastqs
                for sample in illumina_data.undetermined.samples:
                    print("- %s: reads will be concatenated" % sample.name)
                    undetermined.append(sample)
            else:
                for sample in illumina_data.undetermined.samples:
                    if not list(filter(lambda s: s.name == sample.name,
                                       undetermined)):
                        print("- %s: will be merged in" % sample.name)
                        undetermined.append(sample)
                    else:
                        raise Exception("collision: %s already exists"
                                        % sample.name)
        else:
            print("No undetermined samples")
    # Collect any remaining projects from the primary
    # unaligned directory
    print("Examining projects in primary dir %s:" % primary_unaligned_dir)
    for project in primary_illumina_data.projects:
        if not list(filter(lambda p: p.name == project.name, projects)):
            print("- %s: will be merged in" % project.name)
            projects.append(project)
        else:
            print("- %s: already exists, will be discarded" % project.name)
    # Sort out the undetermined reads
    print("Examining undetermined samples:")
    if no_lane_splitting:
        # No lane info: should merge undetermined fastqs
        for sample in primary_illumina_data.undetermined.samples:
            print("- %s: reads will be concatenated" % sample.name)
            undetermined.insert(0, sample)
    else:
        for sample in primary_illumina_data.undetermined.samples:
            if not list(filter(lambda s: s.name == sample.name,
                               undetermined)):
                print("- %s: will be merged in" % sample.name)
                undetermined.insert(0, sample)
            else:
                print("- %s: already exists, will be discarded"
                      % sample.name)
    # Make a new directory for the merging
    merge_dir = os.path.join(ap.analysis_dir, output_dir + ".new")
    if undetermined_dir is not None:
        merge_undetermined_dir = os.path.join(merge_dir, undetermined_dir)
    else:
        merge_undetermined_dir = merge_dir
    if not dry_run:
        print("Making temporary merge directory %s" % merge_dir)
        mkdir(merge_dir)
        if not os.path.exists(merge_undetermined_dir):
            print("Making directory for undetermined %s"
                  % merge_undetermined_dir)
            mkdir(merge_undetermined_dir)
    # Copy the projects
    print("Importing projects:")
    for project in projects:
        print("- %s" % project.name)
        project_dir = os.path.join(merge_dir,
                                   os.path.basename(project.dirn))
        cmd = copytree_command(project.dirn, project_dir)
        print("- Running %s" % cmd)
        if not dry_run:
            job = sched.submit(cmd,
                               name="copy_project.%s" % project.name,
                               wd=merge_dir)
            print("Job: %s" % job)
            jobs.append(job)
    # Handle the undetermined reads
    print("Dealing with undetermined reads:")
    if no_lane_splitting:
        # No lane info: merge undetermined fastqs
        if len(undetermined) == 1:
            # Only one undetermined sample - copy Fastqs
            for read in (1, 2):
                if read == 2 and not paired_end:
                    break
                fastqs = undetermined[0].fastq_subset(read_number=read,
                                                      full_path=True)
                for fq in fastqs:
                    cmd = copy_command(fq, merge_undetermined_dir)
                    print("- Running %s" % cmd)
                    if not dry_run:
                        job = sched.submit(
                            cmd,
                            name="copy_undetermined.R%s" % read,
                            wd=merge_dir)
                        print("Job: %s" % job)
                        jobs.append(job)
        else:
            # Multiple undetermined samples - concat Fastqs
            for read in (1, 2):
                if read == 2 and not paired_end:
                    break
                cmd = Command('concat_fastqs.py')
                for sample in undetermined:
                    fastqs = sample.fastq_subset(read_number=read,
                                                 full_path=True)
                    cmd.add_args(*fastqs)
                cmd.add_args(os.path.join(
                    merge_undetermined_dir,
                    "Undetermined_S0_R%s_001.fastq.gz" % read))
                print("- Running %s" % cmd)
                if not dry_run:
                    job = sched.submit(cmd,
                                       name="merge_undetermined.R%s" % read,
                                       wd=merge_dir)
                    print("Job: %s" % job)
                    jobs.append(job)
    else:
        for sample in undetermined:
            print("- %s" % sample.name)
            if fmt == "bcl2fastq2":
                # Hardlink copy fastqs directly
                sample_dir = merge_undetermined_dir
                if not dry_run:
                    for fq in sample.fastq:
                        src_fq = os.path.join(sample.dirn, fq)
                        dst_fq = os.path.join(sample_dir, fq)
                        os.link(src_fq, dst_fq)
            else:
                # Just copy directory tree wholesale
                sample_dir = os.path.join(merge_undetermined_dir,
                                          os.path.basename(sample.dirn))
                cmd = copytree_command(sample.dirn, sample_dir)
                print("- Running %s" % cmd)
                if not dry_run:
                    job = sched.submit(
                        cmd,
                        name="copy_sample_dir.%s" % sample.name,
                        wd=merge_dir)
                    print("Job: %s" % job.name)
                    jobs.append(job)
    # Make expected subdirs for bcl2fastq2
    if not dry_run and fmt == "bcl2fastq2":
        for dirn in ('Reports', 'Stats'):
            mkdir(os.path.join(merge_dir, dirn))
            # Add a hidden placeholder to preserve these directories
            # on rsync -m (prune empty dirs)
            with open(os.path.join(merge_dir, dirn, '.placeholder'),
                      'w') as fp:
                fp.write("")
    # Wait for scheduler jobs to complete
    if not dry_run:
        sched.wait()
        sched.stop()
        # Check job exit status
        exit_status = 0
        for j in jobs:
            exit_status += j.exit_status
            if j.exit_status != 0:
                logger.warning("Job failed: %s" % j)
        if exit_status:
            logger.critical("One or more jobs failed (non-zero "
                            "exit status)")
            return exit_status
    # Move all the 'old' directories out of the way
    all_unaligned = [u for u in unaligned_dirs]
    all_unaligned.append(primary_unaligned_dir)
    for unaligned_dir in all_unaligned:
        unaligned_backup = os.path.join(ap.analysis_dir,
                                        "save.%s" % unaligned_dir)
        print("Moving %s to %s" % (unaligned_dir, unaligned_backup))
        if not dry_run:
            shutil.move(os.path.join(ap.analysis_dir, unaligned_dir),
                        unaligned_backup)
    # Rename the merged directory
    print("Renaming %s to %s" % (merge_dir, output_dir))
    if not dry_run:
        shutil.move(merge_dir, os.path.join(ap.analysis_dir, output_dir))
    # Reset the bcl2fastq dir
    if not dry_run:
        ap.params['unaligned_dir'] = output_dir
    # Make a new 'projects.info' metadata file
    project_metadata_file = os.path.join(ap.analysis_dir, 'projects.info')
    if os.path.exists(project_metadata_file):
        print("Moving existing projects.info file out of the way")
        if not dry_run:
            os.rename(project_metadata_file,
                      os.path.join(ap.analysis_dir, 'save.projects.info'))
    print("Creating new projects.info file")
    if not dry_run:
        ap.make_project_metadata_file()
    return 0

def create_directory(self, illumina_project=None, fastqs=None,
                     fastq_dir=None, short_fastq_names=False,
                     link_to_fastqs=False):
    """Create and populate analysis directory for an IlluminaProject

    Creates a new directory corresponding to the AnalysisProject
    object, and optionally also populates it with links to FASTQ
    files from a supplied IlluminaProject object.

    The directory structure it creates is:

    dir/
       fastqs/
       logs/
       ScriptCode/

    It also creates an info file with metadata about the project.

    Arguments:
      illumina_project: (optional) populated IlluminaProject object
        from which the analysis directory will be populated
      fastqs: (optional) list of fastq files to import
      fastq_dir: (optional) name of subdirectory to put fastq files
        into; defaults to 'fastqs'
      short_fastq_names: (optional) if True then transform fastq file
        names to be the shortest possible unique names; if False
        (default) then use the original fastq names
      link_to_fastqs: (optional) if True then make symbolic links to
        the fastq files; if False (default) then make hard links
    """
    logger.debug("Creating analysis directory for project '%s'" % self.name)
    # Check for & create directory
    if os.path.exists(self.dirn):
        logger.warning("Directory %s already exists" % self.dirn)
    else:
        logger.debug("Making analysis directory %s" % self.dirn)
        bcf_utils.mkdir(self.dirn, mode=0775)
    # Make a 'ScriptCode' directory
    scriptcode_dir = os.path.join(self.dirn, "ScriptCode")
    bcf_utils.mkdir(scriptcode_dir, mode=0775)
    # Put a file in ScriptCode to make sure it's
    # not pruned on subsequent rsync operations
    fp = open(os.path.join(self.dirn, 'ScriptCode', 'README.txt'), 'w')
    fp.write("The ScriptCode directory is a place to put custom "
             "scripts and programs")
    fp.close()
    # Make a 'fastqs' directory
    if fastq_dir is None:
        fastq_dir = "fastqs"
    fastq_dir = os.path.join(self.dirn, fastq_dir)
    bcf_utils.mkdir(fastq_dir, mode=0775)
    # Check for & create links to fastq files
    if fastqs is None:
        # Make a list of fastqs to import from the supplied
        # IlluminaProject object
        fastqs = []
        if illumina_project is not None:
            for sample in illumina_project.samples:
                for fastq in sample.fastq:
                    fastqs.append(os.path.join(sample.dirn, fastq))
    if short_fastq_names:
        # Get mapping to (shortened) unique names
        fastq_names = IlluminaData.get_unique_fastq_names(fastqs)
    else:
        # Use full names
        fastq_names = {}
        for fq in fastqs:
            fastq_names[fq] = os.path.basename(fq)
    for fastq in fastqs:
        target_fq = os.path.join(fastq_dir, fastq_names[fastq])
        if os.path.exists(target_fq):
            logger.warning("Target '%s' already exists" % target_fq)
        else:
            if link_to_fastqs:
                logger.debug("Making symlink to %s" % fastq)
                bcf_utils.mklink(fastq, target_fq, relative=True)
            else:
                logger.debug("Making hard link to %s" % fastq)
                os.link(fastq, target_fq)
    # Populate
    self.populate(fastq_dir=os.path.basename(fastq_dir))
    # Update metadata: primary fastq dir
    self.info['primary_fastq_dir'] = os.path.relpath(fastq_dir, self.dirn)
    # Update metadata: sample summary
    self.info['samples'] = self.sample_summary()
    # Save metadata
    self.info.save(self.info_file)

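# Usage sketch for create_directory() above. The AnalysisProject name,
# path and Fastq list are hypothetical; hard links are made unless
# 'link_to_fastqs=True'.
project = AnalysisProject("AB", "/data/analysis/AB")
project.create_directory(
    fastqs=["/data/fastqs/AB1_S1_R1_001.fastq.gz",
            "/data/fastqs/AB1_S1_R2_001.fastq.gz"],
    link_to_fastqs=True)
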
def setup_qc_dir(self, qc_dir=None, fastq_dir=None):
    """
    Set up a QC outputs directory

    Creates a QC outputs directory with a metadata
    file 'qc.info'.

    Arguments:
      qc_dir (str): path to QC outputs directory to set up.
        If a relative path is supplied then it is assumed to
        be relative to the analysis project directory. If
        'None' then defaults to the current 'qc_dir' for the
        project.
      fastq_dir (str): set the associated source Fastq
        directory (optional). If 'None' then defaults to the
        previously associated fastq_dir for the QC dir (or
        the current 'fastq_dir' for the project if that isn't
        set).

    Returns:
      String: full path to the QC directory.

    Raises:
      Exception: if the previously stored Fastq source dir
        doesn't match the one supplied via 'fastq_dir'.
    """
    print "Setting up QC directory"
    if qc_dir is None:
        qc_dir = os.path.relpath(self.qc_dir, self.dirn)
        print "Assuming default QC dir: %s" % qc_dir
    if not os.path.isabs(qc_dir):
        qc_dir = os.path.join(self.dirn, qc_dir)
    if not os.path.exists(qc_dir):
        print "Creating QC dir: %s" % qc_dir
        bcf_utils.mkdir(qc_dir, mode=0775)
    else:
        print "QC dir already exists: %s" % qc_dir
    # Set up metadata
    qc_info = self.qc_info(qc_dir)
    print "qc_dir            : %s" % qc_dir
    print "Supplied fastq_dir: %s" % fastq_dir
    print "Stored fastq_dir  : %s" % qc_info.fastq_dir
    if fastq_dir is None:
        if qc_info.fastq_dir is not None:
            fastq_dir = qc_info.fastq_dir
            print "Using stored Fastq dir for this QC dir"
        else:
            fastq_dir = os.path.relpath(self.fastq_dir, self.dirn)
            print "Assuming default Fastq dir: %s" % fastq_dir
    if qc_info.fastq_dir is not None:
        if qc_info.fastq_dir != fastq_dir:
            raise Exception("Project '%s': supplied Fastq dir ('%s') "
                            "differs from stored dir ('%s') for QC "
                            "dir '%s'" % (self.name,
                                          fastq_dir,
                                          qc_info.fastq_dir,
                                          qc_dir))
    print "Setting associated Fastq dir: %s" % fastq_dir
    qc_info['fastq_dir'] = fastq_dir
    qc_info.save()
    # Return the path to the QC directory
    return qc_dir

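# Usage sketch for setup_qc_dir() above. 'project' is assumed to be an
# AnalysisProject instance and the directory names are hypothetical;
# a relative QC dir is resolved against the project directory and the
# associated Fastq dir is recorded in 'qc.info'.
qc_dir = project.setup_qc_dir(qc_dir="qc.trimmed",
                              fastq_dir="fastqs.trimmed")
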
def run(self, nthreads=None, fastq_strand_indexes=None,
        fastq_subset=None, working_dir=None, log_file=None,
        batch_size=None, max_jobs=1, poll_interval=5,
        runners=None, default_runner=None, verbose=False):
    """
    Run the tasks in the pipeline

    Arguments:
      nthreads (int): number of threads/processors to use
        for QC jobs (defaults to 1)
      fastq_strand_indexes (dict): mapping of organism IDs
        to directories with STAR indexes
      fastq_subset (int): explicitly specify the subset size
        for subsetting the Fastqs
      working_dir (str): optional path to a working
        directory (defaults to a temporary directory in the
        current directory)
      log_file (str): path of a log file to write output to
      batch_size (int): if set then run commands in each task
        in batches, with each batch running this many commands
        at a time (default is to run one command per job)
      max_jobs (int): optional maximum number of concurrent
        jobs in the scheduler (defaults to 1)
      poll_interval (float): optional polling interval
        (seconds) to set in the scheduler (defaults to 5s)
      runners (dict): mapping of names to JobRunner
        instances; valid names are 'qc_runner',
        'report_runner', 'verify_runner', 'default'
      default_runner (JobRunner): optional default job runner
        to use
      verbose (bool): if True then report additional
        information for diagnostics
    """
    # Working directory
    clean_up_on_completion = False
    if working_dir is None:
        working_dir = tempfile.mkdtemp(prefix="__qc.",
                                       suffix=".tmp",
                                       dir=os.getcwd())
        clean_up_on_completion = True
    working_dir = os.path.abspath(working_dir)
    if not os.path.exists(working_dir):
        mkdir(working_dir)
    # Log and script directories
    log_dir = os.path.join(working_dir, "logs")
    scripts_dir = os.path.join(working_dir, "scripts")
    # Execute the pipeline
    status = Pipeline.run(self,
                          working_dir=working_dir,
                          log_dir=log_dir,
                          scripts_dir=scripts_dir,
                          log_file=log_file,
                          batch_size=batch_size,
                          exit_on_failure=PipelineFailure.DEFERRED,
                          params={
                              'nthreads': nthreads,
                              'fastq_subset': fastq_subset,
                              'fastq_strand_indexes':
                              fastq_strand_indexes,
                          },
                          max_jobs=max_jobs,
                          runners=runners,
                          default_runner=default_runner,
                          verbose=verbose)
    # Clean up working dir
    if status == 0 and clean_up_on_completion:
        shutil.rmtree(working_dir)
    # Return pipeline status
    return status

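# Usage sketch for the run() method above. The pipeline class name,
# index path and settings below are hypothetical placeholders; a
# non-zero return value indicates that one or more tasks failed.
ppl = QCPipeline()  # assumed pipeline subclass providing run()
status = ppl.run(nthreads=4,
                 fastq_strand_indexes={'human': '/data/indexes/STAR/hg38'},
                 max_jobs=8,
                 poll_interval=5,
                 verbose=True)
if status != 0:
    logging.critical("Pipeline failed (status %d)" % status)
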