def cellranger_info(path=None, name=None):
    """
    Retrieve information on the cellranger software

    If called without any arguments this will locate the first
    cellranger executable that is available on the user's PATH,
    and attempts to extract the version.

    Alternatively if the path to an executable is supplied then
    the version will be determined from that instead.

    If no version is identified then the script path is still
    returned, but without any version info.

    If a 'path' is supplied then the package name will be taken
    from the basename; otherwise the package name can be supplied
    via the 'name' argument. If neither are supplied then the
    package name defaults to 'cellranger'.

    Arguments:
      path (str): optional path to an executable to interrogate
      name (str): optional package name (defaults to the basename
        of 'path' if that is supplied, otherwise 'cellranger')

    Returns:
      Tuple: tuple consisting of (PATH,PACKAGE,VERSION) where PATH
        is the full path for the cellranger program, PACKAGE is
        'cellranger', and VERSION is the package version. If any
        value can't be determined then it will be returned as an
        empty string.
    """
    # Determine the package name
    if name is None:
        if path:
            name = os.path.basename(path)
        else:
            name = 'cellranger'
    package_name = name
    package_version = ''
    # Locate the core script
    if not path:
        # NB find_program returns None when no executable can be
        # located; normalise to an empty string so the documented
        # return contract holds and the basename check below
        # doesn't raise a TypeError
        cellranger_path = find_program(package_name) or ''
    else:
        cellranger_path = os.path.abspath(path)
    # Identify the version
    if cellranger_path and \
       os.path.basename(cellranger_path) == package_name:
        # Run the program to get the version
        version_cmd = Command(cellranger_path, '--version')
        output = version_cmd.subprocess_check_output()[1]
        for line in output.split('\n'):
            if line.startswith(package_name):
                # Extract version from line of the form
                # cellranger (2.0.1)
                try:
                    package_version = line.split('(')[-1].strip(')')
                except Exception as ex:
                    logger.warning("Unable to get version from '%s': %s" %
                                   (line, ex))
    else:
        # No package supplied or located
        logger.warning("Unable to identify cellranger package "
                       "from '%s'" % cellranger_path)
    # Return what we found
    return (cellranger_path, package_name, package_version)
def bclconvert_info(path=None):
    """
    Retrieve information on the bcl-convert software

    If called without any arguments this will locate the first
    bcl-convert executable that is available on the user's PATH.

    Alternatively if the path to an executable is supplied then
    the package name and version will be determined from that
    instead.

    If no package is identified then the script path is still
    returned, but without any version info.

    Arguments:
      path (str): optional path to an executable to interrogate

    Returns:
      Tuple: tuple consisting of (PATH,PACKAGE,VERSION) where PATH
        is the full path for the bcl-convert program, and PACKAGE
        and VERSION the package/version that it belongs to (PACKAGE
        will be 'BCL Convert' if a matching executable is located).
        If any value can't be determined then it will be returned
        as an empty string.
    """
    # Initialise
    package_name = ''
    package_version = ''
    # Locate the bcl-convert program
    if not path:
        # Normalise a None result from find_program to an empty
        # string, per the documented return contract
        bclconvert_path = bcf_utils.find_program('bcl-convert') or ''
    else:
        bclconvert_path = os.path.abspath(path)
    # Identify the version
    if bclconvert_path:
        # Run the located executable (not whatever 'bcl-convert'
        # happens to be found first on the PATH) to get the version
        version_cmd = Command(bclconvert_path, '-V')
        output = version_cmd.subprocess_check_output()[1]
        for line in output.split('\n'):
            if line.startswith('bcl-convert'):
                # Extract version from line of the form
                # bcl-convert Version 00.000.000.3.7.5
                package_name = 'BCL Convert'
                try:
                    package_version = '.'.join(line.split('.')[-3:])
                except Exception as ex:
                    logger.warning("Unable to get version from '%s': %s" %
                                   (line, ex))
    else:
        # No package supplied or located
        logger.warning("Unable to identify BCLConvert package from '%s'" %
                       bclconvert_path)
    # Return what we found
    return (bclconvert_path, package_name, package_version)
def info_func(p):
    """
    Return (exe,NAME,version) information for a program

    Locates the executable for 'p' via find_program, runs it
    with no arguments, and scans the output for a line starting
    with the program's basename; the second whitespace-delimited
    field of that line (if present) is taken as the version.

    Returns:
      Tuple: (executable path, upper-cased basename, version
        string — empty if no version could be extracted).
    """
    prog_name = os.path.basename(p)
    prog_exe = find_program(p)
    prog_version = ''
    cmd_output = Command(prog_exe).subprocess_check_output()[1]
    for output_line in cmd_output.split('\n'):
        if not output_line.startswith(prog_name):
            continue
        # First matching line only: take the second field as
        # the version, if there is one
        fields = output_line.split()
        if len(fields) > 1:
            prog_version = fields[1]
        break
    return (prog_exe, prog_name.upper(), prog_version)
def __init__(self, conda=None, env_dir=None, channels=None):
    """
    Create a new CondaWrapper instance

    Arguments:
      conda (str): path to conda executable
      env_dir (str): optional, non-default directory for
        conda environments
      channels (list): optional, list of non-default
        channels to use for installing packages
    """
    # Resolve the conda executable: search the PATH when no
    # explicit path was supplied
    self._conda = find_program("conda") if conda is None else conda
    self._conda_dir = None
    if self._conda:
        self._conda = os.path.abspath(self._conda)
        # Installation directory is two levels up from the
        # executable (i.e. CONDA_DIR/bin/conda)
        self._conda_dir = os.sep.join(self._conda.split(os.sep)[:-2])
    # Default location for environments
    if env_dir:
        env_dir = os.path.abspath(env_dir)
    elif self._conda_dir:
        env_dir = os.path.join(self._conda_dir, 'envs')
    self._env_dir = env_dir
    # Channels: explicit list, package defaults when unset,
    # or an empty list for any other falsy value
    if channels:
        self._channels = list(channels)
    elif channels is None:
        self._channels = DEFAULT_CONDA_CHANNELS
    else:
        self._channels = []
    # Lock for blocking operations
    self._lock_manager = ResourceLock()
# Screen files mammalian_conf = args.mammalian_conf if mammalian_conf is not None: mammalian_conf = os.path.abspath(mammalian_conf) contaminants_conf = args.contaminants_conf if contaminants_conf is not None: contaminants_conf = os.path.abspath(contaminants_conf) # Check for underlying programs required = ["fastq_screen"] if args.aligner is not None: required.append(args.aligner) else: logging.warning("Aligner not specified, cannot check") for prog in required: if find_program(prog) is None: logging.critical("couldn't find '%s'" % prog) sys.exit(1) # Make output dir if args.out_dir is not None: out_dir = os.path.abspath(args.out_dir) mkdir(out_dir) else: out_dir = os.getcwd() # Screen against 'mammalian' genomes tagged_fastq = fastq_screen_tag(mammalian_conf, fqr2, aligner=args.aligner, threads=args.threads,
def has_exe(self):
    """
    Check if the command executable exists
    """
    # The executable exists when find_program can resolve it
    located = find_program(self.command)
    return located is not None
def fastq_strand(argv, working_dir=None):
    """
    Driver for fastq_strand

    Generate strandedness statistics for single FASTQ or
    FASTQ pair, by running STAR using one or more genome
    indexes

    Arguments:
      argv (list): command line arguments
      working_dir (str): optional working directory to run
        STAR in (defaults to the current directory)

    Returns:
      Integer: 0 on success, 1 on error.
    """
    # Process command line
    p = argparse.ArgumentParser(
        description="Generate strandedness statistics "
        "for FASTQ or FASTQpair, by running STAR using "
        "one or more genome indexes")
    # NB the 'version' keyword of ArgumentParser was removed in
    # Python 3; an explicit version argument is required instead
    p.add_argument("-v", "--version",
                   action="version",
                   version=__version__)
    p.add_argument("r1", metavar="READ1",
                   default=None,
                   help="R1 Fastq file")
    p.add_argument("r2", metavar="READ2",
                   default=None,
                   nargs="?",
                   help="R2 Fastq file")
    p.add_argument("-g", "--genome",
                   dest="star_genomedirs", metavar="GENOMEDIR",
                   default=None,
                   action="append",
                   help="path to directory with STAR index "
                   "for genome to use (use as an alternative "
                   "to -c/--conf; can be specified multiple "
                   "times to include additional genomes)")
    p.add_argument("--subset",
                   type=int,
                   default=10000,
                   help="use a random subset of read pairs "
                   "from the input Fastqs; set to zero to "
                   "use all reads (default: 10000)")
    p.add_argument("-o", "--outdir",
                   default=None,
                   help="specify directory to write final "
                   "outputs to (default: current directory)")
    p.add_argument("-c", "--conf", metavar="FILE",
                   default=None,
                   help="specify delimited 'conf' file with "
                   "list of NAME and STAR index directory "
                   "pairs. NB if a conf file is supplied "
                   "then any indices specifed on the command "
                   "line will be ignored")
    p.add_argument("-n",
                   type=int,
                   default=1,
                   help="number of threads to run STAR with "
                   "(default: 1)")
    p.add_argument("--counts",
                   action="store_true",
                   help="include the count sums for "
                   "unstranded, 1st read strand aligned and "
                   "2nd read strand aligned in the output "
                   "file (default: only include percentages)")
    p.add_argument("--keep-star-output",
                   action="store_true",
                   help="keep the output from STAR (default: "
                   "delete outputs on completion)")
    args = p.parse_args(argv)
    # Print parameters
    print("READ1\t: %s" % args.r1)
    print("READ2\t: %s" % args.r2)
    # Check that STAR is on the path
    star_exe = find_program("STAR")
    if star_exe is None:
        logging.critical("STAR not found")
        return 1
    print("STAR\t: %s" % star_exe)
    # Gather genome indices
    genome_names = {}
    if args.conf is not None:
        print("Conf file\t: %s" % args.conf)
        star_genomedirs = []
        with open(args.conf, 'r') as fp:
            for line in fp:
                # Skip comment lines
                if line.startswith('#'):
                    continue
                name, star_genomedir = line.rstrip().split('\t')
                star_genomedirs.append(star_genomedir)
                # Store an associated name
                genome_names[star_genomedir] = name
    else:
        star_genomedirs = args.star_genomedirs
    if not star_genomedirs:
        logging.critical("No genome indices specified")
        return 1
    print("Genomes:")
    for genome in star_genomedirs:
        print("- %s" % genome)
    # Output directory
    if args.outdir is None:
        outdir = os.getcwd()
    else:
        outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        logging.critical("Output directory doesn't exist: %s" % outdir)
        return 1
    # Output file
    outfile = "%s_fastq_strand.txt" % os.path.join(
        outdir,
        os.path.basename(strip_ngs_extensions(args.r1)))
    if os.path.exists(outfile):
        logging.warning("Removing existing output file '%s'" % outfile)
        os.remove(outfile)
    # Prefix for temporary output
    prefix = "fastq_strand_"
    # Working directory
    if working_dir is None:
        working_dir = os.getcwd()
    else:
        working_dir = os.path.abspath(working_dir)
        if not os.path.isdir(working_dir):
            raise Exception("Bad working directory: %s" % working_dir)
    print("Working directory: %s" % working_dir)
    # Make subset of input read pairs
    nreads = sum(1 for i in getreads(os.path.abspath(args.r1)))
    print("%d reads" % nreads)
    if args.subset == 0:
        print("Using all read pairs in Fastq files")
        subset = nreads
    elif args.subset > nreads:
        print("Actual number of read pairs smaller than requested subset")
        subset = nreads
    else:
        subset = args.subset
        print("Using random subset of %d read pairs" % subset)
    if subset == nreads:
        subset_indices = list(range(nreads))
    else:
        # NB 'range' replaces the Python 2-only 'xrange'
        subset_indices = random.sample(range(nreads), subset)
    fqs_in = [fq for fq in (args.r1, args.r2) if fq is not None]
    fastqs = []
    for fq in fqs_in:
        fq_subset = os.path.join(working_dir, os.path.basename(fq))
        if fq_subset.endswith(".gz"):
            fq_subset = '.'.join(fq_subset.split('.')[:-1])
        fq_subset = "%s.subset.fq" % '.'.join(fq_subset.split('.')[:-1])
        with open(fq_subset, 'w') as fp:
            for read in getreads_subset(os.path.abspath(fq),
                                        subset_indices):
                fp.write('\n'.join(read) + '\n')
        fastqs.append(fq_subset)
    # Make directory to keep output from STAR
    if args.keep_star_output:
        star_output_dir = os.path.join(
            outdir,
            "STAR.%s.outputs" % os.path.basename(
                strip_ngs_extensions(args.r1)))
        print("Output from STAR will be copied to %s" % star_output_dir)
        # Check if directory already exists from earlier run
        if os.path.exists(star_output_dir):
            # Move out of the way
            i = 0
            backup_dir = "%s.bak" % star_output_dir
            while os.path.exists(backup_dir):
                i += 1
                backup_dir = "%s.bak%s" % (star_output_dir, i)
            logging.warning("Moving existing output directory to %s" %
                            backup_dir)
            os.rename(star_output_dir, backup_dir)
        # Make the directory
        os.mkdir(star_output_dir)
    # Write output to a temporary file
    # NB must be opened in text mode ('w+'): the default binary
    # mode would make the str writes below fail under Python 3
    with tempfile.TemporaryFile(mode='w+') as fp:
        # Iterate over genome indices
        for star_genomedir in star_genomedirs:
            # Basename for output for this genome
            try:
                name = genome_names[star_genomedir]
            except KeyError:
                name = star_genomedir
            # Build a command line to run STAR
            star_cmd = [star_exe]
            star_cmd.extend([
                '--runMode', 'alignReads',
                '--genomeLoad', 'NoSharedMemory',
                '--genomeDir', os.path.abspath(star_genomedir)
            ])
            star_cmd.extend(['--readFilesIn', fastqs[0]])
            if len(fastqs) > 1:
                star_cmd.append(fastqs[1])
            star_cmd.extend([
                '--quantMode', 'GeneCounts',
                '--outSAMtype', 'BAM', 'Unsorted',
                '--outSAMstrandField', 'intronMotif',
                '--outFileNamePrefix', prefix,
                '--runThreadN', str(args.n)
            ])
            print("Running %s" % ' '.join(star_cmd))
            try:
                subprocess.check_output(star_cmd, cwd=working_dir)
            except subprocess.CalledProcessError as ex:
                raise Exception("STAR returned non-zero exit code: %s" %
                                ex.returncode)
            # Save the outputs
            if args.keep_star_output:
                # Make a subdirectory for this genome index
                genome_dir = os.path.join(star_output_dir,
                                          name.replace(os.sep, "_"))
                print("Copying STAR outputs to %s" % genome_dir)
                os.mkdir(genome_dir)
                for f in os.listdir(working_dir):
                    if f.startswith(prefix):
                        shutil.copy(os.path.join(working_dir, f),
                                    os.path.join(genome_dir, f))
            # Process the STAR output
            star_tab_file = os.path.join(working_dir,
                                         "%sReadsPerGene.out.tab" % prefix)
            if not os.path.exists(star_tab_file):
                raise Exception("Failed to find .out file: %s" %
                                star_tab_file)
            sum_col2 = 0
            sum_col3 = 0
            sum_col4 = 0
            with open(star_tab_file) as out:
                for i, line in enumerate(out):
                    if i < 4:
                        # Skip first four lines
                        continue
                    # Process remaining delimited columns
                    cols = line.rstrip('\n').split('\t')
                    sum_col2 += int(cols[1])
                    sum_col3 += int(cols[2])
                    sum_col4 += int(cols[3])
            print("Sums:")
            print("- col2: %d" % sum_col2)
            print("- col3: %d" % sum_col3)
            print("- col4: %d" % sum_col4)
            if sum_col2 > 0.0:
                forward_1st = float(sum_col3) / float(sum_col2) * 100.0
                reverse_2nd = float(sum_col4) / float(sum_col2) * 100.0
            else:
                logging.warning("Sum of mapped reads is zero!")
                forward_1st = 0.0
                reverse_2nd = 0.0
            print("Strand percentages:")
            print("- 1st forward: %.2f%%" % forward_1st)
            print("- 2nd reverse: %.2f%%" % reverse_2nd)
            # Append to output file
            data = [name,
                    "%.2f" % forward_1st,
                    "%.2f" % reverse_2nd]
            if args.counts:
                data.extend([sum_col2, sum_col3, sum_col4])
            fp.write("%s\n" % "\t".join([str(d) for d in data]))
        # Finished iterating over genomes
        # Rewind temporary output file
        fp.seek(0)
        with open(outfile, 'w') as out:
            # Header
            out.write("#fastq_strand version: %s\t"
                      "#Aligner: %s\t"
                      "#Reads in subset: %s\n" % (__version__,
                                                  "STAR",
                                                  subset))
            columns = ["Genome", "1st forward", "2nd reverse"]
            if args.counts:
                columns.extend([
                    "Unstranded",
                    "1st read strand aligned",
                    "2nd read strand aligned"
                ])
            out.write("#%s\n" % "\t".join(columns))
            # Copy content from temp to final file
            for line in fp:
                out.write(line)
    return 0
def main():
    """
    Transfer copies of Fastq data from an analysis project

    Command line driver: copies Fastqs (plus optional README,
    downloader utility, zipped QC reports and tar.gz'ed
    10xGenomics pipeline outputs) from an analysis project to
    an arbitrary destination, via scheduled jobs.

    Returns:
      Integer: 0 (or None) on success, non-zero on error.
    """
    # Load configuration
    settings = Settings()
    # Collect defaults
    default_runner = settings.runners.rsync
    # Get pre-defined destinations
    destinations = [name for name in settings.destination]
    # Command line
    p = argparse.ArgumentParser(
        description="Transfer copies of Fastq data from an analysis "
        "project to an arbitrary destination for sharing with other "
        "people")
    p.add_argument('--version', action='version',
                   version=("%%(prog)s %s" % get_version()))
    p.add_argument('--subdir', action='store',
                   choices=('random_bin', 'run_id'),
                   default=None,
                   help="subdirectory naming scheme: 'random_bin' "
                   "locates a random pre-existing empty subdirectory "
                   "under the target directory; 'run_id' creates a "
                   "new subdirectory "
                   "'PLATFORM_DATESTAMP.RUN_ID-PROJECT'. If this "
                   "option is not set then no subdirectory will be "
                   "used")
    p.add_argument('--readme', action='store',
                   metavar='README_TEMPLATE', dest='readme_template',
                   help="template file to generate README file from; "
                   "can be full path to a template file, or the name "
                   "of a file in the 'templates' directory")
    p.add_argument('--weburl', action='store',
                   help="base URL for webserver (sets the value of "
                   "the WEBURL variable in the template README)")
    p.add_argument('--include_downloader', action='store_true',
                   help="copy the 'download_fastqs.py' utility to the "
                   "final location")
    p.add_argument('--include_qc_report', action='store_true',
                   help="copy the zipped QC reports to the final "
                   "location")
    p.add_argument('--include_10x_outputs', action='store_true',
                   help="copy outputs from 10xGenomics pipelines (e.g. "
                   "'cellranger count') to the final location")
    p.add_argument('--link', action='store_true',
                   help="hard link files instead of copying")
    p.add_argument('--runner', action='store',
                   help="specify the job runner to use for executing "
                   "the checksumming, Fastq copy and tar gzipping "
                   "operations (defaults to job runner defined for "
                   "copying in config file [%s])" % default_runner)
    p.add_argument('dest', action='store', metavar="DEST",
                   help="destination to copy Fastqs to; can be the "
                   "name of a destination defined in the configuration "
                   "file, or an arbitrary location of the form "
                   "'[[USER@]HOST:]DIR' (%s)" %
                   (("available destinations: %s" %
                     (','.join("'%s'" % d
                               for d in sorted(destinations))))
                    if destinations else
                    "no destinations currently defined"))
    p.add_argument('project', action='store', metavar="PROJECT",
                   help="path to project directory (or to a Fastqs "
                   "subdirectory in a project) to copy Fastqs from")
    # Process command line
    args = p.parse_args()
    # Check if target is pre-defined destination
    if args.dest in destinations:
        print("Loading settings for destination '%s'" % args.dest)
        dest = settings.destination[args.dest]
        target_dir = dest.directory
        readme_template = dest.readme_template
        subdir = dest.subdir
        include_downloader = dest.include_downloader
        include_qc_report = dest.include_qc_report
        hard_links = dest.hard_links
        weburl = dest.url
    else:
        target_dir = args.dest
        readme_template = None
        subdir = None
        include_downloader = False
        include_qc_report = False
        hard_links = False
        weburl = None
    # Update defaults with command line values
    if args.readme_template:
        readme_template = args.readme_template
    if args.subdir:
        subdir = args.subdir
    if args.include_downloader:
        include_downloader = True
    if args.include_qc_report:
        include_qc_report = True
    if args.weburl:
        weburl = args.weburl
    if args.link:
        hard_links = args.link
    # Sort out project directory
    project = AnalysisProject(args.project)
    if not project.is_analysis_dir:
        # Assume it's the Fastq dir
        fastq_dir = os.path.basename(args.project)
        project = AnalysisProject(os.path.dirname(args.project))
    else:
        fastq_dir = None
    if not project.is_analysis_dir:
        logger.error("'%s': project not found" % args.project)
        return 1
    project_name = project.name
    # Parent analysis directory
    analysis_dir = AnalysisDir(os.path.dirname(project.dirn))
    # Fastqs directory
    try:
        project.use_fastq_dir(fastq_dir)
    except Exception as ex:
        logger.error("'%s': failed to load Fastq set '%s': %s" %
                     (project.name, fastq_dir, ex))
        return 1
    # Report
    print("Transferring data from '%s' (%s)" % (project.name,
                                                project.dirn))
    print("Fastqs in %s" % project.fastq_dir)
    # Summarise samples and Fastqs
    samples = set()
    nfastqs = 0
    fsize = 0
    for sample in project.samples:
        samples.add(sample.name)
        for fq in sample.fastq:
            fsize += os.lstat(fq).st_size
            nfastqs += 1
    nsamples = len(samples)
    dataset = "%s%s dataset" % ("%s " % project.info.single_cell_platform
                                if project.info.single_cell_platform
                                else '',
                                project.info.library_type)
    endedness = "paired-end" if project.info.paired_end else "single-end"
    print("%s with %d Fastqs from %d %s sample%s totalling %s" %
          (dataset, nfastqs, nsamples, endedness,
           's' if nsamples != 1 else '',
           format_file_size(fsize)))
    # Check target dir
    if not Location(target_dir).is_remote:
        target_dir = os.path.abspath(target_dir)
    if not exists(target_dir):
        print("'%s': target directory not found" % target_dir)
        # NB return explicit non-zero status on failure (was a
        # bare 'return', which reported success to the shell)
        return 1
    else:
        print("Target directory %s" % target_dir)
    # Locate downloader
    if include_downloader:
        print("Locating downloader for inclusion")
        downloader = find_program("download_fastqs.py")
        if downloader is None:
            logging.error("Unable to locate download_fastqs.py")
            return 1
        print("... found %s" % downloader)
    else:
        downloader = None
    # Locate zipped QC report
    if include_qc_report:
        print("Locating zipped QC reports for inclusion")
        qc_zips = list()
        # Check QC directories and look for zipped reports
        for qc_dir in project.qc_dirs:
            # Get the associated Fastq set
            # NB only compare the basename of the Fastq dir
            # in case full paths weren't updated
            fq_set = os.path.basename(project.qc_info(qc_dir).fastq_dir)
            if fq_set == os.path.basename(project.fastq_dir):
                for qc_base in (
                        "%s_report.%s.%s" %
                        (qc_dir, project.name, project.info.run),
                        "%s_report.%s.%s" %
                        (qc_dir, project.name,
                         os.path.basename(analysis_dir.analysis_dir)),
                ):
                    qc_zip = os.path.join(project.dirn,
                                          "%s.zip" % qc_base)
                    if os.path.exists(qc_zip):
                        print("... found %s" % qc_zip)
                        qc_zips.append(qc_zip)
        if not qc_zips:
            logger.error("No zipped QC reports found")
            return 1
    else:
        qc_zips = None
    # Locate 10xGenomics outputs
    if args.include_10x_outputs:
        print("Locating outputs from 10xGenomics pipelines for "
              "inclusion")
        cellranger_dirs = list()
        for d in (
                'cellranger_count',
                'cellranger_multi',
        ):
            cellranger_dir = os.path.join(project.dirn, d)
            if os.path.isdir(cellranger_dir):
                print("... found %s" % cellranger_dir)
                cellranger_dirs.append(cellranger_dir)
        if not cellranger_dirs:
            logger.error("No outputs from 10xGenomics pipelines found")
            return 1
    else:
        cellranger_dirs = None
    # Determine subdirectory
    if subdir == "random_bin":
        # Find a random empty directory under the
        # target directory
        print("Locating random empty bin")
        subdirs = [
            d for d in os.listdir(target_dir)
            if os.path.isdir(os.path.join(target_dir, d))
        ]
        if not subdirs:
            print("Failed to locate subdirectories")
            return 1
        shuffle(subdirs)
        subdir = None
        for d in subdirs:
            if not os.listdir(os.path.join(target_dir, d)):
                # Empty bin
                subdir = d
                break
        if subdir is None:
            print("Failed to locate empty subdirectory")
            return 1
        print("... found '%s'" % subdir)
        # Update target dir
        target_dir = os.path.join(target_dir, subdir)
    elif subdir == "run_id":
        # Construct subdirectory name based on the
        # run ID
        subdir = "{platform}_{datestamp}.{run_number}-{project}".format(
            platform=analysis_dir.metadata.platform.upper(),
            datestamp=analysis_dir.metadata.instrument_datestamp,
            run_number=analysis_dir.metadata.run_number,
            project=project.name)
        # Check it doesn't already exist
        if exists(os.path.join(target_dir, subdir)):
            logger.error("'%s': subdirectory already exists" % subdir)
            return 1
        print("Using subdirectory '%s'" % subdir)
        # Update target dir
        target_dir = os.path.join(target_dir, subdir)
    # Make target directory
    if not exists(target_dir):
        mkdir(target_dir)
    # Get runner for copy job
    if args.runner:
        runner = fetch_runner(args.runner)
    else:
        runner = default_runner
    # Set identifier for jobs
    job_id = "%s%s" % (project_name,
                       (".%s" % fastq_dir
                        if fastq_dir is not None else ''))
    # Set the working directory
    working_dir = os.path.abspath("transfer.%s.%s" % (job_id,
                                                      int(time.time())))
    mkdir(working_dir)
    print("Created working dir %s" % working_dir)
    # Construct the README
    if readme_template:
        # Check that template file exists
        print("Locating README template")
        template = None
        for filen in (
                readme_template,
                os.path.join(get_templates_dir(), readme_template),
        ):
            if os.path.exists(filen):
                template = filen
                break
        if template is None:
            logger.error("'%s': template file not found" %
                         readme_template)
            return 1
        else:
            readme_template = template
        print("... found %s" % readme_template)
        # Read in template
        with open(readme_template, 'rt') as fp:
            readme = fp.read()
        # Substitute template variables
        template_vars = {
            'PLATFORM': analysis_dir.metadata.platform.upper(),
            'RUN_NUMBER': analysis_dir.metadata.run_number,
            'DATESTAMP': analysis_dir.metadata.instrument_datestamp,
            'PROJECT': project_name,
            'WEBURL': weburl,
            'BIN': subdir,
            'DIR': target_dir,
            'TODAY': date.today().strftime("%d/%m/%Y"),
        }
        for var in template_vars:
            value = template_vars[var]
            if value is None:
                value = '?'
            else:
                value = str(value)
            readme = re.sub(r"%{var}%".format(var=var), value, readme)
        # Write out a temporary README file
        readme_file = os.path.join(working_dir, "README")
        with open(readme_file, 'wt') as fp:
            fp.write(readme)
    else:
        # No README
        readme_file = None
    # Start a scheduler to run jobs
    sched = SimpleScheduler(runner=runner,
                            reporter=TransferDataSchedulerReporter(),
                            poll_interval=settings.general.poll_interval)
    sched.start()
    # Build command to run manage_fastqs.py
    copy_cmd = Command("manage_fastqs.py")
    if hard_links:
        copy_cmd.add_args("--link")
    copy_cmd.add_args(analysis_dir.analysis_dir, project_name)
    if fastq_dir is not None:
        copy_cmd.add_args(fastq_dir)
    copy_cmd.add_args("copy", target_dir)
    print("Running %s" % copy_cmd)
    copy_job = sched.submit(copy_cmd.command_line,
                            name="copy.%s" % job_id,
                            wd=working_dir)
    # Copy README
    if readme_file is not None:
        print("Copying README file")
        copy_cmd = copy_command(readme_file,
                                os.path.join(target_dir, "README"))
        sched.submit(copy_cmd.command_line,
                     name="copy.%s.readme" % job_id,
                     runner=SimpleJobRunner(),
                     wd=working_dir)
    # Copy download_fastqs.py
    if downloader:
        print("Copying downloader")
        copy_cmd = copy_command(
            downloader,
            os.path.join(target_dir, os.path.basename(downloader)))
        sched.submit(copy_cmd.command_line,
                     name="copy.%s.downloader" % job_id,
                     runner=SimpleJobRunner(),
                     wd=working_dir)
    # Copy QC reports
    if qc_zips:
        for qc_zip in qc_zips:
            print("Copying '%s'" % os.path.basename(qc_zip))
            copy_cmd = copy_command(
                qc_zip,
                os.path.join(target_dir, os.path.basename(qc_zip)),
                link=hard_links)
            sched.submit(copy_cmd.command_line,
                         name="copy.%s.%s" % (job_id,
                                              os.path.basename(qc_zip)),
                         runner=SimpleJobRunner(),
                         wd=working_dir)
    # Tar and copy 10xGenomics outputs
    if cellranger_dirs:
        for cellranger_dir in cellranger_dirs:
            print("Tar gzipping and copying '%s'" %
                  os.path.basename(cellranger_dir))
            # Tar & gzip data
            targz = os.path.join(
                working_dir,
                "%s.%s.%s.tgz" % (os.path.basename(cellranger_dir),
                                  project_name,
                                  project.info.run))
            targz_cmd = Command("tar",
                                "czvhf", targz,
                                "-C", os.path.dirname(cellranger_dir),
                                os.path.basename(cellranger_dir))
            print("Running %s" % targz_cmd)
            targz_job = sched.submit(
                targz_cmd.command_line,
                name="targz.%s.%s" % (job_id,
                                      os.path.basename(cellranger_dir)),
                wd=working_dir)
            # Copy the targz file
            copy_cmd = copy_command(
                targz,
                os.path.join(target_dir, os.path.basename(targz)))
            print("Running %s" % copy_cmd)
            # NB don't assign to 'copy_job' here: that would shadow
            # the Fastq copy job whose exit code is checked below
            sched.submit(copy_cmd.command_line,
                         name="copytgz.%s.%s" %
                         (job_id, os.path.basename(cellranger_dir)),
                         runner=SimpleJobRunner(),
                         wd=working_dir,
                         wait_for=(targz_job.job_name, ))
    # Wait for scheduler jobs to complete
    sched.wait()
    # Check exit code for Fastq copying
    exit_code = copy_job.exit_code
    if exit_code != 0:
        logger.error("File copy exited with an error")
        return exit_code
    else:
        print("Files now at %s" % target_dir)
        if weburl:
            url = weburl
            if subdir is not None:
                url = os.path.join(url, subdir)
            print("URL: %s" % url)
    print("Done")
    return 0
def fastq_strand(argv,working_dir=None):
    """
    Driver for fastq_strand

    Generate strandedness statistics for single FASTQ or
    FASTQ pair, by running STAR using one or more genome
    indexes

    NB this is the legacy Python 2 implementation ('print'
    statements, 'xrange', argparse 'version' keyword).

    Arguments:
      argv (list): command line arguments
      working_dir (str): optional working directory to run
        STAR in (defaults to the current directory)

    Returns:
      Integer: 0 on success, 1 on error.
    """
    # Process command line
    p = argparse.ArgumentParser(
        description="Generate strandedness statistics "
        "for FASTQ or FASTQpair, by running STAR using "
        "one or more genome indexes",
        version=__version__)
    p.add_argument("r1",metavar="READ1",
                   default=None,
                   help="R1 Fastq file")
    p.add_argument("r2",metavar="READ2",
                   default=None,
                   nargs="?",
                   help="R2 Fastq file")
    p.add_argument("-g","--genome",
                   dest="star_genomedirs",metavar="GENOMEDIR",
                   default=None,
                   action="append",
                   help="path to directory with STAR index "
                   "for genome to use (use as an alternative "
                   "to -c/--conf; can be specified multiple "
                   "times to include additional genomes)")
    p.add_argument("--subset",
                   type=int,
                   default=10000,
                   help="use a random subset of read pairs "
                   "from the input Fastqs; set to zero to "
                   "use all reads (default: 10000)")
    p.add_argument("-o","--outdir",
                   default=None,
                   help="specify directory to write final "
                   "outputs to (default: current directory)")
    p.add_argument("-c","--conf",metavar="FILE",
                   default=None,
                   help="specify delimited 'conf' file with "
                   "list of NAME and STAR index directory "
                   "pairs. NB if a conf file is supplied "
                   "then any indices specifed on the command "
                   "line will be ignored")
    p.add_argument("-n",
                   type=int,
                   default=1,
                   help="number of threads to run STAR with "
                   "(default: 1)")
    p.add_argument("--counts",
                   action="store_true",
                   help="include the count sums for "
                   "unstranded, 1st read strand aligned and "
                   "2nd read strand aligned in the output "
                   "file (default: only include percentages)")
    p.add_argument("--keep-star-output",
                   action="store_true",
                   help="keep the output from STAR (default: "
                   "delete outputs on completion)")
    args = p.parse_args(argv)
    # Print parameters
    print "READ1\t: %s" % args.r1
    print "READ2\t: %s" % args.r2
    # Check that STAR is on the path
    star_exe = find_program("STAR")
    if star_exe is None:
        logging.critical("STAR not found")
        return 1
    print "STAR\t: %s" % star_exe
    # Gather genome indices
    genome_names = {}
    if args.conf is not None:
        print "Conf file\t: %s" % args.conf
        star_genomedirs = []
        with open(args.conf,'r') as fp:
            for line in fp:
                # Skip comment lines in the conf file
                if line.startswith('#'):
                    continue
                name,star_genomedir = line.rstrip().split('\t')
                star_genomedirs.append(star_genomedir)
                # Store an associated name
                genome_names[star_genomedir] = name
    else:
        star_genomedirs = args.star_genomedirs
    if not star_genomedirs:
        logging.critical("No genome indices specified")
        return 1
    print "Genomes:"
    for genome in star_genomedirs:
        print "- %s" % genome
    # Output directory
    if args.outdir is None:
        outdir = os.getcwd()
    else:
        outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        logging.critical("Output directory doesn't exist: %s" % outdir)
        return 1
    # Output file
    outfile = "%s_fastq_strand.txt" % os.path.join(
        outdir,
        os.path.basename(strip_ngs_extensions(args.r1)))
    if os.path.exists(outfile):
        logging.warning("Removing existing output file '%s'" % outfile)
        os.remove(outfile)
    # Prefix for temporary output
    prefix = "fastq_strand_"
    # Working directory
    if working_dir is None:
        working_dir = os.getcwd()
    else:
        working_dir = os.path.abspath(working_dir)
        if not os.path.isdir(working_dir):
            raise Exception("Bad working directory: %s" % working_dir)
    print "Working directory: %s" % working_dir
    # Make subset of input read pairs
    nreads = sum(1 for i in getreads(os.path.abspath(args.r1)))
    print "%d reads" % nreads
    if args.subset == 0:
        print "Using all read pairs in Fastq files"
        subset = nreads
    elif args.subset > nreads:
        print "Actual number of read pairs smaller than requested subset"
        subset = nreads
    else:
        subset = args.subset
        print "Using random subset of %d read pairs" % subset
    if subset == nreads:
        subset_indices = [i for i in xrange(nreads)]
    else:
        subset_indices = random.sample(xrange(nreads),subset)
    fqs_in = filter(lambda fq: fq is not None,(args.r1,args.r2))
    fastqs = []
    for fq in fqs_in:
        fq_subset = os.path.join(working_dir,os.path.basename(fq))
        # Strip any .gz extension, then replace the final
        # extension with '.subset.fq' for the subsetted copy
        if fq_subset.endswith(".gz"):
            fq_subset = '.'.join(fq_subset.split('.')[:-1])
        fq_subset = "%s.subset.fq" % '.'.join(fq_subset.split('.')[:-1])
        with open(fq_subset,'w') as fp:
            for read in getreads_subset(os.path.abspath(fq),
                                        subset_indices):
                fp.write('\n'.join(read) + '\n')
        fastqs.append(fq_subset)
    # Make directory to keep output from STAR
    if args.keep_star_output:
        star_output_dir = os.path.join(outdir,
                                       "STAR.%s.outputs" %
                                       os.path.basename(
                                           strip_ngs_extensions(args.r1)))
        print "Output from STAR will be copied to %s" % star_output_dir
        # Check if directory already exists from earlier run
        if os.path.exists(star_output_dir):
            # Move out of the way
            i = 0
            backup_dir = "%s.bak" % star_output_dir
            while os.path.exists(backup_dir):
                i += 1
                backup_dir = "%s.bak%s" % (star_output_dir,i)
            logging.warning("Moving existing output directory to %s" %
                            backup_dir)
            os.rename(star_output_dir,backup_dir)
        # Make the directory
        os.mkdir(star_output_dir)
    # Write output to a temporary file
    with tempfile.TemporaryFile() as fp:
        # Iterate over genome indices
        for star_genomedir in star_genomedirs:
            # Basename for output for this genome
            try:
                name = genome_names[star_genomedir]
            except KeyError:
                name = star_genomedir
            # Build a command line to run STAR
            star_cmd = [star_exe]
            star_cmd.extend([
                '--runMode','alignReads',
                '--genomeLoad','NoSharedMemory',
                '--genomeDir',os.path.abspath(star_genomedir)])
            star_cmd.extend(['--readFilesIn',
                             fastqs[0]])
            if len(fastqs) > 1:
                star_cmd.append(fastqs[1])
            star_cmd.extend([
                '--quantMode','GeneCounts',
                '--outSAMtype','BAM','Unsorted',
                '--outSAMstrandField','intronMotif',
                '--outFileNamePrefix',prefix,
                '--runThreadN',str(args.n)])
            print "Running %s" % ' '.join(star_cmd)
            try:
                subprocess.check_output(star_cmd,cwd=working_dir)
            except subprocess.CalledProcessError as ex:
                raise Exception("STAR returned non-zero exit code: %s" %
                                ex.returncode)
            # Save the outputs
            if args.keep_star_output:
                # Make a subdirectory for this genome index
                genome_dir = os.path.join(star_output_dir,
                                          name.replace(os.sep,"_"))
                print "Copying STAR outputs to %s" % genome_dir
                os.mkdir(genome_dir)
                for f in os.listdir(working_dir):
                    if f.startswith(prefix):
                        shutil.copy(os.path.join(working_dir,f),
                                    os.path.join(genome_dir,f))
            # Process the STAR output
            star_tab_file = os.path.join(working_dir,
                                         "%sReadsPerGene.out.tab" % prefix)
            if not os.path.exists(star_tab_file):
                raise Exception("Failed to find .out file: %s" %
                                star_tab_file)
            sum_col2 = 0
            sum_col3 = 0
            sum_col4 = 0
            with open(star_tab_file) as out:
                for i,line in enumerate(out):
                    if i < 4:
                        # Skip first four lines
                        continue
                    # Process remaining delimited columns
                    cols = line.rstrip('\n').split('\t')
                    sum_col2 += int(cols[1])
                    sum_col3 += int(cols[2])
                    sum_col4 += int(cols[3])
            print "Sums:"
            print "- col2: %d" % sum_col2
            print "- col3: %d" % sum_col3
            print "- col4: %d" % sum_col4
            if sum_col2 > 0.0:
                forward_1st = float(sum_col3)/float(sum_col2)*100.0
                reverse_2nd = float(sum_col4)/float(sum_col2)*100.0
            else:
                logging.warning("Sum of mapped reads is zero!")
                forward_1st = 0.0
                reverse_2nd = 0.0
            print "Strand percentages:"
            print "- 1st forward: %.2f%%" % forward_1st
            print "- 2nd reverse: %.2f%%" % reverse_2nd
            # Append to output file
            data = [name,
                    "%.2f" % forward_1st,
                    "%.2f" % reverse_2nd]
            if args.counts:
                data.extend([sum_col2,sum_col3,sum_col4])
            fp.write("%s\n" % "\t".join([str(d) for d in data]))
        # Finished iterating over genomes
        # Rewind temporary output file
        fp.seek(0)
        with open(outfile,'w') as out:
            # Header
            out.write("#fastq_strand version: %s\t"
                      "#Aligner: %s\t"
                      "#Reads in subset: %s\n" % (__version__,
                                                  "STAR",
                                                  subset))
            columns = ["Genome","1st forward","2nd reverse"]
            if args.counts:
                columns.extend(["Unstranded",
                                "1st read strand aligned",
                                "2nd read strand aligned"])
            out.write("#%s\n" % "\t".join(columns))
            # Copy content from temp to final file
            for line in fp:
                out.write(line)
    return 0
def main():
    """
    QC reporter: verify QC outputs for one or more project/QC
    directories and (optionally) generate HTML and MultiQC reports.

    Exits with code 0 on success; non-zero if verification fails
    (without --force) or if report/MultiQC generation fails.
    """
    # Deal with command line
    p = argparse.ArgumentParser(description="Generate QC report for each "
                                "directory DIR")
    p.add_argument('-v', '--version', action='version',
                   version="%(prog)s " + __version__)
    p.add_argument('--protocol', action='store', dest='qc_protocol',
                   default=None,
                   help="explicitly specify QC protocol (must be one of "
                   "%s). Default is to determine the protocol "
                   "automatically (recommended)" %
                   str(','.join(["'%s'" % pr for pr in PROTOCOLS])))
    p.add_argument('--qc_dir', action='store', dest='qc_dir', default='qc',
                   help="explicitly specify QC output directory (nb if "
                   "supplied then the same QC_DIR will be used for each "
                   "DIR. Non-absolute paths are assumed to be relative to "
                   "DIR). Default: 'qc'")
    p.add_argument('--fastq_dir', action='store', dest='fastq_dir',
                   default=None,
                   help="explicitly specify subdirectory of DIRs with "
                   "Fastq files to run the QC on")
    reporting = p.add_argument_group('Reporting options')
    reporting.add_argument('-t', '--title', action='store', dest='title',
                           default=None,
                           help="title for output QC reports")
    reporting.add_argument('-f', '--filename', action='store',
                           dest='filename', default=None,
                           help="file name for output HTML QC report "
                           "(default: <DIR>/<QC_DIR>_report.html)")
    reporting.add_argument('--zip', action='store_true', dest='zip',
                           default=False,
                           help="make ZIP archive for the QC report")
    reporting.add_argument('--multiqc', action='store_true', dest='multiqc',
                           default=False,
                           help="generate MultiQC report")
    reporting.add_argument('--force', action='store_true', dest='force',
                           default=False,
                           help="force generation of reports even if "
                           "verification fails")
    data_dir_group = reporting.add_mutually_exclusive_group()
    data_dir_group.add_argument('--data-dir', action='store_true',
                                dest='use_data_dir',
                                help="create a data directory with copies "
                                "of QC artefacts needed for the HTML "
                                "report (NB data directory will always "
                                "be created for multi-project reports, "
                                "unless --no-data-dir is specified)")
    # Fixed help text: original read "don't a data directory" (missing
    # verb)
    data_dir_group.add_argument('--no-data-dir', action='store_true',
                                dest='no_data_dir',
                                help="don't create a data directory with "
                                "copies of QC artefacts (this is the "
                                "default except for multi-project reports)")
    verification = p.add_argument_group('Verification options')
    verification.add_argument('--verify', action='store_true', dest='verify',
                              help="verify the QC products only (don't "
                              "write the report); returns exit code 0 "
                              "if QC is verified, 1 if not")
    deprecated = p.add_argument_group('Deprecated options')
    deprecated.add_argument('-l', '--list-unverified', action='store_true',
                            dest='list_unverified', default=False,
                            help="deprecated: does nothing (Fastqs with "
                            "missing QC outputs can no longer be listed)")
    deprecated.add_argument('--strand_stats', action='store_true',
                            dest='fastq_strand', default=False,
                            help="deprecated: does nothing (strand stats "
                            "are automatically included if present)")
    p.add_argument('dirs', metavar="DIR", nargs='+',
                   help="directory to report QC for; can be a project "
                   "directory (in which case the default QC directory "
                   "will be reported), or a QC directory within a "
                   "project")
    args = p.parse_args()
    # Report name and version
    print("%s version %s" % (os.path.basename(sys.argv[0]), __version__))
    # Report arguments (quote any that contain spaces)
    if sys.argv[1:]:
        print("\n%s" % ' '.join(
            ['"%s"' % arg if ' ' in arg else arg
             for arg in sys.argv[1:]]))
    # Report working directory
    print("\nCWD %s" % os.getcwd())
    # Check for MultiQC if required
    if args.multiqc:
        if find_program("multiqc") is None:
            logging.critical("MultiQC report requested but 'multiqc' "
                             "not available")
            sys.exit(1)
    # Get projects and QC dirs from supplied directories
    projects = []
    for d in args.dirs:
        print("\n**** Examining directory %s ****" % d)
        # Check if directory is a QC dir
        qc_dir = None
        # Look for 'qc.info' in current directory
        if os.path.exists(os.path.join(os.path.abspath(d), 'qc.info')):
            print("...located 'qc.info', assuming this is QC dir")
            qc_dir = os.path.abspath(d)
            # Locate parent project dir
            metadata_file = locate_project_info_file(qc_dir)
            if metadata_file is not None:
                p = AnalysisProject(os.path.dirname(metadata_file))
                print("...located parent project: %s" % p.dirn)
            else:
                # Unable to locate project directory
                print("...failed to locate parent project metadata file")
                # Fall back to location of Fastq files
                qc_info = AnalysisProjectQCDirInfo(
                    os.path.join(qc_dir, 'qc.info'))
                if qc_info.fastq_dir is not None:
                    project_dir = os.path.abspath(qc_info.fastq_dir)
                    if os.path.basename(project_dir).startswith('fastqs'):
                        # Use the next level up
                        project_dir = os.path.dirname(project_dir)
                    # Fixed message: original had a stray extra space
                    # ("(from  Fastq dir)")
                    print("...putative parent project dir: %s (from "
                          "Fastq dir)" % project_dir)
                    p = AnalysisProject(project_dir)
                else:
                    # Failed to locate Fastqs
                    logging.fatal("Unable to locate parent project")
                    # Exit with an error
                    sys.exit(1)
            # Issue a warning if a QC dir was explicitly
            # specified on the command line
            if args.qc_dir is not None:
                logging.warning("--qc_dir has been ignored for this "
                                "directory")
        else:
            # Assume directory is a project
            p = AnalysisProject(os.path.abspath(d))
            print("...assuming this is a project dir")
            # Identify the QC directory
            if args.qc_dir is None:
                qc_dir = p.qc_dir
            else:
                qc_dir = args.qc_dir
            if not os.path.isabs(qc_dir):
                qc_dir = os.path.join(p.dirn, qc_dir)
            print("...QC directory: %s" % qc_dir)
        # Explicitly set the QC directory location
        p.use_qc_dir(qc_dir)
        # Locate the Fastq dir
        qc_info = p.qc_info(qc_dir)
        if args.fastq_dir is None:
            fastq_dir = qc_info.fastq_dir
            if fastq_dir is None:
                fastq_dir = p.fastq_dir
        else:
            fastq_dir = args.fastq_dir
            if qc_info.fastq_dir is not None:
                if os.path.join(p.dirn, qc_info.fastq_dir) != fastq_dir:
                    logging.warning("Stored fastq dir mismatch "
                                    "(%s != %s)" % (fastq_dir,
                                                    qc_info.fastq_dir))
        # NOTE(review): this reports p.fastq_dir (current default)
        # before switching to 'fastq_dir' below - confirm intended
        print("...using Fastqs dir: %s" % p.fastq_dir)
        p.use_fastq_dir(fastq_dir, strict=False)
        projects.append(p)
    # Verify QC for projects
    print("\n**** Verifying QC ****")
    retval = 0
    report_projects = []
    for p in projects:
        print("\nProject: %s" % p.name)
        print("-" * (len('Project: ') + len(p.name)))
        print("%d sample%s | %d fastq%s" % (
            len(p.samples), 's' if len(p.samples) != 1 else '',
            len(p.fastqs), 's' if len(p.fastqs) != 1 else '',
        ))
        # QC metadata
        qc_dir = p.qc_dir
        qc_info = p.qc_info(qc_dir)
        # Set QC protocol for verification
        if args.qc_protocol is None:
            protocol = qc_info.protocol
            if protocol is None:
                protocol = determine_qc_protocol(p)
        else:
            protocol = args.qc_protocol
        print("Verifying against QC protocol '%s'" % protocol)
        # Verification step
        if len(p.fastqs) == 0:
            logging.critical("No Fastqs!")
            verified = False
        else:
            try:
                verified = verify_project(p, qc_dir, protocol)
            except Exception as ex:
                logging.critical("Error: %s" % ex)
                verified = False
        if not verified:
            print("Verification: FAILED")
            if not args.force:
                retval = 1
                continue
            else:
                print("--force specified, ignoring previous errors")
        else:
            print("Verification: OK")
            if args.verify:
                # Verification-only mode: don't queue for reporting
                continue
        report_projects.append(p)
    # Generate QC report
    if report_projects:
        # Set defaults from primary project
        p = report_projects[0]
        qc_base = os.path.basename(p.qc_dir)
        # Filename and location for report
        if args.filename is None:
            out_file = '%s_report.html' % qc_base
        else:
            out_file = args.filename
        if not os.path.isabs(out_file):
            out_file = os.path.join(p.dirn, out_file)
        out_dir = os.path.dirname(out_file)
        # MultiQC report
        if args.multiqc:
            multiqc_report = os.path.join(out_dir,
                                          "multi%s_report.html" % qc_base)
            # Check if we need to rerun MultiQC: only rerun if any
            # QC input is newer than the existing report
            if os.path.exists(multiqc_report) and not args.force:
                run_multiqc = False
                for p in report_projects:
                    multiqc_mtime = os.path.getmtime(multiqc_report)
                    for f in os.listdir(p.qc_dir):
                        if os.path.getmtime(os.path.join(p.qc_dir, f)) > \
                           multiqc_mtime:
                            # Input is newer than report
                            run_multiqc = True
                            break
            else:
                run_multiqc = True
            # (Re)run MultiQC
            if run_multiqc:
                multiqc_cmd = Command('multiqc',
                                      '--title', '%s' % args.title,
                                      '--filename', '%s' % multiqc_report,
                                      '--force')
                for p in report_projects:
                    multiqc_cmd.add_args(p.qc_dir)
                print("\nRunning %s" % multiqc_cmd)
                multiqc_retval = multiqc_cmd.run_subprocess()
                if multiqc_retval == 0 and os.path.exists(multiqc_report):
                    print("MultiQC: %s\n" % multiqc_report)
                else:
                    print("MultiQC: FAILED")
                    retval += 1
            else:
                print("MultiQC: %s (already exists)\n" % multiqc_report)
        # Create data directory? (default: only for multi-project
        # reports, unless overridden on the command line)
        use_data_dir = (len(projects) > 1)
        if args.use_data_dir:
            use_data_dir = True
        elif args.no_data_dir:
            use_data_dir = False
        # Generate report (return value was previously bound to an
        # unused local, dropped here)
        report(report_projects,
               title=args.title,
               filename=out_file,
               relative_links=True,
               use_data_dir=use_data_dir,
               make_zip=args.zip)
        print("Wrote QC report to %s" % out_file)
    # Finish with appropriate exit code
    print("%s completed: exit code %s (%s)" %
          (os.path.basename(sys.argv[0]),
           retval,
           ('ok' if retval == 0 else 'error')))
    sys.exit(retval)
def bcl_to_fastq_10x_chromium_sc_atac(ap,
                                      output_dir,
                                      sample_sheet,
                                      primary_data_dir,
                                      lanes=None,
                                      bases_mask=None,
                                      cellranger_jobmode=None,
                                      cellranger_maxjobs=None,
                                      cellranger_mempercore=None,
                                      cellranger_jobinterval=None,
                                      cellranger_localcores=None,
                                      cellranger_localmem=None,
                                      log_dir=None):
    """
    Generate FASTQ files for 10xGenomics single-cell ATAC-seq run

    Performs FASTQ generation from raw BCL files produced by an
    Illumina sequencer using the 10xGenomics Chromium single-cell
    (sc) ATAC-seq protocol, by running 'cellranger-atac mkfastq'.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to create Fastqs for
      output_dir (str): output directory for bcl-to-fastq conversion
      sample_sheet (str): path to input sample sheet file
      primary_data_dir (str): path to the top-level directory holding
        the sequencing data
      lanes (list): if set then specifies a list of lane numbers
        (integers) to pass through to 'cellranger-atac mkfastq'
        (default is to use all lanes)
      bases_mask (str): if set then use this as an alternative bases
        mask setting (default is to acquire from the autoprocessor
        parameters)
      cellranger_jobmode (str): job mode to pass through to
        'run_cellranger_mkfastq'
      cellranger_maxjobs (int): maximum number of concurrent jobs,
        passed through to 'run_cellranger_mkfastq'
      cellranger_mempercore (int): memory assumed per core, passed
        through to 'run_cellranger_mkfastq'
      cellranger_jobinterval (int): interval between job submissions,
        passed through to 'run_cellranger_mkfastq'
      cellranger_localcores (int): maximum cores in 'local' jobmode,
        passed through to 'run_cellranger_mkfastq'
      cellranger_localmem (int): maximum memory in 'local' jobmode,
        passed through to 'run_cellranger_mkfastq'
      log_dir (str): directory to copy the sample sheet into and to
        pass through to 'run_cellranger_mkfastq' (assumed to be an
        existing directory -- TODO confirm callers always supply one)

    Returns:
      Integer: the value returned by 'run_cellranger_mkfastq'.

    Raises:
      Exception: if the bases mask is invalid, required software
        ('cellranger-atac', bcl2fastq >= 2.17) cannot be located,
        or 'cellranger-atac mkfastq' itself fails.
    """
    # Load run data
    illumina_run = IlluminaData.IlluminaRun(primary_data_dir,
                                            platform=ap.metadata.platform)
    # Deal with bases mask: fall back to the stored parameter if
    # none was supplied explicitly
    if bases_mask is None:
        bases_mask = ap.params.bases_mask
    if bases_mask == 'auto':
        # Update bases mask to only use first 8 bases from
        # first index e.g. I8nnnnnnnn and convert second index
        # to read e.g. Y16
        print "Determining bases mask from RunInfo.xml"
        bases_mask = get_bases_mask_10x_atac(illumina_run.runinfo_xml)
        print "Bases mask: %s (updated for 10x scATAC-seq)" % bases_mask
    # Validate whichever mask we ended up with (supplied, stored or
    # derived)
    if not bases_mask_is_valid(bases_mask):
        raise Exception("Invalid bases mask: '%s'" % bases_mask)
    # Check we have cellranger-atac
    cellranger_atac = find_program('cellranger-atac')
    if not cellranger_atac:
        raise Exception("No cellranger package found")
    cellranger_package_info = cellranger_info(cellranger_atac)
    print "Using cellranger-atac %s: %s" % \
        (cellranger_package_info[-1], cellranger_atac)
    # Check we have bcl2fastq (cellranger-atac drives bcl2fastq
    # under the hood, so a suitable version must be on the path)
    bcl2fastq = find_program('bcl2fastq')
    if not bcl2fastq:
        raise Exception("No bcl2fastq package found")
    bcl2fastq = available_bcl2fastq_versions(
        paths=(os.path.dirname(bcl2fastq), ),
        reqs='>=2.17')
    if not bcl2fastq:
        raise Exception("No appropriate bcl2fastq software "
                        "located")
    bcl2fastq = bcl2fastq[0]
    bcl2fastq_info = bcl_to_fastq_info(bcl2fastq)
    print "Using bcl2fastq %s: %s" % (bcl2fastq_info[-1],
                                      bcl2fastq)
    # Store info on bcl2fastq package
    ap.metadata['bcl2fastq_software'] = bcl2fastq_info
    # Store info on cellranger package
    ap.metadata['cellranger_software'] = cellranger_package_info
    # Put a copy of sample sheet in the log directory
    shutil.copy(sample_sheet, log_dir)
    # Determine output directory absolute path
    if not os.path.isabs(output_dir):
        output_dir = os.path.join(ap.analysis_dir, output_dir)
    # Working directory (set to analysis dir)
    working_dir = ap.analysis_dir
    # Report values and settings
    print "Cellranger-atac exe : %s" % cellranger_atac
    print "Cellranger-atac version: %s %s" % (cellranger_package_info[1],
                                              cellranger_package_info[2])
    print "Bcl-to-fastq exe : %s" % bcl2fastq
    print "Bcl-to-fastq version : %s %s" % (bcl2fastq_info[1],
                                            bcl2fastq_info[2])
    print "Sample sheet : %s" % os.path.basename(sample_sheet)
    print "Bases mask : %s" % bases_mask
    print "Cellranger jobmode : %s" % cellranger_jobmode
    print "Cellranger maxjobs : %s" % cellranger_maxjobs
    print "Cellranger mempercore : %s" % cellranger_mempercore
    print "Cellranger jobinterval : %s" % cellranger_jobinterval
    print "Cellranger localcores : %s" % cellranger_localcores
    print "Cellranger localmem : %s" % cellranger_localmem
    print "Working directory : %s" % working_dir
    print "Log directory : %s" % log_dir
    # Run cellranger-atac mkfastq (lanes are passed as a
    # comma-separated string, or None for all lanes)
    try:
        return run_cellranger_mkfastq(
            sample_sheet=sample_sheet,
            primary_data_dir=primary_data_dir,
            output_dir=output_dir,
            lanes=(None if lanes is None
                   else ','.join([str(l) for l in lanes])),
            bases_mask=bases_mask,
            cellranger_exe=cellranger_atac,
            cellranger_jobmode=cellranger_jobmode,
            cellranger_maxjobs=cellranger_maxjobs,
            cellranger_mempercore=cellranger_mempercore,
            cellranger_jobinterval=cellranger_jobinterval,
            cellranger_localcores=cellranger_localcores,
            cellranger_localmem=cellranger_localmem,
            working_dir=working_dir,
            log_dir=log_dir)
    except Exception as ex:
        raise Exception("'cellranger-atac mkfastq' failed: "
                        "'%s'" % ex)
def make_fastqs(ap, protocol='standard', platform=None, unaligned_dir=None, sample_sheet=None, lanes=None, ignore_missing_bcl=False, ignore_missing_stats=False, skip_rsync=False, remove_primary_data=False, nprocessors=None, require_bcl2fastq_version=None, bases_mask=None, no_lane_splitting=None, minimum_trimmed_read_length=None, mask_short_adapter_reads=None, generate_stats=True, stats_file=None, per_lane_stats_file=None, analyse_barcodes=True, barcode_analysis_dir=None, skip_fastq_generation=False, only_fetch_primary_data=False, create_empty_fastqs=None, runner=None, cellranger_jobmode=None, cellranger_mempercore=None, cellranger_maxjobs=None, cellranger_jobinterval=None, cellranger_localcores=None, cellranger_localmem=None, cellranger_ignore_dual_index=False): """Create and summarise FASTQ files Wrapper for operations related to FASTQ file generation and analysis. The operations are typically: - get primary data (BCL files) - run bcl-to-fastq conversion - generate statistics If the number of processors and the job runner are not explicitly specified then these are taken from the settings for the bcl2fastq and the statistics generation steps, which may differ from each other. However if either of these values are set explicitly then the same values will be used for both steps. Arguments: ap (AutoProcessor): autoprocessor pointing to the analysis directory to create Fastqs for protocol (str): if set then specifies the protocol to use for fastq generation, otherwise use the 'standard' bcl2fastq protocol platform (str): if set then specifies the sequencing platform (otherwise platform will be determined from the primary data) unaligned_dir (str): if set then use this as the output directory for bcl-to-fastq conversion. 
Default is 'bcl2fastq' (unless an alternative is already specified in the config file) sample_sheet (str): if set then use this as the input samplesheet lanes (list): (optional) specify a list of lane numbers to use in the processing; lanes not in the list will be excluded (default is to include all lanes) nprocessors (int) : number of processors to run bclToFastq.py with ignore_missing_bcl (bool): if True then run bcl2fastq with --ignore-missing-bcl ignore_missing_stats (bool): if True then run bcl2fastq with --ignore-missing-stats skip_rsync (bool): if True then don't rsync primary data at the start of bcl2fastq conversion remove_primary_data (bool): if True then remove primary data at the end of bcl2fastq conversion (default is to keep it) generate_stats (bool): if True then (re)generate statistics file for fastqs analyse_barcodes (bool): if True then (re)analyse barcodes for fastqs require_bcl2fastq_version (str): (optional) specify bcl2fastq version to use. Should be a string of the form '1.8.4' or '>2.0'. Set to None to automatically determine required bcl2fastq version. bases_mask (str): if set then use this as an alternative bases mask setting no_lane_splitting (bool): if True then run bcl2fastq with --no-lane-splitting minimum_trimmed_read_length (int): if set then specify minimum length for reads after adapter trimming (shorter reads will be padded with Ns to make them long enough) mask_short_adapter_reads (int): if set then specify the minimum length of ACGT bases that must be present in a read after adapter trimming for it not to be masked completely with Ns. stats_file (str): if set then use this as the name of the output per-fastq stats file. per_lane_stats_file (str): if set then use this as the name of the output per-lane stats file. 
barcode_analysis_dir (str): if set then specifies path to the output directory for barcode analysis skip_fastq_generation (bool): if True then don't perform fastq generation only_fetch_primary_data (bool): if True then fetch primary data, don't do anything else create_empty_fastqs (bool): if True then create empty 'placeholder' fastq files for any missing fastqs after bcl2fastq (must have completed with zero exit status) runner (JobRunner): (optional) specify a non-default job runner to use for fastq generation cellranger_jobmode (str): (optional) job mode to run cellranger in (10xGenomics Chromium SC data only) cellranger_mempercore (int): (optional) memory assumed per core (in Gbs) (10xGenomics Chromium SC data only) cellranger_maxjobs (int): (optional) maxiumum number of concurrent jobs to run (10xGenomics Chromium SC data only) cellranger_jobinterval (int): (optional) how often jobs are submitted (in ms) (10xGenomics Chromium SC data only) cellranger_localcores (int): (optional) maximum number of cores cellranger can request in jobmode 'local' (10xGenomics Chromium SC data only) cellranger_localmem (int): (optional) maximum memory cellranger can request in jobmode 'local' (10xGenomics Chromium SC data only) cellranger_ignore_dual_index (bool): (optional) on a dual-indexed flowcell where the second index was not used for the 10x sample, ignore it (10xGenomics Chromium SC data only) """ # Report protocol print "Protocol : %s" % protocol if protocol not in MAKE_FASTQS_PROTOCOLS: raise Exception("Unknown protocol: '%s' (must be one of " "%s)" % (protocol, ','.join([MAKE_FASTQS_PROTOCOLS]))) # Unaligned dir if unaligned_dir is not None: ap.params['unaligned_dir'] = unaligned_dir elif ap.params['unaligned_dir'] is None: ap.params['unaligned_dir'] = 'bcl2fastq' print "Output dir : %s" % ap.params.unaligned_dir # Sample sheet if sample_sheet is None: sample_sheet = ap.params.sample_sheet if not os.path.isabs(sample_sheet): sample_sheet = os.path.join(ap.analysis_dir, 
sample_sheet) if not os.path.isfile(sample_sheet): raise Exception("Missing sample sheet '%s'" % sample_sheet) ap.params['sample_sheet'] = sample_sheet print "Source sample sheet : %s" % ap.params.sample_sheet # Check requested lanes are actually present print "Lanes : %s" % ('all' if lanes is None else ','.join( [str(l) for l in lanes])) if lanes is not None: s = IlluminaData.SampleSheet(ap.params.sample_sheet) if not s.has_lanes: raise Exception("Requested subset of lanes but " "samplesheet doesn't contain any " "lane information") samplesheet_lanes = list(set([l['Lane'] for l in s])) for l in lanes: if l not in samplesheet_lanes: raise Exception("Requested lane '%d' not present " "in samplesheet" % l) # Make a temporary sample sheet if lanes: lanes_id = ".L%s" % ''.join([str(l) for l in lanes]) else: lanes_id = "" sample_sheet = os.path.join( ap.tmp_dir, "SampleSheet%s.%s.csv" % (lanes_id, time.strftime("%Y%m%d%H%M%S"))) make_custom_sample_sheet(ap.params.sample_sheet, sample_sheet, lanes=lanes) # Check the temporary sample sheet print "Checking temporary sample sheet" invalid_barcodes = SampleSheetLinter( sample_sheet_file=sample_sheet).has_invalid_barcodes() if invalid_barcodes: logger.error("Invalid barcodes detected") for line in invalid_barcodes: logger.critical("%s" % line) invalid_characters = SampleSheetLinter( sample_sheet_file=sample_sheet).has_invalid_characters() if invalid_characters: logger.critical("Invalid non-printing/non-ASCII characters " "detected") if invalid_barcodes or invalid_characters: raise Exception("Errors detected in generated sample sheet") # Adjust verification settings for 10xGenomics Chromium SC # data if necessary verify_include_sample_dir = False if has_chromium_sc_indices(sample_sheet): if protocol in ( '10x_chromium_sc', '10x_chromium_sc_atac', ): # Force inclusion of sample-name subdirectories # when verifying Chromium SC data print "Sample sheet includes Chromium SC indices" verify_include_sample_dir = True else: # 
Chromium SC indices detected but not using # 10x_chromium_sc protocol raise Exception("Detected 10xGenomics Chromium SC indices " "in generated sample sheet but protocol " "'%s' has been specified; use an " "appropriate '10x_...' protocol for these " "indices" % protocol) # Check for pre-existing Fastq outputs if verify_fastq_generation(ap, unaligned_dir=ap.params.unaligned_dir, lanes=lanes, include_sample_dir=verify_include_sample_dir): print "Expected Fastq outputs already present" skip_rsync = True skip_fastq_generation = True # Check if there's anything to do if (skip_rsync and skip_fastq_generation) and \ not (generate_stats or analyse_barcodes): print "Nothing to do" return # Log dir log_dir = 'make_fastqs' if protocol != 'standard': log_dir += "_%s" % protocol if lanes: log_dir += "_L%s" % ''.join([str(l) for l in sorted(lanes)]) ap.set_log_dir(ap.get_log_subdir(log_dir)) # Fetch primary data if not skip_rsync and not ap.params.acquired_primary_data: if get_primary_data(ap) != 0: logger.error("Failed to acquire primary data") raise Exception("Failed to acquire primary data") else: ap.params['acquired_primary_data'] = True if only_fetch_primary_data: return # Deal with platform information if not platform: platform = ap.metadata.platform # Do fastq generation using the specified protocol if not skip_fastq_generation: # Set primary data location and report info primary_data_dir = os.path.join(ap.params.primary_data_dir, os.path.basename(ap.params.data_dir)) print "Primary data dir : %s" % primary_data_dir try: illumina_run = IlluminaData.IlluminaRun(primary_data_dir, platform=platform) except IlluminaData.IlluminaDataPlatformError as ex: logger.critical("Error loading primary data: %s" % ex) if platform is None: logger.critical("Try specifying platform using --platform?") else: logger.critical("Check specified platform is valid (or " "omit --platform") raise Exception("Error determining sequencer platform") print "Platform : %s" % illumina_run.platform print 
"Bcl format : %s" % illumina_run.bcl_extension # Set platform in metadata ap.metadata['platform'] = illumina_run.platform # Bases mask if bases_mask is not None: ap.params['bases_mask'] = bases_mask bases_mask = ap.params.bases_mask print "Bases mask setting : %s" % bases_mask if protocol not in ( '10x_chromium_sc', '10x_chromium_sc_atac', ): if bases_mask == "auto": print "Determining bases mask from RunInfo.xml" bases_mask = get_bases_mask(illumina_run.runinfo_xml, sample_sheet) if not bases_mask_is_valid(bases_mask): raise Exception("Invalid bases mask: '%s'" % bases_mask) # Do fastq generation according to protocol if protocol == 'icell8': # ICell8 data # Update bcl2fastq settings appropriately print "Updating read trimming and masking for ICell8" minimum_trimmed_read_length = 21 mask_short_adapter_reads = 0 # Reset the default bases mask bases_mask = IlluminaData.IlluminaRunInfo( illumina_run.runinfo_xml).bases_mask bases_mask = get_icell8_bases_mask(bases_mask, sample_sheet=sample_sheet) if not bases_mask_is_valid(bases_mask): raise Exception("Invalid bases mask: '%s'" % bases_mask) # Switch to standard protocol protocol = 'standard' if protocol == 'standard': # Standard protocol try: exit_code = bcl_to_fastq( ap, unaligned_dir=ap.params.unaligned_dir, sample_sheet=sample_sheet, primary_data_dir=primary_data_dir, require_bcl2fastq=require_bcl2fastq_version, bases_mask=bases_mask, ignore_missing_bcl=ignore_missing_bcl, ignore_missing_stats=ignore_missing_stats, no_lane_splitting=no_lane_splitting, minimum_trimmed_read_length=minimum_trimmed_read_length, mask_short_adapter_reads=mask_short_adapter_reads, nprocessors=nprocessors, runner=runner) except Exception as ex: raise Exception("Bcl2fastq stage failed: '%s'" % ex) elif protocol == '10x_chromium_sc': # 10xGenomics Chromium SC if bases_mask == 'auto': bases_mask = None try: # Check we have cellranger cellranger = find_program('cellranger') if not cellranger: raise Exception("No cellranger package found") 
cellranger_software_info = cellranger_info(cellranger) print "Using cellranger %s: %s" % \ (cellranger_software_info[-1], cellranger) # Check we have bcl2fastq bcl2fastq = find_program('bcl2fastq') if not bcl2fastq: raise Exception("No bcl2fastq package found") bcl2fastq = available_bcl2fastq_versions( paths=(os.path.dirname(bcl2fastq), ), reqs='>=2.17') if not bcl2fastq: raise Exception("No appropriate bcl2fastq software " "located") bcl2fastq = bcl2fastq[0] bcl2fastq_info = bcl_to_fastq_info(bcl2fastq) print "Using bcl2fastq %s: %s" % (bcl2fastq_info[-1], bcl2fastq) # Store info on bcl2fastq package ap.metadata['bcl2fastq_software'] = bcl2fastq_info # Store info on cellranger package ap.metadata['cellranger_software'] = cellranger_software_info # Put a copy of sample sheet in the log directory shutil.copy(sample_sheet, ap.log_dir) # Determine output directory absolute path output_dir = ap.params.unaligned_dir if not os.path.isabs(output_dir): output_dir = os.path.join(ap.analysis_dir, output_dir) # Run cellranger mkfastq exit_code = run_cellranger_mkfastq( sample_sheet=sample_sheet, primary_data_dir=primary_data_dir, output_dir=output_dir, lanes=(None if lanes is None else ','.join( [str(l) for l in lanes])), bases_mask=bases_mask, cellranger_exe=cellranger, cellranger_jobmode=cellranger_jobmode, cellranger_maxjobs=cellranger_maxjobs, cellranger_mempercore=cellranger_mempercore, cellranger_jobinterval=cellranger_jobinterval, cellranger_localcores=cellranger_localcores, cellranger_localmem=cellranger_localmem, working_dir=ap.analysis_dir, log_dir=ap.log_dir) except Exception as ex: raise Exception("'cellranger mkfastq' stage failed: " "'%s'" % ex) # Turn off barcode analysis analyse_barcodes = False elif protocol == '10x_chromium_sc_atac': # 10xGenomics Chromium scATAC-seq exit_code = bcl_to_fastq_10x_chromium_sc_atac( ap, output_dir=ap.params.unaligned_dir, sample_sheet=sample_sheet, primary_data_dir=primary_data_dir, lanes=lanes, bases_mask=bases_mask, 
cellranger_jobmode=cellranger_jobmode, cellranger_maxjobs=cellranger_maxjobs, cellranger_mempercore=cellranger_mempercore, cellranger_jobinterval=cellranger_jobinterval, cellranger_localcores=cellranger_localcores, cellranger_localmem=cellranger_localmem, log_dir=ap.log_dir) # Turn off barcode analysis analyse_barcodes = False else: # Unknown protocol raise Exception("Unknown protocol '%s'" % protocol) # Check the outputs if exit_code != 0: raise Exception("Fastq generation finished with error: " "exit code %d" % exit_code) if not verify_fastq_generation( ap, lanes=lanes, include_sample_dir=verify_include_sample_dir): # Check failed logger.error("Failed to verify output Fastqs against " "sample sheet") # Try to load the data from unaligned dir try: illumina_data = IlluminaData.IlluminaData( ap.analysis_dir, unaligned_dir=ap.params.unaligned_dir) except IlluminaData.IlluminaDataError as ex: raise Exception("Unable to load data from %s: %s" % (ap.params.unaligned_dir, ex)) # Generate a list of missing Fastqs missing_fastqs = IlluminaData.list_missing_fastqs( illumina_data, sample_sheet, include_sample_dir=verify_include_sample_dir) assert (len(missing_fastqs) > 0) missing_fastqs_file = os.path.join(ap.log_dir, "missing_fastqs.log") print "Writing list of missing Fastq files to %s" % \ missing_fastqs_file with open(missing_fastqs_file, 'w') as fp: for fq in missing_fastqs: fp.write("%s\n" % fq) # Create empty FASTQs if create_empty_fastqs is None: try: create_empty_fastqs = \ ap.settings.platform[ap.metadata.platform].\ create_empty_fastqs except (KeyError, AttributeError): pass if create_empty_fastqs is None: create_empty_fastqs = \ ap.settings.bcl2fastq.create_empty_fastqs if create_empty_fastqs: logger.warning("Making 'empty' placeholder Fastqs") for fq in missing_fastqs: fastq = os.path.join(ap.analysis_dir, ap.params.unaligned_dir, fq) print "-- %s" % fastq if not os.path.exists(os.path.dirname(fastq)): mkdirs(os.path.dirname(fastq)) with 
gzip.GzipFile(filename=fastq, mode='wb') as fp: fp.write('') else: raise Exception("Fastq generation failed to produce " "expected outputs") # Generate statistics if generate_stats: fastq_statistics(ap, stats_file=stats_file, per_lane_stats_file=per_lane_stats_file, unaligned_dir=ap.params.unaligned_dir, nprocessors=nprocessors, runner=runner) # Run barcode analysis if analyse_barcodes: # Determine output directory if barcode_analysis_dir is not None: ap.params['barcode_analysis_dir'] = barcode_analysis_dir elif ap.params.barcode_analysis_dir is None: ap.params['barcode_analysis_dir'] = 'barcode_analysis' barcode_analysis_dir = ap.params.barcode_analysis_dir if not os.path.isabs(barcode_analysis_dir): barcode_analysis_dir = os.path.join(ap.params.analysis_dir, barcode_analysis_dir) # Report title title = "Barcode analysis for %s" % ap.metadata.run_name # Log file log_file = os.path.join(ap.log_dir, "analyse_barcodes.log") # Set up runner if runner is None: runner = ap.settings.general.default_runner runner.set_log_dir(ap.log_dir) # Get scheduler parameters max_jobs = ap.settings.general.max_concurrent_jobs poll_interval = ap.settings.general.poll_interval # Create and run barcode analysis pipeline barcode_analysis = AnalyseBarcodes( os.path.join(ap.params.analysis_dir, ap.params.unaligned_dir)) barcode_analysis.run(barcode_analysis_dir, title=title, lanes=lanes, sample_sheet=sample_sheet, log_file=log_file, runner=runner, max_jobs=max_jobs, poll_interval=poll_interval, verbose=False) # Make a 'projects.info' metadata file if lanes: ap.update_project_metadata_file() else: ap.make_project_metadata_file() # Remove primary data if remove_primary_data: remove_primary_data(ap)