def find_samples(path, sample=None, pattern="-bcbb-config.yaml$", only_failed=False, **kw):
    """Find bcbb config files in a path.

    :param path: path to search in
    :param sample: a specific sample, or a file listing -bcbb-config.yaml files
    :param pattern: pattern to search for
    :param only_failed: if True, keep only samples whose status is "FAIL"
    :keyword exclude_dirs: directories to exclude from the walk
    :keyword include_dirs: directories to restrict the walk to

    :returns: list of absolute file names
    """
    def bcbb_yaml_filter(f):
        # Closes over 'pattern', which may be narrowed to a specific
        # sample below before the filter is first called.
        return re.search(pattern, f) is not None

    flist = []
    if sample:
        if os.path.exists(sample):
            # 'sample' is a file listing config files (or sample directories)
            with open(sample) as fh:
                samplelist = fh.readlines()
            flist = [x.rstrip() for x in samplelist if re.search(pattern, x)]
            if len(flist) == 0:
                # No entries matched the pattern; treat entries as directories
                flist = [os.path.join(path, x.rstrip()) for x in samplelist if len(x) > 1]
            # Make sure there actually is a config file in path
            flist = list(chain.from_iterable(
                [filtered_walk(x, bcbb_yaml_filter,
                               exclude_dirs=kw.get("exclude_dirs"),
                               include_dirs=kw.get("include_dirs")) for x in flist]))
            if len(flist) == 0:
                return flist
        else:
            # Narrow the search pattern to the named sample
            pattern = "{}{}".format(sample, pattern)
    if not flist:
        flist = filtered_walk(path, bcbb_yaml_filter,
                              exclude_dirs=kw.get("exclude_dirs"),
                              include_dirs=kw.get("include_dirs"))
    if only_failed:
        # Compute each sample's status exactly once (previously computed
        # twice per file and the first result was discarded).
        flist = [x for x in flist if _sample_status(x) == "FAIL"]
    if len(flist) == 0 and sample:
        LOG.info("No such sample {}".format(sample))
    return [os.path.abspath(f) for f in flist]
def purge_alignments(self):
    """Cleanup sam and bam files. In some cases, sam files persist. If
    the corresponding bam file exists, replace the sam file contents with
    a message that the file has been removed to save space.
    """
    pattern = ".sam$"
    def purge_filter(f):
        # Empty pattern disables the filter (returns None, i.e. falsy).
        if not pattern:
            return
        # NOTE: closes over the local 'pattern', so rebinding it below
        # retargets this same filter to bam files.
        return re.search(pattern, f) != None
    flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), purge_filter)
    if len(flist) == 0:
        self.app.log.info("No sam files found")
        return
    # Confirm before destructive action unless --force was given.
    if len(flist) > 0 and not query_yes_no("Going to remove/cleanup {} sam files ({}...). Are you sure you want to continue?".format(len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
        return
    for f in flist:
        self.app.log.info("Purging sam file {}".format(f))
        self.app.cmd.safe_unlink(f)
        # If a converted bam exists, leave a placeholder file behind.
        if os.path.exists(f.replace(".sam", ".bam")):
            self.app.cmd.write(f, "File removed to save disk space: SAM converted to BAM")
    ## Find bam files in alignments subfolders
    pattern = ".bam$"
    flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), purge_filter, include_dirs=["alignments"])
    for f in flist:
        # A bam is redundant if a "-sort" sibling or a copy in the parent
        # directory exists.
        f_tgt = [f.replace(".bam", "-sort.bam"), os.path.join(os.path.dirname(os.path.dirname(f)),os.path.basename(f) )]
        for tgt in f_tgt:
            if os.path.exists(tgt):
                self.app.log.info("Purging bam file {}".format(f))
                self.app.cmd.safe_unlink(f)
                self.app.cmd.write(f, "File removed to save disk space: Moved to {}".format(os.path.abspath(tgt)))
def remove_finished(self):
    """Remove files from sample directories marked as finished.

    Walks every sample directory under the project path; directories
    lacking FINISHED_FILE or already containing REMOVED_FILE are skipped.
    The FINISHED_FILE marker itself is kept, and a REMOVED_FILE marker
    with a UTC timestamp is written afterwards (unless --dry_run).
    """
    if not self._check_pargs(["project"]):
        return
    # Don't filter out files
    def filter_fn(f):
        return True
    slist = os.listdir(os.path.join(self._meta.root_path, self._meta.path_id))
    for s in slist:
        spath = os.path.join(self._meta.root_path, self._meta.path_id, s)
        if not os.path.isdir(spath):
            continue
        if not os.path.exists(os.path.join(spath, FINISHED_FILE)):
            self.app.log.info("Sample {} not finished; skipping".format(s))
            continue
        flist = filtered_walk(spath, filter_fn)
        dlist = filtered_walk(spath, filter_fn, get_dirs=True)
        if os.path.exists(os.path.join(spath, REMOVED_FILE)):
            self.app.log.info("Sample {} already removed; skipping".format(s))
            continue
        if len(flist) > 0 and not query_yes_no("Will remove directory {} containing {} files; continue?".format(s, len(flist)), force=self.pargs.force):
            continue
        self.app.log.info("Removing {} files from {}".format(len(flist), spath))
        for f in flist:
            # Keep the FINISHED_FILE marker
            if f == os.path.join(spath, FINISHED_FILE):
                continue
            self.app.cmd.safe_unlink(f)
        self.app.log.info("Removing {} directories from {}".format(len(dlist), spath))
        # Reverse sort removes deepest directories first
        for d in sorted(dlist, reverse=True):
            self.app.cmd.safe_rmdir(d)
        if not self.pargs.dry_run:
            with open(os.path.join(spath, REMOVED_FILE), "w") as fh:
                t_utc = utc_time()
                fh.write(t_utc)
def remove_files(f, **kw):
    """Remove analysis files in the working directory of config file *f*.

    Everything in the directory that does not match one of the keep
    patterns (config/command files, fastq data, numbered log files,
    JOBID/PID markers) is removed after user confirmation.

    :param f: a sample config file; its dirname is the working directory
    :keyword force: skip the confirmation prompt
    :keyword dry_run: pass dry_run through to the unlink/rmdir helpers
    """
    ## Remove old files if requested
    keep_files = [
        "-post_process.yaml$", "-post_process.yaml.bak$",
        "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",
        "-bcbb-command.txt$", "-bcbb-command.txt.bak$",
        "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$",
        "_[0-9]+_fastq.txt.gz$", "_[0-9]+_fastq.txt$",
        "^[0-9][0-9]_.*.txt$", "JOBID", "PID"
    ]
    pattern = "|".join(keep_files)

    def remove_filter_fn(name):
        # Select files that match none of the keep patterns
        return re.search(pattern, name) is None

    workdir = os.path.dirname(f)
    # Renamed locals: previously shadowed this function's own name.
    doomed_files = filtered_walk(workdir, remove_filter_fn)
    doomed_dirs = filtered_walk(workdir, remove_filter_fn, get_dirs=True)
    if len(doomed_files) > 0 and query_yes_no(
            "Going to remove {} files and {} directories... Are you sure you want to continue?"
            .format(len(doomed_files), len(doomed_dirs)), force=kw['force']):
        for x in doomed_files:
            dry_unlink(x, dry_run=kw['dry_run'])
        ## Sort directories by length so we don't accidentally try to remove a non-empty dir
        for x in sorted(doomed_dirs, key=len, reverse=True):
            dry_rmdir(x, dry_run=kw['dry_run'])
def test_filtered_walk_get_dirs(self):
    """Perform a filtered walk of data dir, getting dirs"""
    # With get_dirs=True no directories survive the include/exclude filters.
    dirs = filtered_walk("data", filter_fn=self.filter_fn, include_dirs=["nophix"], exclude_dirs=["fastqc"], get_dirs=True)
    self.assertEqual(set(dirs), set([]))
    # With get_dirs=False the same restrictions yield a single file.
    files = filtered_walk("data", filter_fn=self.filter_fn, include_dirs=["nophix"], exclude_dirs=["fastqc"], get_dirs=False)
    self.assertEqual(set(files), set(['data/nophix/file1.txt']))
def hs_metrics(self):
    """Run Picard CalculateHsMetrics on bam files in the production area.

    Walks the flowcell (or project) directory for bam files matching the
    requested hs_file_type and runs CalculateHsMetrics on each, writing
    a .hs_metrics file next to every bam.
    """
    if not self._check_pargs(["project", "region_file"]):
        return
    if not self.pargs.bait_file:
        # Default bait intervals to the target region file
        self.pargs.bait_file = self.pargs.region_file
    self.log.info("hs_metrics: This is a temporary solution for calculating hs metrics for samples using picard tools")
    pattern = "{}.bam$".format(self.pargs.hs_file_type)
    def filter_fn(f):
        return re.search(pattern, f) != None
    ### FIX ME: this isn't caught by _process_args
    path = self.pargs.flowcell if self.pargs.flowcell else self.pargs.project
    flist = filtered_walk(os.path.join(self.config.get("production", "root"), path), filter_fn=filter_fn, exclude_dirs=['nophix', 'alignments', 'fastqc', 'fastq_screen'])
    if self.pargs.input_file:
        # An explicit input file overrides the walk results entirely
        flist = [os.path.abspath(self.pargs.input_file)]
    if not query_yes_no("Going to run hs_metrics on {} files. Are you sure you want to continue?".format(len(flist)), force=self.pargs.force):
        return
    for f in flist:
        self.log.info("running CalculateHsMetrics on {}".format(f))
        ### Issue with calling java from
        ### subprocess:http://stackoverflow.com/questions/9795249/issues-with-wrapping-java-program-with-pythons-subprocess-module
        ### Actually not an issue: command line arguments have to be done the right way
        cl = ["java"] + ["-{}".format(self.pargs.java_opts)] + ["-jar", "{}/CalculateHsMetrics.jar".format(os.getenv("PICARD_HOME"))] + ["INPUT={}".format(f)] + ["TARGET_INTERVALS={}".format(os.path.abspath(self.pargs.region_file))] + ["BAIT_INTERVALS={}".format(os.path.abspath(self.pargs.bait_file))] + ["OUTPUT={}".format(f.replace(".bam", ".hs_metrics"))] + ["VALIDATION_STRINGENCY=SILENT"]
        out = self.app.cmd.command(cl)
        if out:
            self.app._output_data["stdout"].write(out.rstrip())
def best_practice(self):
    """Deliver best-practice analysis results to an uppmax project INBOX.

    For each sample config found, copies yaml/metrics (and optionally
    bam/vcf) files to the delivery path, optionally summing their size.
    """
    if not self._check_pargs(["project", "uppmax_project"]):
        return
    project_path = os.path.normpath(
        os.path.join("/proj", self.pargs.uppmax_project))
    if not os.path.exists(project_path):
        self.log.warn("No such project {}; skipping".format(
            self.pargs.uppmax_project))
        return
    # Delivery target: explicit outdir > statusdb project name > project name
    if self.pargs.outdir:
        outpath = os.path.join(project_path, "INBOX", self.pargs.outdir)
    else:
        outpath = os.path.join(
            project_path, "INBOX", self.pargs.statusdb_project_name
        ) if self.pargs.statusdb_project_name else os.path.join(
            project_path, "INBOX", self.pargs.project)
    if not query_yes_no(
            "Going to deliver data to {}; continue?".format(outpath)):
        return
    if not os.path.exists(outpath):
        self.app.cmd.safe_makedir(outpath)
    kw = vars(self.pargs)
    basedir = os.path.abspath(
        os.path.join(self._meta.root_path, self._meta.path_id))
    flist = find_samples(basedir, **vars(self.pargs))
    if not len(flist) > 0:
        self.log.info("No samples/sample configuration files found")
        return
    def filter_fn(f):
        # 'pattern' is assigned below, before this filter is first called
        # (late-binding closure).
        if not pattern:
            return
        return re.search(pattern, f) != None
    # Setup pattern
    plist = [".*.yaml$", ".*.metrics$"]
    if not self.pargs.no_bam:
        plist.append(".*-{}.bam$".format(self.pargs.bam_file_type))
        plist.append(".*-{}.bam.bai$".format(self.pargs.bam_file_type))
    if not self.pargs.no_vcf:
        plist.append(".*.vcf$")
        plist.append(".*.vcf.gz$")
        plist.append(".*.tbi$")
        plist.append(".*.tsv$")
    pattern = "|".join(plist)
    size = 0
    for f in flist:
        path = os.path.dirname(f)
        sources = filtered_walk(path, filter_fn=filter_fn, exclude_dirs=BCBIO_EXCLUDE_DIRS)
        # Mirror the source tree layout under the delivery path
        targets = [src.replace(basedir, outpath) for src in sources]
        self._transfer_files(sources, targets)
        if self.pargs.size:
            statinfo = [os.stat(src).st_size for src in sources]
            size = size + sum(statinfo)
    if self.pargs.size:
        self.app._output_data['stderr'].write(
            "\n********************************\nEstimated delivery size: {:.1f}G\n********************************"
            .format(size / 1e9))
def get_file_copy_list(proj_base_dir, dest_proj_path, fcid, deliver_all_fcs, deliver_nophix, skip_list):
    """Build the list of fastq files to deliver for a project.

    :param proj_base_dir: project directory to walk for fastq files
    :param dest_proj_path: delivery destination root
    :param fcid: flowcell id used to restrict the walk when not delivering all
    :param deliver_all_fcs: if True, walk all flowcells, not only fcid
    :param deliver_nophix: if True, deliver files from "nophix" subdirs only
    :param skip_list: directories to exclude from the walk

    :returns: list of [source_file, dest_run_path, dest_file_name] triples
    """
    to_copy = []
    for fqfile in filtered_walk(
            proj_base_dir,
            is_fastq,
            include_dirs=[fcid] if not deliver_all_fcs else None,
            exclude_dirs=skip_list
    ):
        # Get the run_name and sample_name from the path
        # (assumes layout <sample>/<run>/... under proj_base_dir — TODO confirm)
        sample_name, run_name, _ = os.path.relpath(fqfile, proj_base_dir).split(os.sep, 2)
        date, fc_id = run_name.split("_")
        # Skip if we deliver from nophix and the parent dir is not nophix (or vice versa)
        pdir = os.path.basename(os.path.dirname(fqfile))
        if deliver_nophix and pdir != "nophix":
            continue
        if not deliver_nophix and pdir != run_name:
            continue
        # Skip if a compressed version of the current file exists
        if os.path.exists("{:s}.gz".format(fqfile)):
            print (
                "WARNING: Both compressed and non-compressed versions of {:s} exists! "
                "Is compression/decompression in progress? Will deliver compressed version "
                "but you should make sure that the delivered files are complete!".format(fqfile)
            )
            continue
        print ("DEBUG: source_delivery_path = {:s}".format(os.path.dirname(fqfile)))
        fname = os.path.basename(fqfile)
        print (fname)
        dest_run_path = os.path.join(dest_proj_path, sample_name, run_name)
        dest_file_name = create_final_name(fname, date, fc_id, sample_name)
        to_copy.append([fqfile, dest_run_path, dest_file_name])
    return to_copy
def test_filtered_walk_include_exclude(self):
    """Perform a filtered walk of data dir, using include_dirs and exclude_dirs restriction"""
    hits = filtered_walk("data", filter_fn=self.filter_fn, include_dirs=["nophix"], exclude_dirs=["fastqc"])
    expected = set(['data/nophix/file1.txt'])
    self.assertEqual(set(hits), expected)
def hs_metrics(self):
    """Run Picard CalculateHsMetrics on bam files for found samples.

    Restricts the walk to directories of samples located via
    find_samples, then runs CalculateHsMetrics on each matching bam.
    """
    if not self._check_pargs(["project", "targets"]):
        return
    if not self.pargs.baits:
        # Default bait intervals to the target intervals
        self.pargs.baits = self.pargs.targets
    self.log.info("hs_metrics: This is a temporary solution for calculating hs metrics for samples using picard tools")
    pattern = "{}.bam$".format(self.pargs.hs_file_type)
    def filter_fn(f):
        return re.search(pattern, f) != None
    ### FIX ME: this isn't caught by _process_args
    flist = []
    path = self.pargs.flowcell if self.pargs.flowcell else self.pargs.project
    basedir = os.path.abspath(os.path.join(self.app.controller._meta.root_path, self.app.controller._meta.path_id))
    samples = find_samples(basedir, **vars(self.pargs))
    # Only walk directories that actually contain a sample config
    inc_dirs = [os.path.dirname(x) for x in samples]
    flist = filtered_walk(os.path.join(self.config.get(self.app.controller._meta.label, "root"), path), filter_fn=filter_fn, exclude_dirs=['nophix', 'alignments', 'fastqc', 'fastq_screen'], include_dirs=inc_dirs)
    if not query_yes_no("Going to run hs_metrics on {} files. Are you sure you want to continue?".format(len(flist)), force=self.pargs.force):
        return
    for f in flist:
        self.log.info("running CalculateHsMetrics on {}".format(f))
        ### Issue with calling java from
        ### subprocess:http://stackoverflow.com/questions/9795249/issues-with-wrapping-java-program-with-pythons-subprocess-module
        ### Actually not an issue: command line arguments have to be done the right way
        cl = ["java"] + ["-{}".format(self.pargs.java_opts)] + ["-jar", "{}/CalculateHsMetrics.jar".format(os.getenv("PICARD_HOME"))] + ["INPUT={}".format(f)] + ["TARGET_INTERVALS={}".format(os.path.abspath(self.pargs.targets))] + ["BAIT_INTERVALS={}".format(os.path.abspath(self.pargs.baits))] + ["OUTPUT={}".format(f.replace(".bam", ".hs_metrics"))] + ["VALIDATION_STRINGENCY=SILENT"]
        out = self.app.cmd.command(cl)
        if out:
            self.app._output_data["stdout"].write(out.rstrip())
def remove_files(f, **kw):
    """Remove analysis files in the working directory of config file *f*.

    Everything in the directory that does not match one of the keep
    patterns (config/command files, fastq data, numbered log files,
    JOBID/PID markers) is removed after user confirmation.

    :param f: a sample config file; its dirname is the working directory
    :keyword force: skip the confirmation prompt
    :keyword dry_run: pass dry_run through to the unlink/rmdir helpers
    """
    ## Remove old files if requested
    keep_files = ["-post_process.yaml$", "-post_process.yaml.bak$",
                  "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",
                  "-bcbb-command.txt$", "-bcbb-command.txt.bak$",
                  "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$",
                  "_[0-9]+_fastq.txt.gz$", "_[0-9]+_fastq.txt$",
                  "^[0-9][0-9]_.*.txt$", "JOBID", "PID"]
    pattern = "|".join(keep_files)

    def remove_filter_fn(name):
        # Select files that match none of the keep patterns
        return re.search(pattern, name) is None

    workdir = os.path.dirname(f)
    # Renamed locals: previously shadowed this function's own name.
    doomed_files = filtered_walk(workdir, remove_filter_fn)
    doomed_dirs = filtered_walk(workdir, remove_filter_fn, get_dirs=True)
    if len(doomed_files) > 0 and query_yes_no("Going to remove {} files and {} directories... Are you sure you want to continue?".format(len(doomed_files), len(doomed_dirs)), force=kw['force']):
        for x in doomed_files:
            dry_unlink(x, dry_run=kw['dry_run'])
        ## Sort directories by length so we don't accidentally try to remove a non-empty dir
        for x in sorted(doomed_dirs, key=len, reverse=True):
            dry_rmdir(x, dry_run=kw['dry_run'])
def _to_casava_structure(self, fc):
    """Transfer a flowcell's sample files into a casava-like project layout.

    For each sample: prunes sequence files, creates output directories,
    rewrites file paths in a per-sample flowcell subset, transfers files,
    writes a -bcbb-pm-config.yaml, and prunes platform args in any
    -post_process.yaml found in the transferred data. Finally writes a
    transfer summary to stderr.
    """
    transfer_status = {}
    outdir_pfx = os.path.abspath(os.path.join(self.app.config.get("project", "root"), self.pargs.project, "data"))
    if self.pargs.transfer_dir:
        # An explicit transfer dir overrides the project-name destination
        outdir_pfx = os.path.abspath(
            os.path.join(self.app.config.get("project", "root"), self.pargs.transfer_dir, "data")
        )
    for sample in fc:
        key = "{}_{}".format(sample["lane"], sample["sequence"])
        sources = {"files": self._prune_sequence_files(sample["files"]), "results": sample["results"]}
        outdir = os.path.join(outdir_pfx, sample["name"], fc.fc_id())
        dirs = {
            "data": os.path.abspath(os.path.join(outdir_pfx, sample["name"], fc.fc_id())),
            "intermediate": os.path.abspath(os.path.join(outdir_pfx, sample["name"], fc.fc_id())),
        }
        self._make_output_dirs(dirs)
        fc_new = fc.subset("lane", sample["lane"]).subset("name", sample["name"])
        # Rewrite source paths so they point into the destination tree
        targets = {
            "files": [src.replace(fc.path, dirs["data"]) for src in sources["files"]],
            "results": [src.replace(fc.path, dirs["intermediate"]) for src in sources["results"]],
        }
        fc_new.lane_files = dict(
            (k, [os.path.join(outdir, os.path.basename(x)) for x in v]) for k, v in fc_new.lane_files.items()
        )
        fc_new.set_entry(key, "files", targets["files"])
        fc_new.set_entry(key, "results", targets["results"])
        ## Copy sample files - currently not doing lane files
        self._transfer_files(sources, targets)
        self.app.cmd.write(
            os.path.join(dirs["data"], "{}-bcbb-pm-config.yaml".format(sample["name"])), fc_new.as_yaml()
        )
        transfer_status[sample["name"]] = {"files": len(sources["files"]), "results": len(sources["results"])}
        ## Rewrite platform_args; only keep time, workdir, account, partition, outpath and jobname
        pattern = "-post_process.yaml$"
        def pp_yaml_filter(f):
            return re.search(pattern, f) != None
        ppfiles = filtered_walk(dirs["data"], pp_yaml_filter)
        for pp in ppfiles:
            self.app.log.debug("Rewriting platform args for {}".format(pp))
            with open(pp, "r") as fh:
                # NOTE(review): yaml.load without an explicit Loader is
                # unsafe on untrusted input — these are project-generated
                # configs, presumably trusted; confirm.
                conf = yaml.load(fh)
            if not conf:
                self.app.log.warn("No configuration for {}".format(pp))
                continue
            newconf = prune_pp_platform_args(conf)
            if newconf == conf:
                # Nothing pruned; leave the file untouched
                continue
            self.app.cmd.safe_unlink(pp)
            self.app.cmd.write(pp, yaml.safe_dump(newconf, default_flow_style=False, allow_unicode=True, width=1000))
    # Write transfer summary
    self.app._output_data["stderr"].write("Transfer summary\n")
    self.app._output_data["stderr"].write("{:<18}{:>18}{:>18}\n".format("Sample", "Transferred files", "Results"))
    # NOTE: iteritems() is Python 2 only
    for k, v in transfer_status.iteritems():
        self.app._output_data["stderr"].write("{:<18}{:>18}{:>18}\n".format(k, v["files"], v["results"]))
def test_filtered_walk(self):
    """Perform a filtered walk of data dir"""
    expected = set([
        'data/file1.txt',
        'data/alignments/file1.txt',
        'data/nophix/file1.txt',
        'data/nophix/fastqc/file1.txt',
        'data/fastqc/file1.txt',
        'data/fastqc/nophix/file1.txt'
    ])
    hits = filtered_walk("data", filter_fn=self.filter_fn)
    self.assertEqual(set(hits), expected)
def ls(self):
    """List files under the controller's path.

    With no path_id, lists the root path (filtered output). With
    registered file extensions, walks the path for matching files and
    writes them to stdout; otherwise falls back to a plain listing.
    """
    if self._meta.path_id == "":
        self._ls(self._meta.root_path, filter_output=True)
    else:
        if self._meta.file_ext:
            pattern = "|".join(["{}$".format(x) for x in self._meta.file_ext])
            # FIX: 'file_filter' was referenced but never defined here and
            # 'pattern' was built but unused — define the filter from the
            # pattern so the walk actually works.
            def file_filter(f):
                return re.search(pattern, f) is not None
            flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), file_filter)
            if flist:
                self.app._output_data["stdout"].write("\n".join(flist))
        else:
            self._ls(os.path.join(self._meta.root_path, self._meta.path_id))
def test_remove_files(self):
    """Test removing files"""
    keep_files = ["-post_process.yaml$", "-post_process.yaml.bak$", "-bcbb-config.yaml$",
                  "-bcbb-config.yaml.bak$", "-bcbb-command.txt$", "-bcbb-command.txt.bak$",
                  "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$", "^[0-9][0-9]_.*.txt$"]
    keep_pattern = "|".join(keep_files)
    def remove_filter_fn(f):
        # Candidates for removal are files matching none of the keep patterns
        return re.search(keep_pattern, f) is None
    for cfg in find_samples(j_doe_00_05):
        candidates = filtered_walk(os.path.dirname(cfg), remove_filter_fn)
        # The numbered analysis log must be protected by the keep pattern
        self.assertNotIn("01_analysis_start.txt", [os.path.basename(x) for x in candidates])
def test_remove_dirs(self):
    """Test removing directories before rerunning pipeline"""
    keep_files = ["-post_process.yaml$", "-post_process.yaml.bak$", "-bcbb-config.yaml$",
                  "-bcbb-config.yaml.bak$", "-bcbb-command.txt$", "-bcbb-command.txt.bak$",
                  "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$"]
    keep_pattern = "|".join(keep_files)
    def remove_filter_fn(f):
        # Candidates for removal are entries matching none of the keep patterns
        return re.search(keep_pattern, f) is None
    for cfg in find_samples(j_doe_00_05):
        candidates = filtered_walk(os.path.dirname(cfg), remove_filter_fn, get_dirs=True)
        # fastqc output directories should be slated for removal
        self.assertIn("fastqc", [os.path.basename(x) for x in candidates])
def test_filtered_walk_exclude(self):
    """Perform a filtered walk of data dir, using exclude_dirs restriction"""
    expected = set([
        'data/file1.txt',
        'data/alignments/file1.txt',
        'data/fastqc/file1.txt'
    ])
    hits = filtered_walk("data", filter_fn=self.filter_fn, exclude_dirs=["nophix"])
    self.assertEqual(set(hits), expected)
def remove_finished(self):
    """Remove files from sample directories marked as finished.

    Walks every sample directory under the project path; directories
    lacking FINISHED_FILE or already containing REMOVED_FILE are skipped.
    The FINISHED_FILE marker itself is kept, and a REMOVED_FILE marker
    with a UTC timestamp is written afterwards (unless --dry_run).
    """
    if not self._check_pargs(["project"]):
        return
    # Don't filter out files
    def filter_fn(f):
        return True
    slist = os.listdir(
        os.path.join(self._meta.root_path, self._meta.path_id))
    for s in slist:
        spath = os.path.join(self._meta.root_path, self._meta.path_id, s)
        if not os.path.isdir(spath):
            continue
        if not os.path.exists(os.path.join(spath, FINISHED_FILE)):
            self.app.log.info("Sample {} not finished; skipping".format(s))
            continue
        flist = filtered_walk(spath, filter_fn)
        dlist = filtered_walk(spath, filter_fn, get_dirs=True)
        if os.path.exists(os.path.join(spath, REMOVED_FILE)):
            self.app.log.info(
                "Sample {} already removed; skipping".format(s))
            continue
        if len(flist) > 0 and not query_yes_no(
                "Will remove directory {} containing {} files; continue?".
                format(s, len(flist)), force=self.pargs.force):
            continue
        self.app.log.info("Removing {} files from {}".format(
            len(flist), spath))
        for f in flist:
            # Keep the FINISHED_FILE marker
            if f == os.path.join(spath, FINISHED_FILE):
                continue
            self.app.cmd.safe_unlink(f)
        self.app.log.info("Removing {} directories from {}".format(
            len(dlist), spath))
        # Reverse sort removes deepest directories first
        for d in sorted(dlist, reverse=True):
            self.app.cmd.safe_rmdir(d)
        if not self.pargs.dry_run:
            with open(os.path.join(spath, REMOVED_FILE), "w") as fh:
                t_utc = utc_time()
                fh.write(t_utc)
def test_filtered_walk_include(self):
    """Perform a filtered walk of data dir, using include_dirs restriction"""
    # Retarget the shared filter to file2.txt for this case
    self.pattern = "file2.txt"
    expected = set([
        'data/nophix/file2.txt',
        'data/nophix/fastqc/file2.txt',
        'data/fastqc/nophix/file2.txt'
    ])
    hits = filtered_walk("data", filter_fn=self.filter_fn, include_dirs=["nophix"])
    self.assertEqual(set(hits), expected)
def test_casava_transfer(self):
    """Test transfer of casava data from production to project"""
    self.app = self.make_app(argv=['production', 'transfer', 'J.Doe_00_03', '--debug', '--force', '--quiet'], extensions=[])
    handler.register(ProductionController)
    self._run_app()
    os.chdir(filedir)
    project_dir = os.path.abspath(os.path.join(filedir, "data", "projects", "j_doe_00_03"))
    fq_pattern = ".fastq(.gz)?$"
    def fastq_filter(f):
        return re.search(fq_pattern, f) is not None
    # Exactly two fastq files should have been transferred
    self.assertEqual(len(filtered_walk(project_dir, fastq_filter)), 2)
def setUpClass(cls):
    """Prepare test fixtures: copy the project tree and record one
    bcbb config file path in SAMPLEFILE for sample-file-based tests."""
    if not os.getcwd() == filedir:
        os.chdir(filedir)
    LOG.info("Copy tree {} to {}".format(j_doe_00_01, j_doe_00_04))
    if not os.path.exists(j_doe_00_04):
        shutil.copytree(j_doe_00_01, j_doe_00_04)
    pattern = "-bcbb-config.yaml$"
    def yaml_filter(f):
        return re.search(pattern, f) != None
    yaml_files = filtered_walk(j_doe_00_04, yaml_filter)
    # Only the first config file goes into the sample file fixture
    with open(SAMPLEFILE, "w") as fh:
        fh.write("\n".join(yaml_files[0:1]))
def clean(self):
    """Remove files matching the controller's registered file extensions
    (optionally gz/bz2 compressed), after user confirmation."""
    if not self._check_pargs(["project"]):
        return
    # Build the removal pattern from the registered extensions,
    # allowing optional .gz/.bz2 suffixes
    self._meta.pattern = "|".join(["{}(.gz|.bz2)?$".format(x) for x in self._meta.file_ext])
    flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), self._filter_fn, include_dirs=self._meta.include_dirs)
    if len(flist) == 0:
        self.app.log.info("No files matching pattern '{}' found".format(self._meta.pattern))
        return
    if len(flist) > 0 and not query_yes_no("Going to remove {} files ({}...). Are you sure you want to continue?".format(len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
        return
    for f in flist:
        self.app.log.info("removing {}".format(f))
        self.app.cmd.safe_unlink(f)
def _compress(self, label="compress"):
    """Run the configured compression program on matching files.

    :param label: operation label used in prompts, logs and the drmaa
                  log file name (e.g. "compress"/"decompress")
    """
    if self.pargs.input_file:
        # An explicit input file bypasses the walk
        flist = [self.pargs.input_file]
    else:
        flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), self._filter_fn)
    if len(flist) == 0:
        self.app.log.info("No files matching pattern '{}' found".format(self._meta.pattern))
        return
    if len(flist) > 0 and not query_yes_no("Going to {} {} files ({}...). Are you sure you want to continue?".format(label, len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
        sys.exit()
    for f in flist:
        self.log.info("{}ing {}".format(label, f))
        # Per-file job submission; drmaa log is written next to the file
        self.app.cmd.command([self._meta.compress_prog, self._meta.compress_opt, "%s" % f], label, ignore_error=True, **{'workingDirectory':os.path.dirname(f), 'outputPath':os.path.join(os.path.dirname(f), "{}-{}-drmaa.log".format(label, os.path.basename(f)))})
def best_practice(self):
    """Deliver best-practice analysis results to an uppmax project INBOX.

    Like the casava delivery, but restricted to yaml/metrics (and
    optionally bam/vcf) files; supports limiting delivery to a single
    flowcell and estimating the total delivery size.
    """
    if not self._check_pargs(["project", "uppmax_project"]):
        return
    project_path = os.path.normpath(os.path.join("/proj", self.pargs.uppmax_project))
    if not os.path.exists(project_path):
        self.log.warn("No such project {}; skipping".format(self.pargs.uppmax_project))
        return
    # Delivery target: explicit outdir > statusdb project name > project name
    if self.pargs.outdir:
        outpath = os.path.join(project_path, "INBOX", self.pargs.outdir)
    else:
        outpath = os.path.join(project_path, "INBOX", self.pargs.statusdb_project_name) if self.pargs.statusdb_project_name else os.path.join(project_path, "INBOX", self.pargs.project)
    if not query_yes_no("Going to deliver data to {}; continue?".format(outpath)):
        return
    if not os.path.exists(outpath):
        self.app.cmd.safe_makedir(outpath)
    kw = vars(self.pargs)
    basedir = os.path.abspath(os.path.join(self._meta.root_path, self._meta.path_id))
    flist = find_samples(basedir, **vars(self.pargs))
    if self.pargs.flowcell:
        # Restrict delivery to configs living in the named flowcell dir
        flist = [fl for fl in flist if os.path.basename(os.path.dirname(fl)) == self.pargs.flowcell]
    if not len(flist) > 0:
        self.log.info("No samples/sample configuration files found")
        return
    def filter_fn(f):
        # 'pattern' is assigned below, before this filter is first called
        # (late-binding closure).
        if not pattern:
            return
        return re.search(pattern, f) != None
    # Setup pattern
    plist = [".*.yaml$", ".*.metrics$"]
    if not self.pargs.no_bam:
        plist.append(".*-{}.bam$".format(self.pargs.bam_file_type))
        plist.append(".*-{}.bam.bai$".format(self.pargs.bam_file_type))
    if not self.pargs.no_vcf:
        plist.append(".*.vcf$")
        plist.append(".*.vcf.gz$")
        plist.append(".*.tbi$")
        plist.append(".*.tsv$")
    pattern = "|".join(plist)
    size = 0
    for f in flist:
        path = os.path.dirname(f)
        sources = filtered_walk(path, filter_fn=filter_fn, exclude_dirs=BCBIO_EXCLUDE_DIRS)
        # Mirror the source tree layout under the delivery path
        targets = [src.replace(basedir, outpath) for src in sources]
        self._transfer_files(sources, targets)
        if self.pargs.size:
            statinfo = [os.stat(src).st_size for src in sources]
            size = size + sum(statinfo)
    if self.pargs.size:
        self.app._output_data['stderr'].write("\n********************************\nEstimated delivery size: {:.1f}G\n********************************".format(size/1e9))
def _from_casava_structure(self):
    """Get information from casava structure.

    Walks the controller path for -bcbb-config.yaml files and builds,
    for each one, a Flowcell subset restricted to the current project
    with its files collected from the config file's directory.

    :returns: list of per-sample Flowcell objects, or None if the
              required pargs are missing
    """
    if not self._check_pargs(["project"]):
        return
    fc_list = []
    pattern = "-bcbb-config.yaml$"

    def bcbb_yaml_filter(f):
        return re.search(pattern, f) is not None

    samples = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), bcbb_yaml_filter)
    for s in samples:
        fc = Flowcell(s)
        fc_new = fc.subset("sample_prj", self.pargs.project)
        fc_new.collect_files(os.path.dirname(s))
        fc_list.append(fc_new)
    return fc_list
def clean(self):
    """Remove files matching the controller's file patterns (optionally
    gz/bz2 compressed), after user confirmation."""
    # Allow optional .gz/.bz2 suffixes on each registered pattern
    pattern = "|".join(["{}(.gz|.bz2)?$".format(x) for x in self._meta.file_pat])
    def clean_filter(f):
        # Empty pattern disables the filter (returns None, i.e. falsy)
        if not pattern:
            return
        return re.search(pattern , f) != None
    flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), clean_filter, include_dirs=self._meta.include_dirs)
    if len(flist) == 0:
        self.app.log.info("No files matching pattern {} found".format(pattern))
        return
    if len(flist) > 0 and not query_yes_no("Going to remove {} files ({}...). Are you sure you want to continue?".format(len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
        return
    for f in flist:
        self.app.log.info("removing {}".format(f))
        self.app.cmd.safe_unlink(f)
def get_report_copy_list(proj_name, reportpath, dest_proj_path, sample_copy_list):
    """Extend a sample copy list with matching project/sample summary PDFs.

    :param proj_name: project name used to build the report file names
    :param reportpath: directory to walk for pdf reports
    :param dest_proj_path: delivery destination for matched reports
    :param sample_copy_list: existing copy triples (also used to derive
        the set of flowcell ids via get_run_info)
    :returns: sample_copy_list plus [report, dest, name] triples
    """
    to_copy=[]
    fcid = get_run_info(sample_copy_list)
    pdf_list=filtered_walk(reportpath,is_pdf)
    project_report_name = proj_name+'_project_summary.pdf'
    for report in pdf_list:
        # Project-level summary
        if report.split('/')[-1] == project_report_name:
            to_copy.append([report, dest_proj_path, project_report_name])
        # Per-flowcell sample summaries
        for flowcell in fcid:
            sample_report_name = proj_name+'_' + flowcell + '_sample_summary.pdf'
            if report.split('/')[-1] == sample_report_name:
                to_copy.append([report, dest_proj_path, sample_report_name])
    return sample_copy_list+to_copy
def get_report_copy_list(proj_name, reportpath, dest_proj_path, sample_copy_list):
    """Extend a sample copy list with matching project/sample summary PDFs.

    :param proj_name: project name used to build the report file names
    :param reportpath: directory to walk for pdf reports
    :param dest_proj_path: delivery destination for matched reports
    :param sample_copy_list: existing copy triples (also used to derive
        the set of flowcell ids via get_run_info)
    :returns: sample_copy_list plus [report, dest, name] triples
    """
    to_copy = []
    fcid = get_run_info(sample_copy_list)
    pdf_list = filtered_walk(reportpath, is_pdf)
    project_report_name = proj_name + '_project_summary.pdf'
    for report in pdf_list:
        # Project-level summary
        if report.split('/')[-1] == project_report_name:
            to_copy.append([report, dest_proj_path, project_report_name])
        # Per-flowcell sample summaries
        for flowcell in fcid:
            sample_report_name = proj_name + '_' + flowcell + '_sample_summary.pdf'
            if report.split('/')[-1] == sample_report_name:
                to_copy.append([report, dest_proj_path, sample_report_name])
    return sample_copy_list + to_copy
def run(self):
    """Start bcbb analyses for matching sample config files.

    Gathers -bcbb-config.yaml files (from a sample list file, a named
    sample, or a project walk), optionally keeps only failed samples,
    rewrites each config (analysis type, genome build, fastq suffixes)
    into a -pm-bcbb-analysis-config.yaml, and runs
    automated_initial_analysis.py in each sample's directory.
    """
    if not self._check_pargs(["project", "post_process", "analysis_type"]):
        return
    ## Gather sample yaml files
    pattern = "-bcbb-config.yaml$"
    flist = []
    if self.pargs.sample:
        if os.path.exists(self.pargs.sample):
            # 'sample' is a file listing config file paths
            with open(self.pargs.sample) as fh:
                flist = [x.rstrip() for x in fh.readlines()]
        else:
            # Narrow the pattern to the named sample
            pattern = "{}{}".format(self.pargs.sample, pattern)
    def bcbb_yaml_filter(f):
        return re.search(pattern, f) != None
    if not flist:
        flist = filtered_walk(os.path.join(self.app.controller._meta.project_root, self.pargs.project, "data"), bcbb_yaml_filter)
    if self.pargs.only_failed:
        # NOTE(review): 'status' is unused and _sample_status is called
        # twice per file here.
        status = {x:self._sample_status(x) for x in flist}
        flist = [x for x in flist if self._sample_status(x)=="FAIL"]
    if len(flist) == 0 and self.pargs.sample:
        self.app.log.info("No such sample {}".format(self.pargs.sample))
    if len(flist) > 0 and not query_yes_no("Going to start {} jobs... Are you sure you want to continue?".format(len(flist)), force=self.pargs.force):
        return
    for f in flist:
        with open(f) as fh:
            # NOTE(review): yaml.load without an explicit Loader is unsafe
            # on untrusted input — these are pipeline-generated configs,
            # presumably trusted; confirm.
            config = yaml.load(fh)
        if self.pargs.analysis_type:
            config["details"][0]["multiplex"][0]["analysis"] = self.pargs.analysis_type
            config["details"][0]["analysis"] = self.pargs.analysis_type
        if config["details"][0]["genome_build"] == 'unknown':
            config["details"][0]["genome_build"] = self.pargs.genome_build
        ## Check if files exist: if they don't, then change the suffix
        config["details"][0]["multiplex"][0]["files"].sort()
        if not os.path.exists(config["details"][0]["multiplex"][0]["files"][0]):
            if os.path.splitext(config["details"][0]["multiplex"][0]["files"][0])[1] == ".gz":
                config["details"][0]["multiplex"][0]["files"] = [x.replace(".gz", "") for x in config["details"][0]["multiplex"][0]["files"]]
            else:
                config["details"][0]["multiplex"][0]["files"] = ["{}.gz".format(x) for x in config["details"][0]["multiplex"][0]["files"]]
        config_file = f.replace("-bcbb-config.yaml", "-pm-bcbb-analysis-config.yaml")
        self.app.cmd.write(config_file, yaml.dump(config))
        ## Run automated_initial_analysis.py
        cur_dir = os.getcwd()
        new_dir = os.path.abspath(os.path.dirname(f))
        os.chdir(new_dir)
        self.app.cmd.command(['automated_initial_analysis.py', os.path.abspath(self.pargs.post_process), new_dir, config_file])
        os.chdir(cur_dir)
def collect_files(self, path, project=None):
    """Collect files for a given project. FIXME: does not work entirely for casava-like folder structure"""
    if project:
        fc = self.subset("sample_prj", project)
    else:
        fc = self
    pattern = "|".join(fc.glob_pfx_str())
    def file_filter(f):
        # Empty pattern disables the filter (returns None, i.e. falsy)
        if not pattern:
            return
        return re.search(pattern, f) != None
    flist = filtered_walk(path, file_filter)
    for f in flist:
        # NOTE(review): classifies on self even when 'fc' is a subset —
        # looks intentional given fc may be self, but confirm for the
        # project-subset case.
        self.classify_file(f)
    fc.path = path
    return fc
def test_casava_transfer(self):
    """Test transfer of casava data from production to project"""
    self.app = self.make_app(argv=[
        'production', 'transfer', 'J.Doe_00_03', '--debug', '--force', '--quiet'
    ], extensions=[])
    handler.register(ProductionController)
    self._run_app()
    os.chdir(filedir)
    project_dir = os.path.abspath(
        os.path.join(filedir, "data", "projects", "j_doe_00_03"))
    fq_pattern = ".fastq(.gz)?$"
    def fastq_filter(f):
        return re.search(fq_pattern, f) is not None
    # Exactly two fastq files should have been transferred
    self.assertEqual(len(filtered_walk(project_dir, fastq_filter)), 2)
def _compress(self, pattern, label="compress"):
    """Run the configured compression program on files matching *pattern*.

    :param pattern: regular expression selecting files to process
    :param label: operation label used in prompts and log messages
    """
    def compress_filter(f):
        # Empty pattern disables the filter (returns None, i.e. falsy)
        if not pattern:
            return
        return re.search(pattern, f) != None
    if self.pargs.input_file:
        # An explicit input file bypasses the walk
        flist = [self.pargs.input_file]
    else:
        flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), compress_filter)
    if len(flist) == 0:
        self.app.log.info("No files matching pattern {} found".format(pattern))
        return
    if len(flist) > 0 and not query_yes_no("Going to {} {} files ({}...). Are you sure you want to continue?".format(label, len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
        sys.exit()
    for f in flist:
        self.log.info("{}ing {}".format(label, f))
        self.app.cmd.command([self._meta.compress_prog, self._meta.compress_opt, "%s" % f], label, ignore_error=True)
def test_remove_files(self):
    """Test removing files"""
    keep_files = [
        "-post_process.yaml$", "-post_process.yaml.bak$",
        "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",
        "-bcbb-command.txt$", "-bcbb-command.txt.bak$",
        "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$", "^[0-9][0-9]_.*.txt$"
    ]
    keep_pattern = "|".join(keep_files)
    def remove_filter_fn(f):
        # Candidates for removal are files matching none of the keep patterns
        return re.search(keep_pattern, f) is None
    for cfg in find_samples(j_doe_00_05):
        candidates = filtered_walk(os.path.dirname(cfg), remove_filter_fn)
        # The numbered analysis log must be protected by the keep pattern
        self.assertNotIn("01_analysis_start.txt", [os.path.basename(x) for x in candidates])
def collect_files(self, path, project=None):
    """Collect files for a given project.

    :param path: path to search in
    """
    if project:
        fc = self.subset("sample_prj", project)
    else:
        fc = self
    pattern = "|".join(fc.glob_pfx_str())
    def file_filter(f):
        # Empty pattern disables the filter (returns None, i.e. falsy)
        if not pattern:
            return
        return re.search(pattern, f) != None
    flist = filtered_walk(path, file_filter)
    for f in flist:
        # NOTE(review): classifies on self even when 'fc' is a subset —
        # looks intentional given fc may be self, but confirm for the
        # project-subset case.
        self.classify_file(f)
    fc.path = path
    return fc
def test_remove_dirs(self):
    """Test removing directories before rerunning pipeline.

    Builds a "keep" pattern from known output/config file name patterns
    and checks that the fastqc directory is among the directories
    selected for removal in each sample working directory.
    """
    keep_files = ["-post_process.yaml$", "-post_process.yaml.bak$", "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$", "-bcbb-command.txt$", "-bcbb-command.txt.bak$", "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$"]
    pattern = "|".join(keep_files)

    def remove_filter_fn(f):
        # Select everything that does NOT match one of the keep patterns
        return re.search(pattern, f) is None

    flist = find_samples(j_doe_00_05)
    for f in flist:
        workdir = os.path.dirname(f)
        remove_dirs = filtered_walk(workdir, remove_filter_fn, get_dirs=True)
        self.assertIn("fastqc", [os.path.basename(x) for x in remove_dirs])
def get_file_copy_list(proj_base_dir, dest_proj_path, fcid, deliver_all_fcs, deliver_nophix, skip_list):
    """Build the list of fastq files to deliver.

    Walks *proj_base_dir* for fastq files (restricted to flowcell *fcid*
    unless *deliver_all_fcs* is set), skips files whose parent directory
    does not match the delivery mode and files that also exist in
    compressed form, and returns a list of
    ``[source, destination-dir, destination-name]`` triples.
    """
    copy_jobs = []
    include = None if deliver_all_fcs else [fcid]
    for src_fastq in filtered_walk(proj_base_dir, is_fastq, include_dirs=include, exclude_dirs=skip_list):
        # First two path components below proj_base_dir are the sample
        # and run names; the run name encodes date and flowcell id.
        sample_name, run_name, _ = os.path.relpath(src_fastq, proj_base_dir).split(os.sep, 2)
        date, fc_id = run_name.split('_')
        # Deliver either from the nophix subfolder or straight from the
        # run folder, depending on deliver_nophix.
        parent = os.path.basename(os.path.dirname(src_fastq))
        wanted_parent = "nophix" if deliver_nophix else run_name
        if parent != wanted_parent:
            continue
        # A compressed twin wins over the uncompressed file
        if os.path.exists("{:s}.gz".format(src_fastq)):
            print("WARNING: Both compressed and non-compressed versions of {:s} exists! "
                  "Is compression/decompression in progress? Will deliver compressed version "
                  "but you should make sure that the delivered files are complete!".format(src_fastq))
            continue
        print("DEBUG: source_delivery_path = {:s}".format(os.path.dirname(src_fastq)))
        src_name = os.path.basename(src_fastq)
        print(src_name)
        dest_dir = os.path.join(dest_proj_path, sample_name, run_name)
        dest_name = create_final_name(src_name, date, fc_id, sample_name)
        copy_jobs.append([src_fastq, dest_dir, dest_name])
    return copy_jobs
def purge_alignments(path, ftype="sam", keep="last", dry_run=False, force=False, fsize=MINFILESIZE):
    """Cleanup sam and bam files.

    In some cases, sam files persist. If the corresponding bam file
    exists, replace the sam file contents with a message that the file
    has been removed to save space.

    In general, several bam files are produced in an analysis. By
    grouping bam files by prefix, either the most recent file is
    retained for further reference, or a specific analysis is kept.

    :param path: root path to walk for alignment files
    :param ftype: file type to purge, one of "sam" or "bam"
    :param keep: keep rule for grouped bam files ("last" is the only
        rule acted on here)
    :param dry_run: log actions without modifying files
    :param force: skip the interactive confirmation prompt
    :param fsize: size threshold passed to _purge_by_sample
        (presumably a minimum file size in bytes — confirm there)
    """
    if ftype == "sam":
        pattern = ".sam$"
    elif ftype == "bam":
        pattern = ".bam$"
    else:
        LOG.warn("ftype must be one of 'sam' or 'bam'")
        return
    LOG.debug(
        "running purge_alignments in path {} with pattern {} keep rule {}".
        format(path, pattern, keep))

    def purge_filter(f):
        # Empty pattern matches nothing (implicit None return is falsy)
        if not pattern:
            return
        return re.search(pattern, f) != None

    flist = filtered_walk(path, purge_filter, exclude_dirs=["realign-split"])
    if len(flist) == 0:
        LOG.info("No {} files found in {}".format(ftype, path))
        return
    # Confirm before touching anything, unless force is set
    if len(flist) > 0 and not query_yes_no(
            "Going to remove/cleanup {} {} files ({}...). Are you sure you want to continue?"
            .format(len(flist), ftype, ",".join(
                [os.path.basename(x) for x in flist[0:10]])), force=force):
        return
    if ftype == "sam":
        for f in flist:
            LOG.info("Purging {} file {}".format(ftype, f))
            dry_unlink(f, dry_run)
            # If a bam twin exists, leave a placeholder note in the sam path
            if os.path.exists(f.replace(".sam", ".bam")):
                dry_write(
                    f, "File removed to save disk space: SAM converted to BAM",
                    dry_run)
        return
    elif ftype == "bam":
        # Group bam files by sample-id prefix, then by directory
        samples = {}
        for f in flist:
            m = re.search("([0-9A-Za-z\_]+)-.*", os.path.basename(f))
            if not m:
                LOG.debug("Couldn't determine prefix for {}".format(f))
                continue
            sid = m.groups()[0]
            if not sid in samples.keys():
                samples[sid] = {}
            dname = os.path.dirname(f)
            if not dname in samples[sid].keys():
                samples[sid][dname] = []
            samples[sid][dname].append(f)
        saved_size = 0
        for k in samples.iterkeys():
            for d, files in samples[k].iteritems():
                # Nothing to purge when there is at most one file in a group
                if not files or len(files) == 1:
                    continue
                # Sort by path length; the longest name is treated as the
                # most recent analysis product
                files.sort(lambda x, y: cmp(len(x), len(y)))
                if keep == "last":
                    LOG.info(
                        "Keeping file {} and removing all files with common prefix: {}"
                        .format(
                            os.path.basename(files[len(files) - 1]), ", ".join(
                                [os.path.basename(x) for x in files[0:-1]])))
                    # NOTE(review): source formatting is ambiguous about
                    # whether this purge runs only for keep == "last" —
                    # kept inside the branch; confirm against history.
                    saved_size = _purge_by_sample(files, dry_run, int(fsize)) + saved_size
        LOG.info("Will save approximately {:.1f}G space".format(saved_size / 1e9))
def test_filtered_walk_exclude(self):
    """Perform a filtered walk of data dir, using exclude_dirs restriction"""
    found = filtered_walk("data", filter_fn=self.filter_fn, exclude_dirs=["nophix"])
    expected = {'data/file1.txt', 'data/alignments/file1.txt', 'data/fastqc/file1.txt'}
    self.assertEqual(set(found), expected)
def test_filtered_walk_include(self):
    """Perform a filtered walk of data dir, using include_dirs restriction"""
    self.pattern = "file2.txt"
    found = filtered_walk("data", filter_fn=self.filter_fn, include_dirs=["nophix"])
    expected = {'data/nophix/file2.txt', 'data/nophix/fastqc/file2.txt', 'data/fastqc/nophix/file2.txt'}
    self.assertEqual(set(found), expected)
def test_filtered_walk(self):
    """Perform a filtered walk of data dir"""
    found = filtered_walk("data", filter_fn=self.filter_fn)
    expected = {'data/file1.txt', 'data/alignments/file1.txt', 'data/nophix/file1.txt',
                'data/nophix/fastqc/file1.txt', 'data/fastqc/file1.txt', 'data/fastqc/nophix/file1.txt'}
    self.assertEqual(set(found), expected)
def _to_casava_structure(self, fc):
    """Transfer flowcell data into a casava-style project layout.

    For each sample in *fc*, sequence and result files are copied under
    <project root>/<project>/data/<sample>/<fc_id> (or under
    ``--transfer_dir`` when given), a rewritten bcbb config is saved,
    and platform_args in the sample's -post_process.yaml files are
    pruned. A per-sample transfer summary is written to the app's
    stderr output data.

    :param fc: flowcell object to transfer
    """
    transfer_status = {}
    # Default destination prefix; --transfer_dir overrides the project name
    outdir_pfx = os.path.abspath(
        os.path.join(self.app.config.get("project", "root"), self.pargs.project, "data"))
    if self.pargs.transfer_dir:
        outdir_pfx = os.path.abspath(
            os.path.join(self.app.config.get("project", "root"), self.pargs.transfer_dir, "data"))
    for sample in fc:
        # Samples are keyed by lane + barcode sequence
        key = "{}_{}".format(sample['lane'], sample['sequence'])
        sources = {
            "files": self._prune_sequence_files(sample['files']),
            "results": sample['results']
        }
        outdir = os.path.join(outdir_pfx, sample['name'], fc.fc_id())
        # "data" and "intermediate" currently resolve to the same directory
        dirs = {
            "data": os.path.abspath(
                os.path.join(outdir_pfx, sample['name'], fc.fc_id())),
            "intermediate": os.path.abspath(
                os.path.join(outdir_pfx, sample['name'], fc.fc_id()))
        }
        self._make_output_dirs(dirs)
        # Narrow the flowcell object to this sample's lane and name
        fc_new = fc.subset("lane", sample['lane']).subset("name", sample['name'])
        # Map source paths onto destination paths by prefix replacement
        targets = {
            "files": [
                src.replace(fc.path, dirs["data"]) for src in sources['files']
            ],
            "results": [
                src.replace(fc.path, dirs["intermediate"]) for src in sources['results']
            ]
        }
        fc_new.lane_files = dict(
            (k, [os.path.join(outdir, os.path.basename(x)) for x in v])
            for k, v in fc_new.lane_files.items())
        fc_new.set_entry(key, 'files', targets['files'])
        fc_new.set_entry(key, 'results', targets['results'])
        ## Copy sample files - currently not doing lane files
        self._transfer_files(sources, targets)
        self.app.cmd.write(
            os.path.join(dirs["data"], "{}-bcbb-pm-config.yaml".format(sample['name'])),
            fc_new.as_yaml())
        transfer_status[sample['name']] = {
            'files': len(sources['files']),
            'results': len(sources['results'])
        }
        ## Rewrite platform_args; only keep time, workdir, account, partition, outpath and jobname
        # NOTE(review): this section uses the per-sample dirs["data"], so
        # it is kept inside the sample loop — confirm against history.
        pattern = "-post_process.yaml$"

        def pp_yaml_filter(f):
            return re.search(pattern, f) != None

        ppfiles = filtered_walk(dirs["data"], pp_yaml_filter)
        for pp in ppfiles:
            self.app.log.debug("Rewriting platform args for {}".format(pp))
            with open(pp, "r") as fh:
                # NOTE: yaml.load without an explicit Loader is unsafe on
                # untrusted input; these are project-generated configs.
                conf = yaml.load(fh)
            if not conf:
                self.app.log.warn("No configuration for {}".format(pp))
                continue
            newconf = prune_pp_platform_args(conf)
            if newconf == conf:
                continue
            self.app.cmd.safe_unlink(pp)
            self.app.cmd.write(
                pp,
                yaml.safe_dump(newconf, default_flow_style=False, allow_unicode=True, width=1000))
    # Write transfer summary
    self.app._output_data["stderr"].write("Transfer summary\n")
    self.app._output_data["stderr"].write("{:<18}{:>18}{:>18}\n".format(
        "Sample", "Transferred files", "Results"))
    for k, v in transfer_status.iteritems():
        self.app._output_data["stderr"].write(
            "{:<18}{:>18}{:>18}\n".format(k, v['files'], v['results']))
def purge_alignments(path, ftype="sam", keep="last", dry_run=False, force=False, fsize=MINFILESIZE):
    """Cleanup sam and bam files.

    In some cases, sam files persist. If the corresponding bam file
    exists, replace the sam file contents with a message that the file
    has been removed to save space.

    In general, several bam files are produced in an analysis. By
    grouping bam files by prefix, either the most recent file is
    retained for further reference, or a specific analysis is kept.

    :param path: root path to walk for alignment files
    :param ftype: file type to purge, one of "sam" or "bam"
    :param keep: keep rule for grouped bam files ("last" is the only
        rule acted on here)
    :param dry_run: log actions without modifying files
    :param force: skip the interactive confirmation prompt
    :param fsize: size threshold passed to _purge_by_sample
        (presumably a minimum file size in bytes — confirm there)
    """
    if ftype == "sam":
        pattern = ".sam$"
    elif ftype == "bam":
        pattern = ".bam$"
    else:
        LOG.warn("ftype must be one of 'sam' or 'bam'")
        return
    LOG.debug("running purge_alignments in path {} with pattern {} keep rule {}".format(path, pattern, keep))

    def purge_filter(f):
        # Empty pattern matches nothing (implicit None return is falsy)
        if not pattern:
            return
        return re.search(pattern, f) != None

    flist = filtered_walk(path, purge_filter, exclude_dirs=["realign-split"])
    if len(flist) == 0:
        LOG.info("No {} files found in {}".format(ftype, path))
        return
    # Confirm before touching anything, unless force is set
    if len(flist) > 0 and not query_yes_no("Going to remove/cleanup {} {} files ({}...). Are you sure you want to continue?".format(len(flist), ftype, ",".join([os.path.basename(x) for x in flist[0:10]])), force=force):
        return
    if ftype == "sam":
        for f in flist:
            LOG.info("Purging {} file {}".format(ftype, f))
            dry_unlink(f, dry_run)
            # If a bam twin exists, leave a placeholder note in the sam path
            if os.path.exists(f.replace(".sam", ".bam")):
                dry_write(f, "File removed to save disk space: SAM converted to BAM", dry_run)
        return
    elif ftype == "bam":
        # Group bam files by sample-id prefix, then by directory
        samples = {}
        for f in flist:
            m = re.search("([0-9A-Za-z\_]+)-.*", os.path.basename(f))
            if not m:
                LOG.debug("Couldn't determine prefix for {}".format(f))
                continue
            sid = m.groups()[0]
            if not sid in samples.keys():
                samples[sid] = {}
            dname = os.path.dirname(f)
            if not dname in samples[sid].keys():
                samples[sid][dname] = []
            samples[sid][dname].append(f)
        saved_size = 0
        for k in samples.iterkeys():
            for d, files in samples[k].iteritems():
                # Nothing to purge when there is at most one file in a group
                if not files or len(files) == 1:
                    continue
                # Sort by path length; the longest name is treated as the
                # most recent analysis product
                files.sort(lambda x,y: cmp(len(x), len(y)))
                if keep == "last":
                    LOG.info("Keeping file {} and removing all files with common prefix: {}".format(os.path.basename(files[len(files)-1]), ", ".join([os.path.basename(x) for x in files[0:-1]])))
                    # NOTE(review): source formatting is ambiguous about
                    # whether this purge runs only for keep == "last" —
                    # kept inside the branch; confirm against history.
                    saved_size = _purge_by_sample(files, dry_run, int(fsize)) + saved_size
        LOG.info("Will save approximately {:.1f}G space".format(saved_size / 1e9))
def flowcell_remove_status(archive_dir, swestore_dir, to_remove="to_remove"):
    """This function looks for flowcells that could be deleted from
    archive and returns a list of flowcells with a KEEP/RM flag. The
    rules are

    1. the flowcell is in archive to_remove file
    2. pbzip ran without error
    3. the tarball filesize looks ok
    4. checksum irods is ok

    :param archive_dir: archive directory
    :param swestore_dir: base dir for swestore
    :param to_remove: to remove file name

    :returns: dict with 'stdout'/'stderr' StringIO buffers; stdout holds
        the formatted status table
    """
    output_data = {'stdout': StringIO(), 'stderr': StringIO()}
    ## Check for ils: if the irods tools are not on PATH, bail out early.
    # except Exception (not bare except) so KeyboardInterrupt/SystemExit
    # still propagate.
    try:
        proc = subprocess.Popen(["ils"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = proc.communicate()
        proc.wait()
        proc = subprocess.Popen(["icd", os.path.basename(os.path.dirname(archive_dir))],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = proc.communicate()
        proc.wait()
    except Exception:
        LOG.warn("No such command 'ils': please load the irods module")
        return output_data
    ## make flowcell dictionary based on to_remove contents
    to_remove_file = os.path.join(archive_dir, to_remove)
    with open(to_remove_file) as fh:
        remove_list = fh.readlines()
    flowcells = {k.replace("./", "").rstrip(): {'in_archive': False, 'pbzip_exit': 1, 'tarball_size': 0, 'irods_checksum': 1}
                 for k in remove_list if k.rstrip() != ''}
    ## Look for compress logs and harvest the pbzip exit codes
    pattern = "slurm.*.out$"

    def compress_fn(f):
        return re.search(pattern, f) is not None

    compress_log_files = filtered_walk(os.path.join(archive_dir, "compress_logs"), compress_fn)
    for f in compress_log_files:
        with open(f) as fh:
            compress_str = "".join([x.strip() for x in fh.readlines()])
        m = re.search("Compressing[ ]+([0-9A-Za-z_\-]+)\.\.\..*Exit code:[ ]+([0-9]+)", compress_str)
        if m:
            if not m.groups()[0] in flowcells.keys():
                LOG.warn("flowcell {} present in to_remove but not in archive".format(m.groups()[0]))
            else:
                flowcells[m.groups()[0]]['pbzip_exit'] = m.groups()[1]
        else:
            LOG.warn("{}: no match for {}".format(f, compress_str))
    ## Get tarball sizes and check if in archive
    ## Loop through flowcells and perform ichksum
    for k in flowcells.keys():
        LOG.debug("Getting tarball size, archive presence and ichksum for {}".format(k))
        fcdir = os.path.join(archive_dir, k)
        if os.path.exists(fcdir):
            flowcells[k]['in_archive'] = True
        fctar = os.path.join(swestore_dir, "drophere2archive", "{}.tar.bz2".format(k))
        try:
            cl = ["ichksum", os.path.basename(fctar)]
            proc = subprocess.Popen(cl, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            (stdout, stderr) = proc.communicate()
            proc.wait()
            # Second line of ichksum output carries the checksum value
            flowcells[k]['irods_checksum'] = stdout.split("\n")[1]
        except Exception:
            LOG.warn("command {} failed".format(" ".join(cl)))
        if not os.path.exists(fctar):
            continue
        else:
            LOG.debug("tarball exists: {}".format(fctar))
            statinfo = os.stat(fctar)
            flowcells[k]['tarball_size'] = float(int(statinfo.st_size) / 1e9)
    ## Format the status table
    output_data["stdout"].write("\nFlowcell archive status\n")
    output_data["stdout"].write("=======================\n")
    output_data["stdout"].write("\nThe table lists those flowcells still present in archive. The exict code for pbzip should be 0\nfor success. A non-existing tarball has size 0.\n\n")
    output_data["stdout"].write("{:<40}{:>12}{:>20}{:>60}\n".format("Flowcell", "pbzip_exit", "tarball_size (G)", 'irods_checksum'))
    output_data["stdout"].write("{:<40}{:>12}{:>20}{:>60}\n".format("--------", "----------", "----------------", '--------------'))
    for k in sorted(flowcells.keys()):
        if not flowcells[k]['in_archive']:
            continue
        output_data["stdout"].write("{:<40}{:>12}{:>20.2f}{:>60}\n".format(k, flowcells[k]['pbzip_exit'], flowcells[k]['tarball_size'], flowcells[k]['irods_checksum']))
    return output_data
def flowcell_remove_status(archive_dir, swestore_dir, to_remove="to_remove"):
    """This function looks for flowcells that could be deleted from
    archive and returns a list of flowcells with a KEEP/RM flag. The
    rules are

    1. the flowcell is in archive to_remove file
    2. pbzip ran without error
    3. the tarball filesize looks ok
    4. checksum irods is ok

    :param archive_dir: archive directory
    :param swestore_dir: base dir for swestore
    :param to_remove: to remove file name

    :returns: dict with 'stdout'/'stderr' StringIO buffers; stdout holds
        the formatted status table
    """
    output_data = {'stdout': StringIO(), 'stderr': StringIO()}
    ## Check for ils: if the irods tools are not on PATH, bail out early.
    # except Exception (not bare except) so KeyboardInterrupt/SystemExit
    # still propagate.
    try:
        proc = subprocess.Popen(["ils"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = proc.communicate()
        proc.wait()
        proc = subprocess.Popen(
            ["icd", os.path.basename(os.path.dirname(archive_dir))],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = proc.communicate()
        proc.wait()
    except Exception:
        LOG.warn("No such command 'ils': please load the irods module")
        return output_data
    ## make flowcell dictionary based on to_remove contents
    to_remove_file = os.path.join(archive_dir, to_remove)
    with open(to_remove_file) as fh:
        remove_list = fh.readlines()
    flowcells = {
        k.replace("./", "").rstrip(): {
            'in_archive': False,
            'pbzip_exit': 1,
            'tarball_size': 0,
            'irods_checksum': 1
        }
        for k in remove_list if k.rstrip() != ''
    }
    ## Look for compress logs and harvest the pbzip exit codes
    pattern = "slurm.*.out$"

    def compress_fn(f):
        return re.search(pattern, f) is not None

    compress_log_files = filtered_walk(
        os.path.join(archive_dir, "compress_logs"), compress_fn)
    for f in compress_log_files:
        with open(f) as fh:
            compress_str = "".join([x.strip() for x in fh.readlines()])
        m = re.search(
            "Compressing[ ]+([0-9A-Za-z_\-]+)\.\.\..*Exit code:[ ]+([0-9]+)",
            compress_str)
        if m:
            if not m.groups()[0] in flowcells.keys():
                LOG.warn("flowcell {} present in to_remove but not in archive".format(m.groups()[0]))
            else:
                flowcells[m.groups()[0]]['pbzip_exit'] = m.groups()[1]
        else:
            LOG.warn("{}: no match for {}".format(f, compress_str))
    ## Get tarball sizes and check if in archive
    ## Loop through flowcells and perform ichksum
    for k in flowcells.keys():
        LOG.debug(
            "Getting tarball size, archive presence and ichksum for {}".format(k))
        fcdir = os.path.join(archive_dir, k)
        if os.path.exists(fcdir):
            flowcells[k]['in_archive'] = True
        fctar = os.path.join(swestore_dir, "drophere2archive", "{}.tar.bz2".format(k))
        try:
            cl = ["ichksum", os.path.basename(fctar)]
            proc = subprocess.Popen(cl, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            (stdout, stderr) = proc.communicate()
            proc.wait()
            # Second line of ichksum output carries the checksum value
            flowcells[k]['irods_checksum'] = stdout.split("\n")[1]
        except Exception:
            LOG.warn("command {} failed".format(" ".join(cl)))
        if not os.path.exists(fctar):
            continue
        else:
            LOG.debug("tarball exists: {}".format(fctar))
            statinfo = os.stat(fctar)
            flowcells[k]['tarball_size'] = float(int(statinfo.st_size) / 1e9)
    ## Format the status table
    output_data["stdout"].write("\nFlowcell archive status\n")
    output_data["stdout"].write("=======================\n")
    output_data["stdout"].write(
        "\nThe table lists those flowcells still present in archive. The exict code for pbzip should be 0\nfor success. A non-existing tarball has size 0.\n\n"
    )
    output_data["stdout"].write("{:<40}{:>12}{:>20}{:>60}\n".format(
        "Flowcell", "pbzip_exit", "tarball_size (G)", 'irods_checksum'))
    output_data["stdout"].write("{:<40}{:>12}{:>20}{:>60}\n".format(
        "--------", "----------", "----------------", '--------------'))
    for k in sorted(flowcells.keys()):
        if not flowcells[k]['in_archive']:
            continue
        output_data["stdout"].write("{:<40}{:>12}{:>20.2f}{:>60}\n".format(
            k, flowcells[k]['pbzip_exit'], flowcells[k]['tarball_size'],
            flowcells[k]['irods_checksum']))
    return output_data
def setUpModule():
    """Set up test files for scilifelab pipeline tests. The setup
    covers some typical situations, such as multiplexing, samples run
    on several flowcells, and same sample being run on several lanes in
    one flowcell.

    In short, the setup

    - downloads data from 1000 genomes (exome data from chr11, 0-2Mb)
    - generates fastq files in an archive folder
    - installs genome references (phix, hg19)
    - downloads dbsnp data for chr11, 0-2Mb
    - runs run_bcbb_pipeline.py -s to install fastq files to production folder
    - runs automated_initial_analysis.py
    """
    pattern = "14_write_metrics.txt"

    def filter_fn(f):
        # NOTE: closure reads `pattern` at call time, so rebinding
        # `pattern` below changes what this filter matches.
        return re.search(pattern, f) != None

    # Count completed samples (those with the final metrics file)
    n = sum([len(filtered_walk(os.path.join(PROJECTDIR, x), filter_fn)) for x in PROJECTS])
    if n == NSAMPLES:
        LOG.info("All samples have been run, requirements for downstream tests satisfied")
        return
    LOG.info("Running setUpModule")
    _check_requirements()
    ## Add function to check existence of output files
    _install_1000g_test_files(os.path.join(os.path.dirname(__file__), "data", "production"))
    _install_phix()
    dbsnp = _install_dbsnp_entrez()
    (omni_out, hapmap_out, mills_out) = _install_training_data()
    _download_ucsc_genome_and_index()
    ## Install post_process file
    fh = open(POSTPROCESS, "w")
    fh.write(PPTEMPLATE.render(**{'store_dir':ARCHIVE, 'base_dir':PRODUCTION, 'dbsnp':dbsnp, 'omni':omni_out, 'hapmap':hapmap_out, 'mills':mills_out}))
    fh.close()
    ## Install index files
    for k, v in index_files.iteritems():
        if not os.path.exists(os.path.dirname(v['file'])):
            safe_makedir(os.path.dirname(v['file']))
        fh = open(v['file'], "w")
        fh.write(v['data'].getvalue())
        fh.close()
    ## Make production dir
    if not os.path.exists(PRODUCTION):
        safe_makedir(PRODUCTION)
    ## Install files in production with run_bcbb_pipeline.py
    for k in FLOWCELL.keys():
        install = False
        for ss in SAMPLESHEETS[k].split("\n"):
            vals = ss.split(",")
            # Skip the samplesheet header row
            if vals[0]=="FCID":
                continue
            outdir = os.path.join(PRODUCTION, "{}".format(vals[5].replace("__", ".")), "{}".format(vals[2]),
                                  "{}_{}".format(FLOWCELL[k].split("_")[0],FLOWCELL[k].split("_")[-1]))
            r1 = os.path.join(outdir, "{}_{}_L00{}_R1_001.fastq.gz".format(vals[2], vals[4], vals[1]))
            r2 = os.path.join(outdir, "{}_{}_L00{}_R2_001.fastq.gz".format(vals[2], vals[4], vals[1]))
            LOG.info("Looking for {} and {}".format(r1, r2))
            # A single missing read file triggers a (re)install for the flowcell
            if not os.path.exists(r1) or not os.path.exists(r2):
                install = True
                break
        if install:
            LOG.info("Installing files with run_bcbb_pipeline.py for flowcell {}".format(k))
            cl = ["run_bcbb_pipeline.py", "-s", "-g", POSTPROCESS, os.path.join(ARCHIVE, FLOWCELL[k])]
            subprocess.check_call(cl)
        else:
            LOG.info("All files present; not running run_bcbb_pipeline.py")
    ## Run pipeline on samples
    # Rebinding `pattern` repoints filter_fn at bcbb config files (see
    # NOTE in filter_fn above).
    pattern = "-bcbb-config.yaml$"
    yamlfiles = []
    ## http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
    ## [item for sublist in l for item in sublist]
    yamlfiles = [item for sublist in [filtered_walk(os.path.join(PROJECTDIR, x), filter_fn) for x in PROJECTS] for item in sublist]
    orig_dir = os.path.abspath(os.curdir)
    for yamlconfig in yamlfiles:
        # try/finally guarantees we always cd back, even when the
        # pipeline call raises
        try:
            LOG.info("cding to {}".format(os.path.abspath(os.curdir)))
            os.chdir(os.path.dirname(yamlconfig))
            LOG.info("cding to {}".format(os.path.dirname(yamlconfig)))
            cl = ["automated_initial_analysis.py", POSTPROCESS, os.path.join(os.path.pardir, os.path.basename(os.path.dirname(yamlconfig))), yamlconfig]
            # Only run the pipeline when the final metrics file is absent
            if not os.path.exists(os.path.join(os.path.dirname(yamlconfig), "14_write_metrics.txt")):
                LOG.info("Running pipeline: {}".format(" ".join(cl)))
                subprocess.check_call(cl)
        finally:
            os.chdir(orig_dir)
            LOG.info("Finished pipeline run and cd back to {}".format(orig_dir))