def update(self):
    if not self._check_pargs(["sample_prj"]):
        return
    url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return
    s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
    samples = s_con.get_samples(sample_prj=self.pargs.sample_prj)
    if self.pargs.project_id:
        self.app.log.debug("Going to update 'project_id' to {} for sample runs with 'sample_prj' == {}".format(self.pargs.project_id, self.pargs.sample_prj))
        for s in samples:
            if s.get("project_id", None) is not None:
                if not query_yes_no("'project_id':{} for sample {}; are you sure you want to overwrite?".format(s["project_id"], s["name"]), force=self.pargs.force):
                    continue
            s["project_id"] = self.pargs.project_id
            s_con.save(s)
    if self.pargs.names:
        self.app.log.debug("Going to update 'project_sample_name' for sample runs with 'sample_prj' == {}".format(self.pargs.sample_prj))
        if os.path.exists(self.pargs.names):
            with open(self.pargs.names) as fh:
                names_d = json.load(fh)
        else:
            names_d = ast.literal_eval(self.pargs.names)
        samples_sort = sorted(samples, key=lambda s: s["barcode_name"])
        groups = {}
        for k, g in itertools.groupby(samples_sort, key=lambda x: x["barcode_name"]):
            groups[k] = list(g)
        for barcode_name in names_d:
            sample_list = groups.get(barcode_name, None)
            if not sample_list:
                continue
            for s in sample_list:
                if s.get("project_sample_name", None) is not None:
                    if not query_yes_no("'project_sample_name':{} for sample {}; are you sure you want to overwrite?".format(s["project_sample_name"], s["name"]), force=self.pargs.force):
                        continue
                s["project_sample_name"] = names_d[barcode_name]
                s_con.save(s)
    else:
        self.app.log.info("Trying to use extensive matching...")
        p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
        project_name = self.pargs.sample_prj
        if self.pargs.project_alias:
            project_name = self.pargs.project_alias
        for s in samples:
            project_sample = p_con.get_project_sample(project_name, s["barcode_name"], extensive_matching=True)
            if project_sample:
                self.app.log.info("using mapping '{} : {}'...".format(s["barcode_name"], project_sample["sample_name"]))
                s["project_sample_name"] = project_sample["sample_name"]
                s_con.save(s)
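# Nearly every command in this module gates destructive actions behind
# query_yes_no(). The helper itself is not shown here; below is a minimal
# sketch of what it is assumed to look like, based only on the call sites
# above (a prompt with a default answer and a 'force' escape hatch that
# backs the --force flag). The exact implementation is an assumption.
import sys


def query_yes_no(question, default="yes", force=False):
    """Ask a yes/no question on stdin and return True/False.

    If force is True, skip the prompt and answer yes (hypothetical
    reconstruction matching calls like query_yes_no(msg, force=...)).
    """
    valid = {"yes": True, "y": True, "no": False, "n": False}
    if force:
        return True
    prompt = " [Y/n] " if default == "yes" else " [y/N] "
    while True:
        sys.stdout.write(question + prompt)
        choice = raw_input().lower()  # Python 2, matching the surrounding code
        if choice == "" and default is not None:
            return valid[default]
        if choice in valid:
            return valid[choice]
        sys.stdout.write("Please respond with 'yes' or 'no'.\n")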
def purge_alignments(self):
    """Cleanup sam and bam files. In some cases, sam files
    persist. If the corresponding bam file exists, replace the sam
    file contents with a message that the file has been removed to
    save space.
    """
    pattern = ".sam$"

    def purge_filter(f):
        if not pattern:
            return
        return re.search(pattern, f) is not None

    flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), purge_filter)
    if len(flist) == 0:
        self.app.log.info("No sam files found")
        return
    if len(flist) > 0 and not query_yes_no("Going to remove/cleanup {} sam files ({}...). Are you sure you want to continue?".format(len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
        return
    for f in flist:
        self.app.log.info("Purging sam file {}".format(f))
        self.app.cmd.safe_unlink(f)
        if os.path.exists(f.replace(".sam", ".bam")):
            self.app.cmd.write(f, "File removed to save disk space: SAM converted to BAM")
    ## Find bam files in alignments subfolders; purge_filter picks up the
    ## rebound pattern through the closure
    pattern = ".bam$"
    flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), purge_filter, include_dirs=["alignments"])
    for f in flist:
        f_tgt = [f.replace(".bam", "-sort.bam"), os.path.join(os.path.dirname(os.path.dirname(f)), os.path.basename(f))]
        for tgt in f_tgt:
            if os.path.exists(tgt):
                self.app.log.info("Purging bam file {}".format(f))
                self.app.cmd.safe_unlink(f)
                self.app.cmd.write(f, "File removed to save disk space: Moved to {}".format(os.path.abspath(tgt)))
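# purge_alignments() and most of the cleanup commands here collect paths with
# filtered_walk(). The helper is not part of this section; the sketch below
# is an assumption, with the signature and keyword names
# (include_dirs/exclude_dirs/get_dirs) inferred from the call sites above.
import os


def filtered_walk(path, filter_fn, include_dirs=None, exclude_dirs=None, get_dirs=False):
    """Walk path and return files for which filter_fn(f) is truthy.

    exclude_dirs prunes subtrees, include_dirs restricts which directories
    are visited; with get_dirs=True, return the visited directories instead.
    """
    flist = []
    dlist = []
    for root, dirs, files in os.walk(path):
        if exclude_dirs and os.path.basename(root) in exclude_dirs:
            dirs[:] = []  # prune this subtree
            continue
        if include_dirs and not any(d in root.split(os.sep) for d in include_dirs):
            continue
        dlist.append(root)
        flist.extend([os.path.join(root, f) for f in files if filter_fn(f)])
    return dlist if get_dirs else flist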
def touch_finished(self):
    if not self._check_pargs(["project", "sample"]):
        return
    if os.path.exists(self.pargs.sample) and os.path.isfile(self.pargs.sample):
        with open(self.pargs.sample) as fh:
            slist = [x.rstrip() for x in fh.readlines()]
    else:
        slist = [self.pargs.sample]
    for s in slist:
        spath = os.path.join(self._meta.root_path, self._meta.path_id, s)
        if not os.path.exists(spath):
            self.app.log.warn("No such path {}; skipping".format(spath))
            continue
        rsync_src = os.path.join(self._meta.root_path, self._meta.path_id, s) + os.sep
        rsync_tgt = os.path.join(self.app.config.get("runqc", "root"), self.pargs.project, s) + os.sep
        cl = ["rsync {} {} {}".format(self.app.config.get("runqc", "rsync_sample_opts"), rsync_src, rsync_tgt)]
        self.app.log.info("Checking if runqc up to date with command '{}'".format(" ".join(cl)))
        out = self.app.cmd.command(cl, **{'shell': True})
        # rsync --stats reports "total size is 0" when nothing needs transferring
        if not self.pargs.dry_run and out.find("total size is 0") == -1:
            self.app.log.info("Some files need to be updated. Rsync output:")
            print "********"
            print out
            print "********"
            continue
        if not query_yes_no("Going to touch file {} for sample {}; continue?".format(FINISHED_FILE, s), force=self.pargs.force):
            continue
        self.app.log.info("Touching file {} for sample {}".format(FINISHED_FILE, s))
        with open(os.path.join(spath, FINISHED_FILE), "w") as fh:
            t_utc = utc_time()
            fh.write(t_utc)
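# touch_finished() stamps FINISHED_FILE with utc_time(). Only the name and
# the fact that it returns a writable timestamp string are given by the code
# above; a plausible one-liner (an assumption) would be:
from datetime import datetime


def utc_time():
    """Return the current UTC time as an ISO 8601 string."""
    return datetime.utcnow().isoformat() + 'Z'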
def remove_finished(self):
    if not self._check_pargs(["project"]):
        return
    # Don't filter out files
    def filter_fn(f):
        return True
    slist = os.listdir(os.path.join(self._meta.root_path, self._meta.path_id))
    for s in slist:
        spath = os.path.join(self._meta.root_path, self._meta.path_id, s)
        if not os.path.isdir(spath):
            continue
        if not os.path.exists(os.path.join(spath, FINISHED_FILE)):
            self.app.log.info("Sample {} not finished; skipping".format(s))
            continue
        flist = filtered_walk(spath, filter_fn)
        dlist = filtered_walk(spath, filter_fn, get_dirs=True)
        if os.path.exists(os.path.join(spath, REMOVED_FILE)):
            self.app.log.info("Sample {} already removed; skipping".format(s))
            continue
        if len(flist) > 0 and not query_yes_no("Will remove directory {} containing {} files; continue?".format(s, len(flist)), force=self.pargs.force):
            continue
        self.app.log.info("Removing {} files from {}".format(len(flist), spath))
        for f in flist:
            if f == os.path.join(spath, FINISHED_FILE):
                continue
            self.app.cmd.safe_unlink(f)
        self.app.log.info("Removing {} directories from {}".format(len(dlist), spath))
        for d in sorted(dlist, reverse=True):
            self.app.cmd.safe_rmdir(d)
        if not self.pargs.dry_run:
            with open(os.path.join(spath, REMOVED_FILE), "w") as fh:
                t_utc = utc_time()
                fh.write(t_utc)
def remove_files(f, **kw):
    ## Remove old files if requested
    keep_files = ["-post_process.yaml$",
                  "-post_process.yaml.bak$",
                  "-bcbb-config.yaml$",
                  "-bcbb-config.yaml.bak$",
                  "-bcbb-command.txt$",
                  "-bcbb-command.txt.bak$",
                  "_[0-9]+.fastq$",
                  "_[0-9]+.fastq.gz$",
                  "_[0-9]+_fastq.txt.gz$",
                  "_[0-9]+_fastq.txt$",
                  "^[0-9][0-9]_.*.txt$",
                  "JOBID", "PID"]
    pattern = "|".join(keep_files)

    def remove_filter_fn(f):
        return re.search(pattern, f) is None

    workdir = os.path.dirname(f)
    remove_files = filtered_walk(workdir, remove_filter_fn)
    remove_dirs = filtered_walk(workdir, remove_filter_fn, get_dirs=True)
    if len(remove_files) > 0 and query_yes_no("Going to remove {} files and {} directories... Are you sure you want to continue?".format(len(remove_files), len(remove_dirs)), force=kw['force']):
        for x in remove_files:
            dry_unlink(x, dry_run=kw['dry_run'])
        ## Sort directories by length so we don't accidentally try to remove a non-empty dir
        for x in sorted(remove_dirs, key=len, reverse=True):
            dry_rmdir(x, dry_run=kw['dry_run'])
def best_practice(self):
    if not self._check_pargs(["project", "uppmax_project"]):
        return
    project_path = os.path.normpath(os.path.join("/proj", self.pargs.uppmax_project))
    if not os.path.exists(project_path):
        self.log.warn("No such project {}; skipping".format(self.pargs.uppmax_project))
        return
    if self.pargs.outdir:
        outpath = os.path.join(project_path, "INBOX", self.pargs.outdir)
    else:
        outpath = os.path.join(project_path, "INBOX", self.pargs.statusdb_project_name) if self.pargs.statusdb_project_name else os.path.join(project_path, "INBOX", self.pargs.project)
    if not query_yes_no("Going to deliver data to {}; continue?".format(outpath)):
        return
    if not os.path.exists(outpath):
        self.app.cmd.safe_makedir(outpath)
    kw = vars(self.pargs)
    basedir = os.path.abspath(os.path.join(self._meta.root_path, self._meta.path_id))
    flist = find_samples(basedir, **vars(self.pargs))
    if not len(flist) > 0:
        self.log.info("No samples/sample configuration files found")
        return

    def filter_fn(f):
        if not pattern:
            return
        return re.search(pattern, f) is not None

    # Setup pattern
    plist = [".*.yaml$", ".*.metrics$"]
    if not self.pargs.no_bam:
        plist.append(".*-{}.bam$".format(self.pargs.bam_file_type))
        plist.append(".*-{}.bam.bai$".format(self.pargs.bam_file_type))
    if not self.pargs.no_vcf:
        plist.append(".*.vcf$")
        plist.append(".*.vcf.gz$")
        plist.append(".*.tbi$")
        plist.append(".*.tsv$")
    pattern = "|".join(plist)
    size = 0
    for f in flist:
        path = os.path.dirname(f)
        sources = filtered_walk(path, filter_fn=filter_fn, exclude_dirs=BCBIO_EXCLUDE_DIRS)
        targets = [src.replace(basedir, outpath) for src in sources]
        self._transfer_files(sources, targets)
        if self.pargs.size:
            statinfo = [os.stat(src).st_size for src in sources]
            size = size + sum(statinfo)
    if self.pargs.size:
        self.app._output_data['stderr'].write("\n********************************\nEstimated delivery size: {:.1f}G\n********************************".format(size / 1e9))
def hs_metrics(self): if not self._check_pargs(["project", "region_file"]): return if not self.pargs.bait_file: self.pargs.bait_file = self.pargs.region_file self.log.info("hs_metrics: This is a temporary solution for calculating hs metrics for samples using picard tools") pattern = "{}.bam$".format(self.pargs.hs_file_type) def filter_fn(f): return re.search(pattern, f) != None ### FIX ME: this isn't caught by _process_args path = self.pargs.flowcell if self.pargs.flowcell else self.pargs.project flist = filtered_walk(os.path.join(self.config.get("production", "root"), path), filter_fn=filter_fn, exclude_dirs=['nophix', 'alignments', 'fastqc', 'fastq_screen']) if self.pargs.input_file: flist = [os.path.abspath(self.pargs.input_file)] if not query_yes_no("Going to run hs_metrics on {} files. Are you sure you want to continue?".format(len(flist)), force=self.pargs.force): return for f in flist: self.log.info("running CalculateHsMetrics on {}".format(f)) ### Issue with calling java from ### subprocess:http://stackoverflow.com/questions/9795249/issues-with-wrapping-java-program-with-pythons-subprocess-module ### Actually not an issue: command line arguments have to be done the right way cl = ["java"] + ["-{}".format(self.pargs.java_opts)] + ["-jar", "{}/CalculateHsMetrics.jar".format(os.getenv("PICARD_HOME"))] + ["INPUT={}".format(f)] + ["TARGET_INTERVALS={}".format(os.path.abspath(self.pargs.region_file))] + ["BAIT_INTERVALS={}".format(os.path.abspath(self.pargs.bait_file))] + ["OUTPUT={}".format(f.replace(".bam", ".hs_metrics"))] + ["VALIDATION_STRINGENCY=SILENT"] out = self.app.cmd.command(cl) if out: self.app._output_data["stdout"].write(out.rstrip())
def hs_metrics(self): if not self._check_pargs(["project", "targets"]): return if not self.pargs.baits: self.pargs.baits = self.pargs.targets self.log.info("hs_metrics: This is a temporary solution for calculating hs metrics for samples using picard tools") pattern = "{}.bam$".format(self.pargs.hs_file_type) def filter_fn(f): return re.search(pattern, f) != None ### FIX ME: this isn't caught by _process_args flist = [] path = self.pargs.flowcell if self.pargs.flowcell else self.pargs.project basedir = os.path.abspath(os.path.join(self.app.controller._meta.root_path, self.app.controller._meta.path_id)) samples = find_samples(basedir, **vars(self.pargs)) inc_dirs = [os.path.dirname(x) for x in samples] flist = filtered_walk(os.path.join(self.config.get(self.app.controller._meta.label, "root"), path), filter_fn=filter_fn, exclude_dirs=['nophix', 'alignments', 'fastqc', 'fastq_screen'], include_dirs=inc_dirs) if not query_yes_no("Going to run hs_metrics on {} files. Are you sure you want to continue?".format(len(flist)), force=self.pargs.force): return for f in flist: self.log.info("running CalculateHsMetrics on {}".format(f)) ### Issue with calling java from ### subprocess:http://stackoverflow.com/questions/9795249/issues-with-wrapping-java-program-with-pythons-subprocess-module ### Actually not an issue: command line arguments have to be done the right way cl = ["java"] + ["-{}".format(self.pargs.java_opts)] + ["-jar", "{}/CalculateHsMetrics.jar".format(os.getenv("PICARD_HOME"))] + ["INPUT={}".format(f)] + ["TARGET_INTERVALS={}".format(os.path.abspath(self.pargs.targets))] + ["BAIT_INTERVALS={}".format(os.path.abspath(self.pargs.baits))] + ["OUTPUT={}".format(f.replace(".bam", ".hs_metrics"))] + ["VALIDATION_STRINGENCY=SILENT"] out = self.app.cmd.command(cl) if out: self.app._output_data["stdout"].write(out.rstrip())
def rm_tarball(arch, tarball):
    """Remove a tarball"""
    if not query_yes_no("Going to remove tarball {}. This action can not be undone. Are you sure you want to continue?".format(tarball), force=arch.pargs.force):
        return
    arch.log.info("removing {}".format(tarball))
    arch.app.cmd.safe_unlink(tarball)
def rm_run(arch, root, flowcell=None):
    """Remove a flowcell folder from the root folder"""
    path = os.path.join(root, flowcell)
    if not query_yes_no("Going to remove flowcell folder {}. This action can not be undone. Are you sure you want to continue?".format(path), force=arch.pargs.force):
        return
    arch.log.info("removing {}".format(path))
    arch.app.cmd.rmtree(path)
def _return_extensive_match_result(name_map, barcode_name, force=False):
    """Wrap return value for extensive matching"""
    if query_yes_no("found mapping '{} : {}' (barcode_name:project_sample_name); do you want to use this project_sample_name?".format(barcode_name, name_map["sample_name"]), default="no", force=force):
        return name_map
    else:
        return None
def run(self): if not self._check_pargs(["project"]): return if self.pargs.post_process: self.pargs.post_process = os.path.abspath(self.pargs.post_process) basedir = os.path.abspath(os.path.join(self.app.controller._meta.root_path, self.app.controller._meta.path_id)) if self.pargs.from_ssheet: [ samplesheet_csv_to_yaml(fn) for fn in find_samples(basedir, pattern="SampleSheet.csv$", **vars(self.pargs)) ] flist = find_samples(basedir, **vars(self.pargs)) # Add filtering on flowcell if necessary self._meta.pattern = ".*" flist = [x for x in flist if self._filter_fn(x)] if self.pargs.merged: ## Setup merged samples and append to flist if new list longer flist = setup_merged_samples(flist, **vars(self.pargs)) if not len(flist) > 0: self.log.info("No sample configuration files found") return if len(flist) > 0 and not query_yes_no( "Going to start {} jobs... Are you sure you want to continue?".format(len(flist)), force=self.pargs.force ): return # Make absolutely sure analysis directory is a *subdirectory* of the working directory validate_sample_directories(flist, basedir) orig_dir = os.path.abspath(os.getcwd()) for run_info in flist: os.chdir(os.path.abspath(os.path.dirname(run_info))) setup_sample(run_info, **vars(self.pargs)) os.chdir(orig_dir) if self.pargs.only_setup: return if self.pargs.only_failed: status = {x: self._sample_status(x) for x in flist} flist = [x for x in flist if self._sample_status(x) == "FAIL"] ## Here process files again, removing if requested, and running the pipeline for run_info in flist: self.app.log.info("Running analysis defined by config file {}".format(run_info)) os.chdir(os.path.abspath(os.path.dirname(run_info))) if self.app.cmd.monitor(work_dir=os.path.dirname(run_info)): self.app.log.warn("Not running job") continue if self.pargs.restart: self.app.log.info("Removing old analysis files in {}".format(os.path.dirname(run_info))) remove_files(run_info, **vars(self.pargs)) (cl, platform_args) = run_bcbb_command(run_info, **vars(self.pargs)) self.app.cmd.command( cl, **{"platform_args": platform_args, "saveJobId": True, "workingDirectory": os.path.dirname(run_info)} ) os.chdir(orig_dir)
def clean(self):
    if not self._check_pargs(["project"]):
        return
    self._meta.pattern = "|".join(["{}(.gz|.bz2)?$".format(x) for x in self._meta.file_ext])
    flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), self._filter_fn, include_dirs=self._meta.include_dirs)
    if len(flist) == 0:
        self.app.log.info("No files matching pattern '{}' found".format(self._meta.pattern))
        return
    if len(flist) > 0 and not query_yes_no("Going to remove {} files ({}...). Are you sure you want to continue?".format(len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
        return
    for f in flist:
        self.app.log.info("removing {}".format(f))
        self.app.cmd.safe_unlink(f)
def _compress(self, label="compress"): if self.pargs.input_file: flist = [self.pargs.input_file] else: flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), self._filter_fn) if len(flist) == 0: self.app.log.info("No files matching pattern '{}' found".format(self._meta.pattern)) return if len(flist) > 0 and not query_yes_no("Going to {} {} files ({}...). Are you sure you want to continue?".format(label, len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force): sys.exit() for f in flist: self.log.info("{}ing {}".format(label, f)) self.app.cmd.command([self._meta.compress_prog, self._meta.compress_opt, "%s" % f], label, ignore_error=True, **{'workingDirectory':os.path.dirname(f), 'outputPath':os.path.join(os.path.dirname(f), "{}-{}-drmaa.log".format(label, os.path.basename(f)))})
def best_practice(self):
    if not self._check_pargs(["project", "uppmax_project"]):
        return
    project_path = os.path.normpath(os.path.join("/proj", self.pargs.uppmax_project))
    if not os.path.exists(project_path):
        self.log.warn("No such project {}; skipping".format(self.pargs.uppmax_project))
        return
    if self.pargs.outdir:
        outpath = os.path.join(project_path, "INBOX", self.pargs.outdir)
    else:
        outpath = os.path.join(project_path, "INBOX", self.pargs.statusdb_project_name) if self.pargs.statusdb_project_name else os.path.join(project_path, "INBOX", self.pargs.project)
    if not query_yes_no("Going to deliver data to {}; continue?".format(outpath)):
        return
    if not os.path.exists(outpath):
        self.app.cmd.safe_makedir(outpath)
    kw = vars(self.pargs)
    basedir = os.path.abspath(os.path.join(self._meta.root_path, self._meta.path_id))
    flist = find_samples(basedir, **vars(self.pargs))
    if self.pargs.flowcell:
        flist = [fl for fl in flist if os.path.basename(os.path.dirname(fl)) == self.pargs.flowcell]
    if not len(flist) > 0:
        self.log.info("No samples/sample configuration files found")
        return

    def filter_fn(f):
        if not pattern:
            return
        return re.search(pattern, f) is not None

    # Setup pattern
    plist = [".*.yaml$", ".*.metrics$"]
    if not self.pargs.no_bam:
        plist.append(".*-{}.bam$".format(self.pargs.bam_file_type))
        plist.append(".*-{}.bam.bai$".format(self.pargs.bam_file_type))
    if not self.pargs.no_vcf:
        plist.append(".*.vcf$")
        plist.append(".*.vcf.gz$")
        plist.append(".*.tbi$")
        plist.append(".*.tsv$")
    pattern = "|".join(plist)
    size = 0
    for f in flist:
        path = os.path.dirname(f)
        sources = filtered_walk(path, filter_fn=filter_fn, exclude_dirs=BCBIO_EXCLUDE_DIRS)
        targets = [src.replace(basedir, outpath) for src in sources]
        self._transfer_files(sources, targets)
        if self.pargs.size:
            statinfo = [os.stat(src).st_size for src in sources]
            size = size + sum(statinfo)
    if self.pargs.size:
        self.app._output_data['stderr'].write("\n********************************\nEstimated delivery size: {:.1f}G\n********************************".format(size / 1e9))
def rm(self):
    if not self._check_pargs(["project", "analysis_id"]):
        return
    indir = os.path.join(self.app.controller._meta.project_root, self.app.controller._meta.path_id, self.pargs.analysis_id)
    assert os.path.exists(indir), "No such analysis {} for project {}".format(self.pargs.analysis_id, self.pargs.project)
    try:
        flist = walk(indir)
    except IOError as e:
        self.app.log.warn(str(e))
        raise e
    if len(flist) > 0 and not query_yes_no("Going to remove all contents ({} files) of analysis {} for project {}... Are you sure you want to continue?".format(len(flist), self.pargs.analysis_id, self.pargs.project), force=self.pargs.force):
        return
    for f in flist:
        self.app.cmd.safe_unlink(f)
    self.app.log.info("removing {}".format(indir))
    self.app.cmd.safe_rmdir(indir)
def clean(self):
    pattern = "|".join(["{}(.gz|.bz2)?$".format(x) for x in self._meta.file_pat])

    def clean_filter(f):
        if not pattern:
            return
        return re.search(pattern, f) is not None

    flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), clean_filter, include_dirs=self._meta.include_dirs)
    if len(flist) == 0:
        self.app.log.info("No files matching pattern {} found".format(pattern))
        return
    if len(flist) > 0 and not query_yes_no("Going to remove {} files ({}...). Are you sure you want to continue?".format(len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
        return
    for f in flist:
        self.app.log.info("removing {}".format(f))
        self.app.cmd.safe_unlink(f)
def run(self): if not self._check_pargs(["project", "post_process", "analysis_type"]): return ## Gather sample yaml files pattern = "-bcbb-config.yaml$" flist = [] if self.pargs.sample: if os.path.exists(self.pargs.sample): with open(self.pargs.sample) as fh: flist = [x.rstrip() for x in fh.readlines()] else: pattern = "{}{}".format(self.pargs.sample, pattern) def bcbb_yaml_filter(f): return re.search(pattern, f) != None if not flist: flist = filtered_walk(os.path.join(self.app.controller._meta.project_root, self.pargs.project, "data"), bcbb_yaml_filter) if self.pargs.only_failed: status = {x:self._sample_status(x) for x in flist} flist = [x for x in flist if self._sample_status(x)=="FAIL"] if len(flist) == 0 and self.pargs.sample: self.app.log.info("No such sample {}".format(self.pargs.sample)) if len(flist) > 0 and not query_yes_no("Going to start {} jobs... Are you sure you want to continue?".format(len(flist)), force=self.pargs.force): return for f in flist: with open(f) as fh: config = yaml.load(fh) if self.pargs.analysis_type: config["details"][0]["multiplex"][0]["analysis"] = self.pargs.analysis_type config["details"][0]["analysis"] = self.pargs.analysis_type if config["details"][0]["genome_build"] == 'unknown': config["details"][0]["genome_build"] = self.pargs.genome_build ## Check if files exist: if they don't, then change the suffix config["details"][0]["multiplex"][0]["files"].sort() if not os.path.exists(config["details"][0]["multiplex"][0]["files"][0]): if os.path.splitext(config["details"][0]["multiplex"][0]["files"][0])[1] == ".gz": config["details"][0]["multiplex"][0]["files"] = [x.replace(".gz", "") for x in config["details"][0]["multiplex"][0]["files"]] else: config["details"][0]["multiplex"][0]["files"] = ["{}.gz".format(x) for x in config["details"][0]["multiplex"][0]["files"]] config_file = f.replace("-bcbb-config.yaml", "-pm-bcbb-analysis-config.yaml") self.app.cmd.write(config_file, yaml.dump(config)) ## Run automated_initial_analysis.py cur_dir = os.getcwd() new_dir = os.path.abspath(os.path.dirname(f)) os.chdir(new_dir) self.app.cmd.command(['automated_initial_analysis.py', os.path.abspath(self.pargs.post_process), new_dir, config_file]) os.chdir(cur_dir)
def run_halo(self):
    if self.app.pargs.setup:
        if not self._check_pargs(["project", "baits", "targets", "target_region"]):
            return
    else:
        if not self._check_pargs(["project"]):
            return
    basedir = os.path.abspath(os.path.join(self.app.controller._meta.root_path, self.app.controller._meta.path_id))
    self.app.log.info("Going to look for samples in {}".format(basedir))
    param_list = run_halo(path=basedir, **vars(self.pargs))
    if self.app.pargs.setup:
        self.app.log.info("Setup configuration files. Rerun command without '--setup' option to run analysis")
        return
    if not len(param_list) > 0:
        self.log.info("No samples found in {}; perhaps you need to add the '--data' option to look in the {} directory".format(self.app.pargs.project, os.path.join(self.app.pargs.project, "data")))
    if len(param_list) > 0 and not query_yes_no("Going to start {} jobs... Are you sure you want to continue?".format(len(param_list)), force=self.pargs.force):
        return
    for param in param_list:
        self.app.cmd.command(param['cl'], **param)
def _compress(self, pattern, label="compress"): def compress_filter(f): if not pattern: return return re.search(pattern, f) != None if self.pargs.input_file: flist = [self.pargs.input_file] else: flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), compress_filter) if len(flist) == 0: self.app.log.info("No files matching pattern {} found".format(pattern)) return if len(flist) > 0 and not query_yes_no("Going to {} {} files ({}...). Are you sure you want to continue?".format(label, len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force): sys.exit() for f in flist: self.log.info("{}ing {}".format(label, f)) self.app.cmd.command([self._meta.compress_prog, self._meta.compress_opt, "%s" % f], label, ignore_error=True)
def raw_data(self):
    if not self._check_pargs(["project"]):
        return
    # if necessary, reformat flowcell identifier
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]
    # get the uid and gid to use for destination files
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group is not None and len(self.pargs.group) > 0:
        gid = grp.getgrnam(self.pargs.group).gr_gid
    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project database"
    self.log.debug("Connecting to flowcell database")
    f_con = FlowcellRunMetricsConnection(**vars(self.pargs))
    assert f_con, "Could not get connection to flowcell database"
    self.log.debug("Connecting to x_flowcell database")
    x_con = X_FlowcellRunMetricsConnection(**vars(self.pargs))
    assert x_con, "Could not get connection to x_flowcell database"
    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(self.pargs.project, "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error("Uppmax project was not specified and could not be fetched from project database")
            return
    # Setup paths and verify parameters
    self._meta.production_root = self.pargs.root if self.pargs.root else self.app.config.get("production", "root")
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(self._meta.production_root), "No such directory {}; check your production config".format(self._meta.production_root)
    assert os.path.exists(proj_base_dir), "No project {} in production path {}".format(self.pargs.project, self._meta.root_path)
    try:
        self._meta.uppnex_project_root = self.app.config.get("deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn("{}, will use '/proj' as uppnex_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'
    try:
        self._meta.uppnex_delivery_dir = self.app.config.get("deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn("{}, will use 'INBOX' as uppnex_project_delivery_path".format(e))
        self._meta.uppnex_delivery_dir = 'INBOX'
    destination_root = os.path.join(self._meta.uppnex_project_root, self.pargs.uppmax_project, self._meta.uppnex_delivery_dir)
    assert os.path.exists(destination_root), "Delivery destination folder {} does not exist".format(destination_root)
    destination_root = os.path.join(destination_root, self.pargs.project)
    # Find uncompressed fastq
    uncompressed = self._find_uncompressed_fastq_files(proj_base_dir=proj_base_dir, sample=self.pargs.sample, flowcell=self.pargs.flowcell)
    if len(uncompressed) > 0:
        self.log.error("There are uncompressed fastq files for the project; please check that all files are compressed properly before delivery")
        return
    # Extract the list of samples and runs associated with the project and sort them
    samples = self.samples_to_copy(
        pid=p_con.get_entry(self.pargs.project, "project_id"),
        pod=p_con.get_entry(self.pargs.project, "open_date"),
        fc_dict={'HiSeq2500': f_con.proj_list, 'HiSeqX': x_con.proj_list},
        proj_base_dir=proj_base_dir,
        destination_root=destination_root,
        sample=self.pargs.sample,
        flowcell=self.pargs.flowcell)
    # If selecting interactively, build a list of samples to deliver
    if self.pargs.interactive:
        to_process = {}
        for sample in samples:
            if query_yes_no("Deliver sample {} ?".format(sample), default="no"):
                to_process[sample] = samples[sample]
        samples = to_process
    if self.pargs.sample:
        sample = samples.get(self.pargs.sample)
        if not sample:
            self.log.error("There is no such sample {} for project {}".format(self.pargs.sample, self.pargs.project))
            return
        samples = {self.pargs.sample: sample}
    self.log.info("Will deliver data for {} samples from project {} to {}".format(len(samples), self.pargs.project, destination_root))
    if not query_yes_no("Continue?"):
        return
    # Make sure that transfer will be with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no("Do you wish to continue delivering using rsync?", default="yes"):
            return
        self.pargs.rsync = True
    # Process each sample
    for sample, flowcells in samples.iteritems():
        for fc, files in flowcells.iteritems():
            self.log.info("Processing sample {} and flowcell {}".format(sample, fc))
            # transfer files
            self.log.debug("Transferring {} fastq files".format(len(files['src'])))
            self._transfer_files(sources=files['src'], targets=files['dst'])
            passed = True
            if self.pargs.link or self.pargs.dry_run:
                passed = False
            else:
                # calculate md5sums on the source side and write them on the destination
                md5 = []
                for s, d in zip(files['src'], files['dst']):
                    m = md5sum(s)
                    mfile = "{}.md5".format(d)
                    md5.append([m, mfile, s])
                    self.log.debug("md5sum for source file {}: {}".format(s, m))
                # write the md5sum to a file at the destination and verify the transfer
                for m, mfile, srcpath in md5:
                    dstfile = os.path.splitext(mfile)[0]
                    self.log.debug("Writing md5sum to file {}".format(mfile))
                    self.app.cmd.write(mfile, "{} {}".format(m, os.path.basename(dstfile)), True)
                    self.log.debug("Verifying md5sum for file {}".format(dstfile))
                    dm = md5sum(dstfile)
                    self.log.debug("md5sum for destination file {}: {}".format(dstfile, dm))
                    if m != dm:
                        self.log.warn("md5sum verification FAILED for {}. Source: {}, Target: {}".format(dstfile, m, dm))
                        self.log.warn("Improperly transferred file {} is removed from destination, please retry transfer of this file".format(dstfile))
                        self.app.cmd.safe_unlink(dstfile)
                        self.app.cmd.safe_unlink(mfile)
                        passed = False
                        continue
                    # Modify the permissions to ug+rw
                    for f in [dstfile, mfile]:
                        self.app.cmd.chmod(f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)
            # touch the flag to trigger uppmax inbox permission fix
            self.app.cmd.safe_touchfile(os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule", self.pargs.uppmax_project))
            # log the transfer to statusdb if verification passed
            if passed:
                data = {'raw_data_delivery': {
                    'timestamp': utc_time(),
                    'files': {os.path.splitext((os.path.basename(srcpath)))[0]: {
                        'md5': m,
                        'path': os.path.splitext(mfile)[0],
                        'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                        'source_location': srcpath} for m, mfile, srcpath in md5},
                }}
                jsonstr = json.dumps(data)
                jsonfile = os.path.join(proj_base_dir, sample, fc, "{}_{}_raw_data_delivery.json".format(sample, fc))
                self.log.debug("Writing delivery to json file {}".format(jsonfile))
                self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
                if self.proj_flowcells[fc]['type'] == 'HiSeqX':
                    fc_con = x_con
                else:
                    fc_con = f_con
                fc_obj = fc_con.get_entry(fc)
                self.log.info("Logging delivery to StatusDB document {}".format(fc_obj.get('_id')))
                fc_raw_data = fc_obj.get('raw_data_delivery', {})
                fc_raw_data.update(data['raw_data_delivery'])
                fc_obj['raw_data_delivery'] = fc_raw_data
                self._save(fc_con, fc_obj)
                self.log.debug(jsonstr)
def main():
    parser = argparse.ArgumentParser(description="A script to help doing the deliveries, now using the Casava directory structure. "
                                     "The user is asked to provide a project ID, a run name, and an UPPMAX project")
    parser.add_argument('-c', '--casava-path', action="store", dest="caspath", default='/proj/a2010002/nobackup/illumina/',
                        help="Specify a path to a Casava directory manually")
    parser.add_argument('-l', '--log-path', action="store", dest="logpath", default='/proj/a2010002/private/delivery_logs',
                        help="Specify a path to a log file")
    parser.add_argument('-i', '--interactive', action="store_true", dest="interactive", default=False,
                        help="Interactively select samples to be delivered")
    parser.add_argument('-d', '--dry-run', action="store_true", dest="dry", default=False,
                        help="Dry run: nothing will be done")
    parser.add_argument('-a', '--deliver-all-fcs', action="store_true", dest="deliver_all_fcs", default=False,
                        help="rsync samples from all flow cells. Default is to only deliver from specified flowcell")
    parser.add_argument('-p', '--nophix', action="store_true", dest="deliver_nophix", default=False,
                        help="Deliver fastq files from nophix subdirectory. Default is to deliver from run directory")
    parser.add_argument('-g', '--group', action="store", dest="group", default="uppmax",
                        help="Group membership to set on copied files")
    parser.add_argument('project_name', action='store', help="Project name to deliver, e.g. J.Doe_10_01")
    parser.add_argument('flowcell_id', action='store', help="Flowcell id to deliver, e.g. 120824_BD1915ACXX")
    parser.add_argument('uppmax_id', action='store', help="UPPMAX project id to deliver to, e.g. b2012001")
    args = parser.parse_args()
    if args.project_name not in os.listdir(args.caspath):
        print("Could not find project. Check directory listing:")
        for f in os.listdir(args.caspath):
            print(f)
        clean_exit(0, None, args.dry)
    fcid = args.flowcell_id
    fcid_comp = fcid.split('_')
    if len(fcid_comp) > 2:
        fcid = fcid_comp[0] + '_' + fcid_comp[-1]
        print("FCID format too long, trying {:s}".format(fcid))
    dt = datetime.now()
    time_str = "_".join([str(dt.year), str(dt.month), str(dt.day), str(dt.hour), str(dt.minute), str(dt.second)])
    logfilename = os.path.join(os.path.normpath(args.logpath), "{:s}.log".format(time_str))
    if not args.dry:
        logfile = open(logfilename, "w")
    else:
        logfile = sys.stdout
    logfile.write("[{:s}] - Project to move files for:\n{:s}\n".format(utc_time(), args.project_name))
    logfile.flush()
    proj_base_dir = os.path.join(args.caspath, args.project_name)
    skip_list = []
    if args.interactive:
        for sample_dir in os.listdir(proj_base_dir):
            if not os.path.isdir(os.path.join(proj_base_dir, sample_dir)):
                continue
            if not query_yes_no("Deliver sample {:s}?".format(sample_dir), default="no"):
                skip_list.append(sample_dir)
    created_proj_dir_name = fixProjName(args.project_name)
    del_path_top = '/proj/' + args.uppmax_id + "/INBOX/" + created_proj_dir_name
    to_copy = get_file_copy_list(proj_base_dir, del_path_top, fcid, args.deliver_all_fcs, args.deliver_nophix, skip_list)
    # Prompt user if any of the files are non-compressed
    for fqfile, _, _ in to_copy:
        if os.path.splitext(fqfile)[1] == ".gz":
            continue
        print("WARNING: The file {:s}, which you are about to deliver, does not seem to be compressed. "
              "It is recommended that you compress files prior to delivery.".format(fqfile))
        if query_yes_no("Do you wish to continue delivering uncompressed fastq files?", default="yes"):
            break
        clean_exit(1, logfile, args.dry)
    rsync_files(to_copy, logfile, args.group, args.dry)
    clean_exit(0, logfile, args.dry)
def purge_alignments(path, ftype="sam", keep="last", dry_run=False, force=False, fsize=MINFILESIZE): """Cleanup sam and bam files. In some cases, sam files persist. If the corresponding bam file exists, replace the sam file contents with a message that the file has been removed to save space. In general, several bam files are produced in an analysis. By grouping bam files by prefix, either the most recent file is retained for further reference, or a specific analysis is kept. """ if ftype == "sam": pattern = ".sam$" elif ftype == "bam": pattern = ".bam$" else: LOG.warn("ftype must be one of 'sam' or 'bam'") return LOG.debug( "running purge_alignments in path {} with pattern {} keep rule {}". format(path, pattern, keep)) def purge_filter(f): if not pattern: return return re.search(pattern, f) != None flist = filtered_walk(path, purge_filter, exclude_dirs=["realign-split"]) if len(flist) == 0: LOG.info("No {} files found in {}".format(ftype, path)) return if len(flist) > 0 and not query_yes_no( "Going to remove/cleanup {} {} files ({}...). Are you sure you want to continue?" .format(len(flist), ftype, ",".join( [os.path.basename(x) for x in flist[0:10]])), force=force): return if ftype == "sam": for f in flist: LOG.info("Purging {} file {}".format(ftype, f)) dry_unlink(f, dry_run) if os.path.exists(f.replace(".sam", ".bam")): dry_write( f, "File removed to save disk space: SAM converted to BAM", dry_run) return elif ftype == "bam": samples = {} for f in flist: m = re.search("([0-9A-Za-z\_]+)-.*", os.path.basename(f)) if not m: LOG.debug("Couldn't determine prefix for {}".format(f)) continue sid = m.groups()[0] if not sid in samples.keys(): samples[sid] = {} dname = os.path.dirname(f) if not dname in samples[sid].keys(): samples[sid][dname] = [] samples[sid][dname].append(f) saved_size = 0 for k in samples.iterkeys(): for d, files in samples[k].iteritems(): if not files or len(files) == 1: continue files.sort(lambda x, y: cmp(len(x), len(y))) if keep == "last": LOG.info( "Keeping file {} and removing all files with common prefix: {}" .format( os.path.basename(files[len(files) - 1]), ", ".join( [os.path.basename(x) for x in files[0:-1]]))) saved_size = _purge_by_sample(files, dry_run, int(fsize)) + saved_size LOG.info("Will save approximately {:.1f}G space".format(saved_size / 1e9))
def raw_data(self):
    if not self._check_pargs(["project"]):
        return
    # if necessary, reformat flowcell identifier
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]
    # get the uid and gid to use for destination files
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group is not None and len(self.pargs.group) > 0:
        gid = grp.getgrnam(self.pargs.group).gr_gid
    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project database"
    self.log.debug("Connecting to samples database")
    s_con = SampleRunMetricsConnection(**vars(self.pargs))
    assert s_con, "Could not get connection to samples database"
    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(self.pargs.project, "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error("Uppmax project was not specified and could not be fetched from project database")
            return
    # Extract the list of samples and runs associated with the project and sort them
    samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell, sample_prj=self.pargs.project),
                     key=lambda k: (k.get('project_sample_name', 'NA'), k.get('flowcell', 'NA'), k.get('lane', 'NA')))
    # Setup paths and verify parameters
    self._meta.production_root = self.app.config.get("production", "root")
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(self._meta.production_root), "No such directory {}; check your production config".format(self._meta.production_root)
    assert os.path.exists(proj_base_dir), "No project {} in production path {}".format(self.pargs.project, self._meta.root_path)
    try:
        self._meta.uppnex_project_root = self.app.config.get("deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn("{}, will use '/proj' as uppnex_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'
    try:
        self._meta.uppnex_delivery_dir = self.app.config.get("deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn("{}, will use 'INBOX' as uppnex_project_delivery_path".format(e))
        self._meta.uppnex_delivery_dir = 'INBOX'
    destination_root = os.path.join(self._meta.uppnex_project_root, self.pargs.uppmax_project, self._meta.uppnex_delivery_dir)
    assert os.path.exists(destination_root), "Delivery destination folder {} does not exist".format(destination_root)
    destination_root = os.path.join(destination_root, self.pargs.project)
    # If selecting interactively, build a list of samples to deliver
    if self.pargs.interactive:
        to_process = []
        for sample in samples:
            sname = sample.get("project_sample_name")
            index = sample.get("sequence")
            fcid = sample.get("flowcell")
            lane = sample.get("lane")
            date = sample.get("date")
            self.log.info("Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}".format(sname, index, fcid, lane, date))
            if query_yes_no("Deliver sample?", default="no"):
                to_process.append(sample)
        samples = to_process
    # Find uncompressed fastq
    uncompressed = self._find_uncompressed_fastq_files(proj_base_dir, samples)
    if len(uncompressed) > 0:
        self.log.warn("The following samples have uncompressed *.fastq files that cannot be delivered: {}".format(",".join(uncompressed)))
        if not query_yes_no("Continue anyway?", default="no"):
            return
    self.log.info("Will deliver data for {} samples from project {} to {}".format(len(samples), self.pargs.project, destination_root))
    if not query_yes_no("Continue?"):
        return
    # Get the list of files to transfer and the destination
    self.log.debug("Gathering list of files to copy")
    to_copy = self.get_file_copy_list(proj_base_dir, destination_root, samples)
    # Make sure that transfer will be with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no("Do you wish to continue delivering using rsync?", default="yes"):
            return
        self.pargs.rsync = True
    # Process each sample run
    for id, files in to_copy.items():
        # get the sample database object
        [sample] = [s for s in samples if s.get('_id') == id]
        self.log.info("Processing sample {} and flowcell {}".format(sample.get("project_sample_name", "NA"), sample.get("flowcell", "NA")))
        # calculate md5sums on the source side and write them on the destination
        md5 = []
        for f in files:
            m = md5sum(f[0])
            mfile = "{}.md5".format(f[1])
            md5.append([m, mfile, f[2], f[0]])
            self.log.debug("md5sum for source file {}: {}".format(f[0], m))
        # transfer files
        self.log.debug("Transferring {} fastq files".format(len(files)))
        self._transfer_files([f[0] for f in files], [f[1] for f in files])
        # write the md5sum to a file at the destination and verify the transfer
        passed = True
        for m, mfile, read, srcpath in md5:
            dstfile = os.path.splitext(mfile)[0]
            self.log.debug("Writing md5sum to file {}".format(mfile))
            self.app.cmd.write(mfile, "{} {}".format(m, os.path.basename(dstfile)), True)
            self.log.debug("Verifying md5sum for file {}".format(dstfile))
            # if dry-run, make sure verification passes
            if self.pargs.dry_run:
                dm = m
            else:
                dm = md5sum(dstfile)
            self.log.debug("md5sum for destination file {}: {}".format(dstfile, dm))
            if m != dm:
                self.log.warn("md5sum verification FAILED for {}. Source: {}, Target: {}".format(dstfile, m, dm))
                self.log.warn("Improperly transferred file {} is removed from destination, please retry transfer of this file".format(dstfile))
                self.app.cmd.safe_unlink(dstfile)
                self.app.cmd.safe_unlink(mfile)
                passed = False
                continue
            # Modify the permissions to ug+rw
            for f in [dstfile, mfile]:
                self.app.cmd.chmod(f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)
        # touch the flag to trigger uppmax inbox permission fix
        self.app.cmd.safe_touchfile(os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule", self.pargs.uppmax_project))
        # log the transfer to statusdb if verification passed
        if passed:
            self.log.info("Logging delivery to StatusDB document {}".format(id))
            data = {'raw_data_delivery': {
                'timestamp': utc_time(),
                'files': {'R{}'.format(read): {
                    'md5': m,
                    'path': os.path.splitext(mfile)[0],
                    'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                    'source_location': srcpath} for m, mfile, read, srcpath in md5},
            }}
            jsonstr = json.dumps(data)
            jsonfile = os.path.join(os.path.dirname(md5[0][3]),
                                    "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(
                                        sample.get("date"), sample.get("flowcell"),
                                        sample.get("project_sample_name"), sample.get("sequence"),
                                        sample.get("lane")))
            self.log.debug("Writing delivery to json file {}".format(jsonfile))
            self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
            self.log.debug("Saving delivery in StatusDB document {}".format(id))
            sample.update(data)
            self._save(s_con, sample)
            self.log.debug(jsonstr)
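# The delivery functions above verify transfers with md5sum(path). The helper
# is not shown in this section; a standard chunked-read sketch, assuming it
# simply returns the hex digest of the file:
import hashlib


def md5sum(path, blocksize=2 ** 20):
    """Return the hex md5 digest of a file, reading in 1 MB chunks."""
    digest = hashlib.md5()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(blocksize), b""):
            digest.update(chunk)
    return digest.hexdigest()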
def raw_data(self):
    if not self._check_pargs(["project"]):
        return
    # if necessary, reformat flowcell identifier
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]
    # get the uid and gid to use for destination files
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group is not None and len(self.pargs.group) > 0:
        gid = grp.getgrnam(self.pargs.group).gr_gid
    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project database"
    self.log.debug("Connecting to samples database")
    s_con = SampleRunMetricsConnection(**vars(self.pargs))
    assert s_con, "Could not get connection to samples database"
    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(self.pargs.project, "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error("Uppmax project was not specified and could not be fetched from project database")
            return
    # Extract the list of samples and runs associated with the project and sort them
    samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell, sample_prj=self.pargs.project),
                     key=lambda k: (k.get('project_sample_name', 'NA'), k.get('flowcell', 'NA'), k.get('lane', 'NA')))
    # Setup paths and verify parameters
    self._meta.production_root = self.app.config.get("production", "root")
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(self._meta.production_root), "No such directory {}; check your production config".format(self._meta.production_root)
    assert os.path.exists(proj_base_dir), "No project {} in production path {}".format(self.pargs.project, self._meta.root_path)
    try:
        self._meta.uppnex_project_root = self.app.config.get("deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn("{}, will use '/proj' as uppnex_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'
    try:
        self._meta.uppnex_delivery_dir = self.app.config.get("deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn("{}, will use 'INBOX' as uppnex_project_delivery_path".format(e))
        self._meta.uppnex_delivery_dir = 'INBOX'
    destination_root = os.path.join(self._meta.uppnex_project_root, self.pargs.uppmax_project, self._meta.uppnex_delivery_dir)
    assert os.path.exists(destination_root), "Delivery destination folder {} does not exist".format(destination_root)
    destination_root = os.path.join(destination_root, self.pargs.project)
    # If selecting interactively, build a list of samples to deliver
    if self.pargs.interactive:
        to_process = []
        for sample in samples:
            sname = sample.get("project_sample_name")
            index = sample.get("sequence")
            fcid = sample.get("flowcell")
            lane = sample.get("lane")
            date = sample.get("date")
            self.log.info("Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}".format(sname, index, fcid, lane, date))
            if query_yes_no("Deliver sample?", default="no"):
                to_process.append(sample)
        samples = to_process
    # Find uncompressed fastq
    uncompressed = self._find_uncompressed_fastq_files(proj_base_dir, samples)
    if len(uncompressed) > 0:
        self.log.warn("The following samples have uncompressed *.fastq files that cannot be delivered: {}".format(",".join(uncompressed)))
        if not query_yes_no("Continue anyway?", default="no"):
            return
    self.log.info("Will deliver data for {} samples from project {} to {}".format(len(samples), self.pargs.project, destination_root))
    if not query_yes_no("Continue?"):
        return
    # Get the list of files to transfer and the destination
    self.log.debug("Gathering list of files to copy")
    to_copy = self.get_file_copy_list(proj_base_dir, destination_root, samples)
    # Make sure that transfer will be with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no("Do you wish to continue delivering using rsync?", default="yes"):
            return
        self.pargs.rsync = True
    # Process each sample run
    for id, files in to_copy.items():
        # get the sample database object
        [sample] = [s for s in samples if s.get('_id') == id]
        self.log.info("Processing sample {} and flowcell {}".format(sample.get("project_sample_name", "NA"), sample.get("flowcell", "NA")))
        # transfer files
        self.log.debug("Transferring {} fastq files".format(len(files)))
        self._transfer_files([f[0] for f in files], [f[1] for f in files])
        passed = True
        if self.pargs.link or self.pargs.dry_run:
            passed = False
        else:
            # calculate md5sums on the source side and write them on the destination
            md5 = []
            for f in files:
                m = md5sum(f[0])
                mfile = "{}.md5".format(f[1])
                md5.append([m, mfile, f[2], f[0]])
                self.log.debug("md5sum for source file {}: {}".format(f[0], m))
            # write the md5sum to a file at the destination and verify the transfer
            for m, mfile, read, srcpath in md5:
                dstfile = os.path.splitext(mfile)[0]
                self.log.debug("Writing md5sum to file {}".format(mfile))
                self.app.cmd.write(mfile, "{} {}".format(m, os.path.basename(dstfile)), True)
                self.log.debug("Verifying md5sum for file {}".format(dstfile))
                dm = md5sum(dstfile)
                self.log.debug("md5sum for destination file {}: {}".format(dstfile, dm))
                if m != dm:
                    self.log.warn("md5sum verification FAILED for {}. Source: {}, Target: {}".format(dstfile, m, dm))
                    self.log.warn("Improperly transferred file {} is removed from destination, please retry transfer of this file".format(dstfile))
                    self.app.cmd.safe_unlink(dstfile)
                    self.app.cmd.safe_unlink(mfile)
                    passed = False
                    continue
                # Modify the permissions to ug+rw
                for f in [dstfile, mfile]:
                    self.app.cmd.chmod(f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)
        # touch the flag to trigger uppmax inbox permission fix
        self.app.cmd.safe_touchfile(os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule", self.pargs.uppmax_project))
        # log the transfer to statusdb if verification passed
        if passed:
            self.log.info("Logging delivery to StatusDB document {}".format(id))
            data = {'raw_data_delivery': {
                'timestamp': utc_time(),
                'files': {'R{}'.format(read): {
                    'md5': m,
                    'path': os.path.splitext(mfile)[0],
                    'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                    'source_location': srcpath} for m, mfile, read, srcpath in md5},
            }}
            jsonstr = json.dumps(data)
            jsonfile = os.path.join(os.path.dirname(md5[0][3]),
                                    "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(
                                        sample.get("date"), sample.get("flowcell"),
                                        sample.get("project_sample_name"), sample.get("sequence"),
                                        sample.get("lane")))
            self.log.debug("Writing delivery to json file {}".format(jsonfile))
            self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
            self.log.debug("Saving delivery in StatusDB document {}".format(id))
            sample.update(data)
            self._save(s_con, sample)
            self.log.debug(jsonstr)
def main():
    parser = argparse.ArgumentParser(description="A script to help doing the deliveries, now using the Casava directory structure. "
                                                 "The user is asked to provide a project ID, a run name, and an UPPMAX project")
    parser.add_argument('-c', '--casava-path', action="store", dest="caspath", default='/proj/a2010002/nobackup/illumina/',
                        help="Specify a path to a Casava directory manually")
    parser.add_argument('-l', '--log-path', action="store", dest="logpath", default='/proj/a2010002/private/delivery_logs',
                        help="Specify a path to a log file")
    parser.add_argument('-i', '--interactive', action="store_true", dest="interactive", default=False,
                        help="Interactively select samples to be delivered")
    parser.add_argument('-d', '--dry-run', action="store_true", dest="dry", default=False,
                        help="Dry run: nothing will be done")
    parser.add_argument('-a', '--deliver-all-fcs', action="store_true", dest="deliver_all_fcs", default=False,
                        help="rsync samples from all flowcells. Default is to only deliver from the specified flowcell")
    parser.add_argument('-p', '--nophix', action="store_true", dest="deliver_nophix", default=False,
                        help="Deliver fastq files from the nophix subdirectory. Default is to deliver from the run directory")
    parser.add_argument('project_name', action='store', help="Project name to deliver, e.g. J.Doe_10_01")
    parser.add_argument('flowcell_id', action='store', help="Flowcell id to deliver, e.g. 120824_BD1915ACXX")
    parser.add_argument('uppmax_id', action='store', help="UPPMAX project id to deliver to, e.g. b2012001")
    args = parser.parse_args()

    if args.project_name not in os.listdir(args.caspath):
        print("Could not find project. Check directory listing:")
        for f in os.listdir(args.caspath):
            print(f)
        clean_exit(0, None, args.dry)

    fcid = args.flowcell_id
    fcid_comp = fcid.split('_')
    if len(fcid_comp) > 2:
        fcid = fcid_comp[0] + '_' + fcid_comp[-1]
        print("FCID format too long, trying {:s}".format(fcid))

    dt = datetime.now()
    time_str = "_".join([str(dt.year), str(dt.month), str(dt.day), str(dt.hour), str(dt.minute), str(dt.second)])
    logfilename = os.path.join(os.path.normpath(args.logpath), "{:s}.log".format(time_str))
    if not args.dry:
        logfile = open(logfilename, "w")
    else:
        logfile = sys.stdout
    logfile.write("[{:s}] - Project to move files for:\n{:s}\n".format(utc_time(), args.project_name))
    logfile.flush()

    proj_base_dir = os.path.join(args.caspath, args.project_name)
    skip_list = []
    if args.interactive:
        for sample_dir in os.listdir(proj_base_dir):
            if not os.path.isdir(os.path.join(proj_base_dir, sample_dir)):
                continue
            if not query_yes_no("Deliver sample {:s}?".format(sample_dir), default="no"):
                skip_list.append(sample_dir)

    created_proj_dir_name = fixProjName(args.project_name)
    del_path_top = '/proj/' + args.uppmax_id + "/INBOX/" + created_proj_dir_name

    to_copy = get_file_copy_list(proj_base_dir, del_path_top, fcid,
                                 args.deliver_all_fcs, args.deliver_nophix, skip_list)

    # Prompt the user if any of the files are uncompressed
    for fqfile, _, _ in to_copy:
        if os.path.splitext(fqfile)[1] == ".gz":
            continue
        print("WARNING: The file {:s}, which you are about to deliver, does not seem to be compressed. "
              "It is recommended that you compress files prior to delivery.".format(fqfile))
        if query_yes_no("Do you wish to continue delivering uncompressed fastq files?", default="yes"):
            break
        clean_exit(1, logfile, args.dry)

    rsync_files(to_copy, logfile, args.dry)
    clean_exit(0, logfile, args.dry)
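# Note: main() calls two helpers defined elsewhere: fixProjName(), which
# normalizes the project name for the INBOX directory, and clean_exit(), which
# closes the log and exits. The sketches below are hypothetical and only
# illustrate the assumed contracts; the real helpers may behave differently.
def fix_proj_name_sketch(pname):
    """Capitalize the first letter and each letter following a period,
    e.g. 'j.doe_10_01' -> 'J.Doe_10_01' (assumed behavior)."""
    chars = list(pname)
    capitalize_next = True
    for i, c in enumerate(chars):
        if capitalize_next:
            chars[i] = c.upper()
        capitalize_next = (c == ".")
    return "".join(chars)

def clean_exit_sketch(status, logfile, dry_run):
    """Close the log file (unless it is sys.stdout or a dry run) and exit
    with the given status (assumed behavior)."""
    if logfile is not None and logfile is not sys.stdout and not dry_run:
        logfile.close()
    sys.exit(status)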
def upload_tarball(arch, tarball, remote_host=None, remote_path=None, remote_user=None, **kw):
    """Upload the tarball to the remote destination
    """
    if not remote_path:
        arch.log.error("A remote path must be specified in the config or on the command line")
        return False

    source_files = {'tarball': tarball,
                    'tarball_md5': "{}.md5".format(tarball)}
    arch.log.debug("Verifying that md5sum file {} exists".format(source_files['tarball_md5']))
    if not os.path.exists(source_files['tarball_md5']):
        arch.log.warn("md5 file {} does not exist".format(source_files['tarball_md5']))
        if not query_yes_no("Calculate md5 file and proceed?", force=arch.pargs.force):
            return False
        # Calculate the md5sum
        arch.app.cmd.md5sum(source_files['tarball'])

    remote_location = "{}{}".format("{}@".format(remote_user) if remote_user else "",
                                    "{}:".format(remote_host) if remote_host else "")

    # Transfer the md5 file and tarball
    remote_files = {}
    for label in source_files.keys():
        remote_files[label] = "{}{}".format(remote_location,
                                            os.path.join(remote_path, os.path.basename(source_files[label])))
        arch.log.debug("Transferring {} to {}".format(source_files[label], remote_files[label]))
        arch.app.cmd.transfer_file(source_files[label], remote_files[label])

    # Verify the transfer on the remote side using fabric (if necessary)
    use_fabric = remote_host is not None and remote_host != "localhost"
    passed = False
    arch.log.debug("Verifying integrity of remote file {} after transfer".format(remote_files['tarball']))
    if use_fabric:
        # Verify the md5sum using fabric
        host, path = remote_files['tarball_md5'].split(':')
        result = execute(verify_upload, path, host=host)
        passed = result.get(host, False)
    else:
        passed = arch.app.cmd.verify_md5sum(remote_files['tarball_md5'])

    # If the verification was not successful, prompt to delete the corrupt files
    if not passed:
        arch.log.error("md5 sum of remote file {} does not match after transfer".format(remote_files['tarball']))
        if query_yes_no("Remove the corrupted remote file {}?".format(remote_files['tarball']), force=arch.pargs.force):
            for path in remote_files.values():
                arch.log.info("removing {}".format(path))
                if use_fabric:
                    path = path.split(':')[-1]
                    execute(rm_file, path, host=host)
                else:
                    arch.app.cmd.safe_unlink(path)
        arch.log.error("Upload of {} to remote destination failed".format(source_files['tarball']))
    else:
        arch.log.info("{} uploaded to {} successfully".format(source_files['tarball'], remote_files['tarball']))

    if use_fabric:
        disconnect_all()
    return passed
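# Note: upload_tarball() executes verify_upload() and rm_file() as fabric
# tasks on the remote host. Minimal sketches of what such tasks could look
# like, assuming fabric 1.x semantics; the names below are suffixed with
# _sketch because the actual tasks are defined elsewhere and may differ.
from fabric.api import task, run, cd

@task
def verify_upload_sketch(md5_path):
    """Run 'md5sum -c' against the transferred .md5 file from within its
    directory and report whether the check succeeded."""
    with cd(os.path.dirname(md5_path)):
        return run("md5sum -c {}".format(os.path.basename(md5_path))).succeeded

@task
def rm_file_sketch(path):
    """Remove a file on the remote host."""
    run("rm -f {}".format(path))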
def _return_extensive_match_result(name_map, barcode_name, force=False):
    """Wrap return value for extensive matching"""
    if query_yes_no("found mapping '{} : {}' (barcode_name:project_sample_name); do you want to use this project_sample_name?".format(barcode_name, name_map["sample_name"]),
                    default="no", force=force):
        return name_map
    else:
        return None
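# Usage sketch for _return_extensive_match_result(): the caller looks up a
# candidate mapping and keeps it only if the user confirms. The surrounding
# names (p_con, project_name, s, pargs) are assumptions that mirror the
# extensive-matching branch of update(), not a verbatim caller:
#
#   name_map = p_con.get_project_sample(project_name, s["barcode_name"], extensive_matching=True)
#   if name_map:
#       match = _return_extensive_match_result(name_map, s["barcode_name"], force=pargs.force)
#       if match:
#           s["project_sample_name"] = match["sample_name"]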
def purge_alignments(path, ftype="sam", keep="last", dry_run=False, force=False, fsize=MINFILESIZE):
    """Cleanup sam and bam files. In some cases, sam files persist. If
    the corresponding bam file exists, replace the sam file contents
    with a message that the file has been removed to save space.

    In general, several bam files are produced in an analysis. By
    grouping bam files by prefix, either the most recent file is
    retained for further reference, or a specific analysis is kept.
    """
    if ftype == "sam":
        pattern = ".sam$"
    elif ftype == "bam":
        pattern = ".bam$"
    else:
        LOG.warn("ftype must be one of 'sam' or 'bam'")
        return
    LOG.debug("running purge_alignments in path {} with pattern {} keep rule {}".format(path, pattern, keep))

    def purge_filter(f):
        if not pattern:
            return
        return re.search(pattern, f) is not None

    flist = filtered_walk(path, purge_filter, exclude_dirs=["realign-split"])
    if len(flist) == 0:
        LOG.info("No {} files found in {}".format(ftype, path))
        return
    if len(flist) > 0 and not query_yes_no("Going to remove/cleanup {} {} files ({}...). Are you sure you want to continue?".format(len(flist), ftype, ",".join([os.path.basename(x) for x in flist[0:10]])), force=force):
        return

    if ftype == "sam":
        for f in flist:
            LOG.info("Purging {} file {}".format(ftype, f))
            dry_unlink(f, dry_run)
            if os.path.exists(f.replace(".sam", ".bam")):
                dry_write(f, "File removed to save disk space: SAM converted to BAM", dry_run)
        return
    elif ftype == "bam":
        # Group bam files by sample prefix and by directory
        samples = {}
        for f in flist:
            m = re.search(r"([0-9A-Za-z_]+)-.*", os.path.basename(f))
            if not m:
                LOG.debug("Couldn't determine prefix for {}".format(f))
                continue
            sid = m.groups()[0]
            if sid not in samples:
                samples[sid] = {}
            dname = os.path.dirname(f)
            if dname not in samples[sid]:
                samples[sid][dname] = []
            samples[sid][dname].append(f)

        saved_size = 0
        for k in samples.keys():
            for d, files in samples[k].items():
                if not files or len(files) == 1:
                    continue
                # sort by file name length so the longest (most specific) name comes last
                files.sort(key=len)
                if keep == "last":
                    LOG.info("Keeping file {} and removing all files with common prefix: {}".format(os.path.basename(files[-1]), ", ".join([os.path.basename(x) for x in files[0:-1]])))
                    saved_size = _purge_by_sample(files, dry_run, int(fsize)) + saved_size
        LOG.info("Will save approximately {:.1f}G space".format(saved_size / 1e9))
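# Note: purge_alignments() delegates the actual removal to _purge_by_sample(),
# which is defined elsewhere. A minimal sketch of the assumed contract: keep
# the last file in the sorted list, replace every other file larger than
# min_size with a placeholder message, and return the number of bytes saved.
def _purge_by_sample_sketch(files, dry_run, min_size):
    saved = 0
    target = files[-1]
    for f in files[:-1]:
        size = os.path.getsize(f)
        # skip files already small enough (e.g. previously purged placeholders)
        if size < min_size:
            continue
        saved += size
        dry_unlink(f, dry_run)
        dry_write(f, "File removed to save disk space: Moved to {}".format(os.path.abspath(target)), dry_run)
    return saved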