def setup_merged_samples(flist, sample_group_fn=_group_samples, **kw):
    """Setup analysis that merges multiple sample runs.

    :param flist: list of file names, by default *-bcbb-config.yaml files
    :param sample_group_fn: function that groups files into samples and
      sample runs. The function takes flist as input.

    :returns: updated flist with config files for merged samples
    """
    new_flist = []
    sample_d = sample_group_fn(flist)
    for k, v in sample_d.iteritems():
        if len(v) > 1:
            f = v[v.keys()[0]]
            out_d = os.path.join(os.path.dirname(os.path.dirname(f)), MERGED_SAMPLE_OUTPUT_DIR)
            LOG.info("Sample {} has {} sample runs; setting up merge analysis in {}".format(k, len(v), out_d))
            dry_makedir(out_d, dry_run=False)
            pp = kw.get("post_process") or f.replace("-bcbb-config.yaml", "-post_process.yaml")
            with open(pp) as fh:
                conf = yaml.load(fh)
            conf = update_pp_platform_args(conf, **{'jobname': "{}_total".format(k),
                                                    'workdir': out_d,
                                                    'output': "{}_total-bcbb.log".format(k)})
            pp_new = os.path.join(out_d, os.path.basename(pp))
            dry_unlink(pp_new, dry_run=kw.get('dry_run', True))
            dry_write(pp_new, yaml.safe_dump(conf, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw.get('dry_run', True))
            ## Setup merged bcbb-config file
            bcbb_config = merge_sample_config(v.values(), sample=k, out_d=out_d, dry_run=kw.get('dry_run', True))
            bcbb_config_file = os.path.join(out_d, os.path.basename(v.values()[0]))
            bcbb_config = sort_sample_config_fastq(bcbb_config)
            if not os.path.exists(bcbb_config_file) or kw.get('new_config', False):
                dry_unlink(bcbb_config_file, dry_run=kw.get('dry_run', True))
                dry_write(bcbb_config_file, yaml.safe_dump(bcbb_config, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw.get('dry_run', True))
            new_flist.append(bcbb_config_file)
    return new_flist
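
# A hedged usage sketch for setup_merged_samples; the project layout, sample
# name and flowcell ids below are invented, and grouping is assumed to be
# done by the default _group_samples function with output written under the
# module-level MERGED_SAMPLE_OUTPUT_DIR. Note that dry_makedir is called
# with dry_run=False above, so the merge directory is created even in a
# dry run.
def _example_setup_merged_samples():
    """Illustrative only: two runs of sample P001_101 on different flowcells."""
    flist = ["J.Doe_00_01/P001_101/120924_AC003CCCXX/P001_101-bcbb-config.yaml",
             "J.Doe_00_01/P001_101/121015_BB002BBBXX/P001_101-bcbb-config.yaml"]
    # With more than one run the sample qualifies for merging; dry_run=True
    # (also the default) means no merged config files are actually written.
    return setup_merged_samples(flist, dry_run=True)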
def _purge_by_sample(files, dry_run, fsize=MINFILESIZE):
    """Purge files that have been superseded by a later processing step.

    Assumes files is sorted so that each purge candidate is followed by the
    file that replaces it: a file is purged when the next file's basename
    starts with its own stem and its size is at least fsize.

    :param files: sorted list of file names
    :param dry_run: dry run flag
    :param fsize: minimum file size for purging

    :returns: approximate number of bytes saved
    """
    saved_size = 0
    for i in range(0, len(files) - 1):
        f1 = os.path.basename(files[i])
        f2 = os.path.basename(files[i + 1])
        if f2.startswith(os.path.splitext(f1)[0]):
            statinfo = os.stat(files[i])
            if statinfo.st_size < fsize:
                continue
            saved_size = saved_size + statinfo.st_size
            LOG.info("Purging bam file {}".format(files[i]))
            dry_unlink(files[i], dry_run)
            dry_write(files[i], "File removed to save disk space: Moved to {}".format(files[i + 1]), dry_run)
    return saved_size
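
# A hedged illustration of the prefix rule used by _purge_by_sample; the
# filenames are invented. Note that os.stat() runs unconditionally, so the
# files must exist on disk even when dry_run is True; this sketch therefore
# creates throwaway files in a temporary directory.
def _example_purge_by_sample():
    """Illustrative only: purge two superseded bam files, keep the last."""
    import tempfile
    d = tempfile.mkdtemp()
    files = [os.path.join(d, name) for name in
             ("P001_101-sort.bam",
              "P001_101-sort-dup.bam",
              "P001_101-sort-dup-realign.bam")]
    for f in files:
        with open(f, "w") as fh:
            fh.write("x" * 1024)
    # Each stem prefixes its successor, so with fsize=1 the first two files
    # are replaced by stubs and roughly 2048 bytes are reported as saved;
    # the most processed file (the longest name) survives.
    return _purge_by_sample(files, dry_run=False, fsize=1)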
def remove_files(f, **kw):
    """Remove files in the directory of f, keeping the configuration files,
    logs and fastq files matched by the keep_files patterns.
    """
    ## Remove old files if requested
    keep_files = ["-post_process.yaml$", "-post_process.yaml.bak$",
                  "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",
                  "-bcbb-command.txt$", "-bcbb-command.txt.bak$",
                  "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$",
                  "_[0-9]+_fastq.txt.gz$", "_[0-9]+_fastq.txt$",
                  "^[0-9][0-9]_.*.txt$", "JOBID", "PID"]
    pattern = "|".join(keep_files)

    def remove_filter_fn(f):
        return re.search(pattern, f) is None

    workdir = os.path.dirname(f)
    files_to_remove = filtered_walk(workdir, remove_filter_fn)
    dirs_to_remove = filtered_walk(workdir, remove_filter_fn, get_dirs=True)
    if not files_to_remove:
        return
    if query_yes_no("Going to remove {} files and {} directories... Are you sure you want to continue?".format(len(files_to_remove), len(dirs_to_remove)), force=kw['force']):
        for x in files_to_remove:
            dry_unlink(x, dry_run=kw['dry_run'])
        ## Sort directories by path length, longest first, so we don't accidentally try to remove a non-empty dir
        for x in sorted(dirs_to_remove, key=len, reverse=True):
            dry_rmdir(x, dry_run=kw['dry_run'])
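
# A hedged usage sketch for remove_files; the config path is invented. Both
# 'force' and 'dry_run' are read via kw[...] rather than kw.get(), so they
# must always be supplied by the caller.
def _example_remove_files():
    """Illustrative only: prune a sample run directory, keeping configs/logs/fastq."""
    remove_files("J.Doe_00_01/P001_101/120924_AC003CCCXX/P001_101-bcbb-config.yaml",
                 force=False, dry_run=True)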
def purge_alignments(path, ftype="sam", keep="last", dry_run=False, force=False, fsize=MINFILESIZE):
    """Cleanup sam and bam files. In some cases, sam files persist. If the
    corresponding bam file exists, replace the sam file contents with a
    message that the file has been removed to save space.

    In general, several bam files are produced in an analysis. By grouping
    bam files by prefix, either the most recent file is retained for
    further reference, or a specific analysis is kept.
    """
    if ftype == "sam":
        pattern = r"\.sam$"
    elif ftype == "bam":
        pattern = r"\.bam$"
    else:
        LOG.warn("ftype must be one of 'sam' or 'bam'")
        return
    LOG.debug("running purge_alignments in path {} with pattern {} keep rule {}".format(path, pattern, keep))

    def purge_filter(f):
        return re.search(pattern, f) is not None

    flist = filtered_walk(path, purge_filter, exclude_dirs=["realign-split"])
    if not flist:
        LOG.info("No {} files found in {}".format(ftype, path))
        return
    if not query_yes_no("Going to remove/cleanup {} {} files ({}...). Are you sure you want to continue?".format(len(flist), ftype, ",".join([os.path.basename(x) for x in flist[0:10]])), force=force):
        return
    if ftype == "sam":
        for f in flist:
            LOG.info("Purging {} file {}".format(ftype, f))
            dry_unlink(f, dry_run)
            if os.path.exists(f.replace(".sam", ".bam")):
                dry_write(f, "File removed to save disk space: SAM converted to BAM", dry_run)
        return
    elif ftype == "bam":
        ## Group bam files by sample prefix and directory
        samples = {}
        for f in flist:
            m = re.search(r"([0-9A-Za-z_]+)-.*", os.path.basename(f))
            if not m:
                LOG.debug("Couldn't determine prefix for {}".format(f))
                continue
            sid = m.groups()[0]
            if sid not in samples:
                samples[sid] = {}
            dname = os.path.dirname(f)
            if dname not in samples[sid]:
                samples[sid][dname] = []
            samples[sid][dname].append(f)
        saved_size = 0
        for k in samples.iterkeys():
            for d, files in samples[k].iteritems():
                if not files or len(files) == 1:
                    continue
                ## Sort by name length so the most processed file comes last
                files.sort(key=len)
                if keep == "last":
                    LOG.info("Keeping file {} and removing all files with common prefix: {}".format(os.path.basename(files[-1]), ", ".join([os.path.basename(x) for x in files[0:-1]])))
                    saved_size = _purge_by_sample(files, dry_run, int(fsize)) + saved_size
        LOG.info("Will save approximately {:.1f}G space".format(saved_size / 1e9))
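
# A hedged usage sketch for purge_alignments; the path is invented. The sam
# pass rewrites leftover sam files that already have bam counterparts; the
# bam pass groups files per sample and directory and keeps the most
# processed (longest-named) file. force=True is assumed to make
# query_yes_no answer the confirmation prompt automatically, as the call
# signature suggests.
def _example_purge_alignments():
    """Illustrative only: dry-run cleanup of one sample directory."""
    purge_alignments("J.Doe_00_01/P001_101", ftype="sam", dry_run=True, force=True)
    purge_alignments("J.Doe_00_01/P001_101", ftype="bam", keep="last",
                     dry_run=True, force=True)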
def setup_sample(f, analysis, amplicon=False, genome_build="hg19", **kw):
    """Setup config files, making backups and writing new files.

    :param f: sample configuration file (*-bcbb-config.yaml)
    :param analysis: analysis type to set in the configuration
    :param amplicon: run amplicon analysis (turns off duplicate marking)
    :param genome_build: genome build to set in the configuration
    :param kw: keyword arguments; 'dry_run' is required
    """
    if not os.path.exists(f):
        return
    with open(f) as fh:
        config = yaml.load(fh)
    ## Check for correctly formatted config
    if not config.get("details", None):
        LOG.warn("Couldn't find 'details' section in config file {}: aborting setup!".format(f))
        return
    ## Save file to backup if backup doesn't exist
    f_bak = f.replace("-bcbb-config.yaml", "-bcbb-config.yaml.bak")
    if not os.path.exists(f_bak):
        LOG.info("Making backup of {} in {}".format(f, f_bak))
        dry_backup(os.path.abspath(f), dry_run=kw['dry_run'])
    ## Save command file to backup if it doesn't exist
    cmdf = f.replace("-bcbb-config.yaml", "-bcbb-command.txt")
    if os.path.exists(cmdf):
        cmdf_bak = cmdf.replace("-bcbb-command.txt", "-bcbb-command.txt.bak")
        if not os.path.exists(cmdf_bak):
            LOG.info("Making backup of {} in {}".format(cmdf, cmdf_bak))
            dry_backup(os.path.abspath(cmdf), dry_run=kw['dry_run'])
    ## Save post_process file to backup if it doesn't exist
    ppf = f.replace("-bcbb-config.yaml", "-post_process.yaml")
    if os.path.exists(ppf):
        ppf_bak = ppf.replace("-post_process.yaml", "-post_process.yaml.bak")
        if not os.path.exists(ppf_bak):
            LOG.info("Making backup of {} in {}".format(ppf, ppf_bak))
            dry_backup(ppf, dry_run=kw['dry_run'])
    if analysis:
        config = update_sample_config(config, "analysis", analysis)
    if genome_build:
        config = update_sample_config(config, "genome_build", genome_build)
    config = sort_sample_config_fastq(config)
    ## Remove config file and rewrite
    dry_unlink(f, kw['dry_run'])
    dry_write(f, yaml.safe_dump(config, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw['dry_run'])
    ## Setup post process only if not provided at command line
    if not kw.get("post_process", None):
        ppfile = f.replace("-bcbb-config.yaml", "-post_process.yaml")
        with open(ppfile) as fh:
            pp = yaml.load(fh)
        ## Need to set working directory to path of bcbb-config.yaml file
        if pp.get('distributed', {}).get('platform_args', None):
            platform_args = pp['distributed']['platform_args'].split()
            if "-D" in platform_args:
                platform_args[platform_args.index("-D") + 1] = os.path.dirname(f)
            elif "--workdir" in platform_args:
                platform_args[platform_args.index("--workdir") + 1] = os.path.dirname(f)
            pp['distributed']['platform_args'] = " ".join(platform_args)
        ## Change keys for all analyses
        for anl in pp.get('custom_algorithms', {}).keys():
            if kw.get('baits', None):
                pp['custom_algorithms'][anl]['hybrid_bait'] = kw['baits']
            if kw.get('targets', None):
                pp['custom_algorithms'][anl]['hybrid_target'] = kw['targets']
            if amplicon:
                pp['custom_algorithms'][anl]['mark_duplicates'] = False
        if amplicon:
            LOG.info("setting amplicon analysis")
            pp['algorithm']['mark_duplicates'] = False
        if kw.get('galaxy_config', None):
            pp['galaxy_config'] = kw['galaxy_config']
        if kw.get('distributed', None):
            LOG.info("setting distributed execution")
            pp['algorithm']['num_cores'] = 'messaging'
        elif kw.get('num_cores', None):
            LOG.info("setting parallel execution")
            pp['algorithm']['num_cores'] = kw['num_cores']
        if kw.get('snpEff', None):
            LOG.info("setting snpEff to {}".format(kw["snpEff"]))
            pp['program']['snpEff'] = kw['snpEff']
        dry_unlink(ppfile, dry_run=kw['dry_run'])
        dry_write(ppfile, yaml.safe_dump(pp, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw['dry_run'])
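
# A hedged usage sketch for setup_sample; the path and analysis name are
# invented. kw['dry_run'] is accessed directly, so it is a mandatory
# keyword; baits, targets, num_cores, snpEff and galaxy_config are optional
# tweaks applied to the rewritten post_process file.
def _example_setup_sample():
    """Illustrative only: point a sample at a new analysis in dry-run mode."""
    setup_sample("J.Doe_00_01/P001_101/120924_AC003CCCXX/P001_101-bcbb-config.yaml",
                 analysis="Align_standard_seqcap", genome_build="hg19",
                 amplicon=False, dry_run=True, num_cores=8)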