def r_library_installer(config):
    """Install R libraries using CRAN and Bioconductor.

    Does nothing unless ``config`` has a truthy ``cran``, ``bioc`` or
    ``github`` entry.  Otherwise an install script is generated in a
    temporary directory and executed with the first Rscript located: the one
    next to the conda executable (when conda is found) is preferred over the
    one resolved through ``fabutils.find_cmd`` for plain ``Rscript``.
    """
    wants_packages = (config.get("cran") or config.get("bioc")
                      or config.get("github"))
    if not wants_packages:
        return
    with shared._make_tmp_dir() as tmp_dir:
        with cd(tmp_dir):
            # Create an Rscript file with install details.
            out_file = os.path.join(tmp_dir, "install_packages.R")
            _make_install_script(out_file, config)
            # Probe every candidate up front, conda first.
            candidates = []
            conda_bin = shared._conda_cmd(env)
            if conda_bin:
                conda_rscript = os.path.join(os.path.dirname(conda_bin), "Rscript")
                candidates.append(
                    fabutils.find_cmd(env, conda_rscript, "--version"))
            candidates.append(fabutils.find_cmd(env, "Rscript", "--version"))
            usable = [found for found in candidates if found]
            if usable:
                env.safe_run("%s %s" % (usable[0], out_file))
            else:
                env.logger.warn("Rscript not found; skipping install of R libraries.")
            # The script is single-use; remove it whether or not it ran.
            env.safe_run("rm -f %s" % out_file)
def r_library_installer(config):
    """Install R libraries using CRAN and Bioconductor.

    Returns immediately when none of the ``cran``, ``bioc`` or ``github``
    entries of ``config`` is truthy.  Otherwise writes an install script into
    a temporary directory and runs it with the first available Rscript,
    trying the conda-adjacent binary before the generally resolved one.
    """
    if not any(config.get(key) for key in ("cran", "bioc", "github")):
        return
    with shared._make_tmp_dir() as tmp_dir:
        with cd(tmp_dir):
            # Create an Rscript file with install details.
            out_file = os.path.join(tmp_dir, "install_packages.R")
            _make_install_script(out_file, config)
            # Both lookups are performed before choosing, conda first.
            found = []
            conda_bin = shared._conda_cmd(env)
            if conda_bin:
                rscript_path = os.path.join(os.path.dirname(conda_bin), "Rscript")
                found.append(fabutils.find_cmd(env, rscript_path, "--version"))
            found.append(fabutils.find_cmd(env, "Rscript", "--version"))
            chosen = next((cmd for cmd in found if cmd), None)
            if chosen is None:
                env.logger.warn(
                    "Rscript not found; skipping install of R libraries.")
            else:
                env.safe_run("%s %s" % (chosen, out_file))
            # Get rid of the generated script in either case.
            env.safe_run("rm -f %s" % out_file)
def _select_by_gid(env, orig_file):
    """Filter ``orig_file`` down to the lines matching the genome's naming.

    For ``hg19`` only lines starting with ``chr`` are kept; for ``GRCh37``
    those lines are dropped.  Returns the output file name.

    NOTE(review): ``gid`` and ``out_file`` are not defined in this function;
    presumably they are supplied by an enclosing scope -- confirm.
    """
    if gid != "hg19":
        assert gid == "GRCh37"
        grep_template = "grep -v ^chr %s > %s"
    else:
        grep_template = "grep ^chr %s > %s"
    env.safe_run(grep_template % (orig_file, out_file))
    return out_file
def _download_background_vcf(gid):
    """Download background file of variants to use in calling.

    Only ``GRCh37`` is handled.  The bgzipped VCF and its tabix index are
    fetched with ``wget -c`` unless the ``.gz`` file is already present.
    """
    base_url = "https://s3.amazonaws.com/biodata/variants"
    base_name = "background-diversity-1000g.vcf"
    if gid not in ["GRCh37"]:
        return
    if env.safe_exists("{0}.gz".format(base_name)):
        return
    for ext in ["gz", "gz.tbi"]:
        env.safe_run("wget -c {0}/{1}.{2}".format(base_url, base_name, ext))
def _fix_chrom_names(env, orig_file):
    """Re-compress ``orig_file`` with bgzip, renaming contigs for hg19.

    For ``hg19`` lines starting with GL, NC or hs are dropped and the
    remaining lines are prefixed with ``chr``; for ``GRCh37`` the content is
    passed through unchanged.  Returns the output file name.

    NOTE(review): ``gid`` and ``out_file`` are not defined in this function;
    presumably they are supplied by an enclosing scope -- confirm.
    """
    if gid != "hg19":
        assert gid == "GRCh37"
        convert_cmd = ""
    else:
        convert_cmd = "| grep -v ^GL | grep -v ^NC | grep -v ^hs | sed 's/^/chr/'"
    pipeline = "zcat %s %s | bgzip -c > %s" % (orig_file, convert_cmd, out_file)
    env.safe_run(pipeline)
    return out_file
def _create_local_virtualenv(target_dir):
    """Create virtualenv in target directory for non-sudo installs.

    Skipped when ``<target_dir>/bin/python`` already exists; otherwise
    ``virtualenv.py`` is downloaded into a temporary directory and run
    against ``target_dir``.
    """
    url = "https://raw.github.com/pypa/virtualenv/master/virtualenv.py"
    python_bin = os.path.join(target_dir, "bin", "python")
    if os.path.exists(python_bin):
        return
    with _make_tmp_dir() as work_dir:
        with cd(work_dir):
            # NOTE(review): --no-check-certificate skips TLS verification for
            # a script that is executed on the next line.
            env.safe_run("wget --no-check-certificate %s" % url)
            env.safe_run("python virtualenv.py %s" % target_dir)
def _dbsnp_mouse(env, gid):
    """Retrieve resources for mouse variant analysis from custom S3 biodata bucket.

    For every VCF registered for ``gid`` the file and its ``.idx`` companion
    are fetched as ``.gz`` and gunzipped, unless already present.  Raises
    ``KeyError`` for a ``gid`` without registered files.
    """
    remote_dir = "https://s3.amazonaws.com/biodata/variants/"
    files = {"mm10": ["mm10-dbSNP-2013-09-12.vcf"]}
    wanted = [base + suffix for base in files[gid] for suffix in ["", ".idx"]]
    for fname in wanted:
        if env.safe_exists(fname):
            continue
        remote_url = "%s%s.gz" % (remote_dir, fname)
        fetched = shared._remote_fetch(env, remote_url)
        env.safe_run("gunzip %s" % fetched)
def _dbsnp_mouse(env, gid):
    """Retrieve resources for mouse variant analysis from custom S3 biodata bucket.

    Each VCF listed for ``gid``, plus its ``.idx`` index, is downloaded in
    gzipped form and decompressed when it is not already available.  An
    unlisted ``gid`` raises ``KeyError``.
    """
    remote_dir = "https://s3.amazonaws.com/biodata/variants/"
    files = {"mm10": ["mm10-dbSNP-2013-09-12.vcf"]}
    for vcf_name in files[gid]:
        for fname in (vcf_name, vcf_name + ".idx"):
            already_there = env.safe_exists(fname)
            if not already_there:
                gz_url = "%s%s.gz" % (remote_dir, fname)
                out_file = shared._remote_fetch(env, gz_url)
                env.safe_run("gunzip %s" % out_file)
def _dbsnp_mouse(env, gid):
    """Retrieve resources for mouse variant analysis from custom S3 biodata bucket.

    For every VCF registered for ``gid`` the file and its ``.idx`` companion
    are downloaded with ``wget`` as ``.gz`` and gunzipped in the current
    directory, unless already present.  Raises ``KeyError`` for a ``gid``
    without registered files.
    """
    remote_dir = "https://s3.amazonaws.com/biodata/variants/"
    files = {"mm10": ["mm10-dbSNP-2013-09-12.vcf"]}
    targets = [base + suffix for base in files[gid] for suffix in ["", ".idx"]]
    for fname in targets:
        if env.safe_exists(fname):
            continue
        url = "%s%s.gz" % (remote_dir, fname)
        local_gz = os.path.basename(url)
        env.safe_run("wget -O %s -c %s" % (local_gz, url))
        env.safe_run("gunzip %s" % local_gz)
def _make_install_script(out_file, config):
    """Write an R script to ``out_file`` that installs the configured packages.

    The script points R at ``<system_install>/lib/R/site-library``, sets the
    CRAN mirror from ``config["cranrepo"]``, sources ``config["biocrepo"]``
    and then installs, in order, the ``cran``, ``bioc`` and
    ``cran-after-bioc`` package lists.  Packages already reported by
    ``installed.packages()`` are skipped.  Unless ``update_packages`` is set
    to a false value, each installer first runs ``update.packages``.

    ``cranrepo`` and ``biocrepo`` are required keys; the package lists are
    optional and may be missing, empty or ``None``.
    """
    def _r_strings(pkgs):
        # Render package names as the comma-separated body of an R c(...) call.
        return ", ".join('"%s"' % p for p in pkgs)

    # Start from an empty file so repeated runs do not accumulate content.
    if env.safe_exists(out_file):
        env.safe_run("rm -f %s" % out_file)
    env.safe_run("touch %s" % out_file)
    lib_loc = os.path.join(env.system_install, "lib", "R", "site-library")
    env.safe_sudo("mkdir -p %s" % lib_loc)
    # Ownership change is best effort; a failure only produces a warning.
    with settings(warn_only=True):
        env.safe_sudo("chown -R %s %s" % (env.user, lib_loc))
    repo_info = """
    .libPaths(c("%s"))
    library(methods)
    cran.repos <- getOption("repos")
    cran.repos["CRAN" ] <- "%s"
    options(repos=cran.repos)
    source("%s")
    """ % (lib_loc, config["cranrepo"], config["biocrepo"])
    env.safe_append(out_file, repo_info)
    install_fn = """
    repo.installer <- function(repos, install.fn) {
      %s
      maybe.install <- function(pname) {
        if (!(pname %%in%% installed.packages()))
          install.fn(pname)
      }
    }
    """
    if config.get("update_packages", True):
        update_str = """
      update.packages(lib.loc="%s", repos=repos, ask=FALSE)
      """ % lib_loc
    else:
        update_str = "\n"
    env.safe_append(out_file, install_fn % update_str)
    # Always define std.installer: the cran-after-bioc section reuses it even
    # when no plain CRAN packages are configured.
    std_install = """
    std.pkgs <- c(%s)
    std.installer = repo.installer(cran.repos, install.packages)
    lapply(std.pkgs, std.installer)
    """ % _r_strings(config.get("cran") or [])
    env.safe_append(out_file, std_install)
    bioc_pkgs = config.get("bioc") or []
    if bioc_pkgs:
        bioc_install = """
    bioc.pkgs <- c(%s)
    bioc.installer = repo.installer(biocinstallRepos(), biocLite)
    lapply(bioc.pkgs, bioc.installer)
    """ % _r_strings(bioc_pkgs)
        env.safe_append(out_file, bioc_install)
    cran_late_pkgs = config.get("cran-after-bioc")
    if cran_late_pkgs:
        std2_install = """
    std2.pkgs <- c(%s)
    lapply(std2.pkgs, std.installer)
    """ % _r_strings(cran_late_pkgs)
        env.safe_append(out_file, std2_install)
def download_dbnsfp(genomes):
    """Back compatible download target for dbNSFP, to be moved to GGD recipes.

    ``genomes`` is iterated as ``(orgname, gid, manager)`` triples.  For each
    hg19/GRCh37 entry whose manager config enables ``dbnsfp`` the variation
    directory is created if missing and the download runs inside it.
    """
    supported = set(["hg19", "GRCh37"])
    genome_dir = os.path.join(env.data_files, "genomes")
    for orgname, gid, manager in genomes:
        if gid not in supported or not manager.config.get("dbnsfp"):
            continue
        vrn_dir = os.path.join(genome_dir, orgname, gid, "variation")
        if not env.safe_exists(vrn_dir):
            env.safe_run('mkdir -p %s' % vrn_dir)
        with cd(vrn_dir):
            _download_dbnsfp(env, gid, manager.config)
def _download_lcrs_custom(env, gid):
    """Retrieve low complexity regions from other sources.

    mm10 from Brent Pedersen:
    http://figshare.com/articles/LCR_mm10_bed_gz/1180124

    Genomes without a registered URL are skipped.  For the others the
    download is re-compressed with bgzip into ``LCR.bed.gz`` and indexed with
    tabix, unless that file already exists.
    """
    urls = {"mm10": "http://files.figshare.com/1688228/LCR_mm10.bed.gz"}
    out_file = "LCR.bed.gz"
    cur_url = urls.get(gid)
    if not cur_url or env.safe_exists(out_file):
        return

    def _bgzip_file(env, orig_file):
        # tabix needs BGZF blocks, so plain gzip input is re-compressed.
        env.safe_run("zcat %s | bgzip -c > %s" % (orig_file, out_file))
        return out_file

    shared._remote_fetch(env, cur_url, fix_fn=_bgzip_file)
    # The file is BED, so index it with the bed preset; the vcf preset treats
    # column 2 as a 1-based position instead of a 0-based interval start.
    env.safe_run("tabix -p bed -f %s" % out_file)
def download_dbsnp(genomes, bundle_version, dbsnp_version):
    """Download and install dbSNP variation data for supplied genomes.

    ``genomes`` is iterated as ``(orgname, gid, manager)`` triples; entries
    whose manager config does not enable ``dbsnp`` are skipped.  Human builds
    go through ``_dbsnp_human`` and mm10/canFam3 through ``_dbsnp_custom``;
    for any other build only the variation directory is created.
    """
    human_builds = ["GRCh37", "hg19"]
    custom_builds = ["mm10", "canFam3"]
    genome_dir = os.path.join(env.data_files, "genomes")
    for orgname, gid, manager in genomes:
        if not manager.config.get("dbsnp", False):
            continue
        vrn_dir = os.path.join(genome_dir, orgname, gid, "variation")
        if not env.safe_exists(vrn_dir):
            env.safe_run("mkdir -p %s" % vrn_dir)
        with cd(vrn_dir):
            if gid in human_builds:
                _dbsnp_human(env, gid, manager, bundle_version, dbsnp_version)
            elif gid in custom_builds:
                _dbsnp_custom(env, gid)
def _download_broad_bundle(gid, bundle_version, name, ext):
    """Fetch one VCF from the Broad resource bundle and return its local name.

    The local name is the bundle file name with the ``.<gid>`` and ``.sites``
    parts removed.  A failed download only logs a warning; the local name is
    returned in every case.
    """
    broad_fname = "{name}.{gid}.vcf{ext}".format(gid=gid, name=name, ext=ext)
    fname = broad_fname.replace(".{0}".format(gid), "").replace(".sites", "")
    base_url = ("ftp://gsapubftp-anonymous:@ftp.broadinstitute.org/bundle/"
                + "{bundle}/{gid}/{fname}.gz".format(
                    bundle=bundle_version, fname=broad_fname, gid=gid))
    if env.safe_exists(fname):
        return fname
    out_file = shared._remote_fetch(env, base_url, allow_fail=True)
    if not out_file:
        env.logger.warn("dbSNP resources not available for %s" % gid)
        return fname
    env.safe_run("gunzip %s" % out_file)
    env.safe_run("mv %s %s" % (broad_fname, fname))
    return fname
def _make_install_script(out_file, config):
    """Write an R script to ``out_file`` that installs the configured packages.

    The script points R at ``<system_install>/lib/R/site-library``, sets the
    CRAN mirror from ``config["cranrepo"]``, sources ``config["biocrepo"]``
    and then installs, in order, the ``cran``, ``bioc`` and
    ``cran-after-bioc`` package lists.  Packages already reported by
    ``installed.packages()`` are skipped.  Unless ``update_packages`` is set
    to a false value, each installer first runs ``update.packages``.

    ``cranrepo`` and ``biocrepo`` are required keys; the package lists are
    optional and may be missing, empty or ``None``.
    """
    def _r_strings(pkgs):
        # Render package names as the comma-separated body of an R c(...) call.
        return ", ".join('"%s"' % p for p in pkgs)

    # Start from an empty file so repeated runs do not accumulate content.
    if env.safe_exists(out_file):
        env.safe_run("rm -f %s" % out_file)
    env.safe_run("touch %s" % out_file)
    lib_loc = os.path.join(env.system_install, "lib", "R", "site-library")
    env.safe_sudo("mkdir -p %s" % lib_loc)
    repo_info = """
    .libPaths(c("%s"))
    library(methods)
    cran.repos <- getOption("repos")
    cran.repos["CRAN" ] <- "%s"
    options(repos=cran.repos)
    source("%s")
    """ % (lib_loc, config["cranrepo"], config["biocrepo"])
    env.safe_append(out_file, repo_info)
    install_fn = """
    repo.installer <- function(repos, install.fn) {
      %s
      maybe.install <- function(pname) {
        if (!(pname %%in%% installed.packages()))
          install.fn(pname)
      }
    }
    """
    if config.get("update_packages", True):
        update_str = """
      update.packages(lib.loc="%s", repos=repos, ask=FALSE)
      """ % lib_loc
    else:
        update_str = "\n"
    env.safe_append(out_file, install_fn % update_str)
    # Always define std.installer: the cran-after-bioc section reuses it even
    # when no plain CRAN packages are configured.
    std_install = """
    std.pkgs <- c(%s)
    std.installer = repo.installer(cran.repos, install.packages)
    lapply(std.pkgs, std.installer)
    """ % _r_strings(config.get("cran") or [])
    env.safe_append(out_file, std_install)
    bioc_pkgs = config.get("bioc") or []
    if bioc_pkgs:
        bioc_install = """
    bioc.pkgs <- c(%s)
    bioc.installer = repo.installer(biocinstallRepos(), biocLite)
    lapply(bioc.pkgs, bioc.installer)
    """ % _r_strings(bioc_pkgs)
        env.safe_append(out_file, bioc_install)
    cran_late_pkgs = config.get("cran-after-bioc")
    if cran_late_pkgs:
        std2_install = """
    std2.pkgs <- c(%s)
    lapply(std2.pkgs, std.installer)
    """ % _r_strings(cran_late_pkgs)
        env.safe_append(out_file, std2_install)
def _ensembl_vcf(env, gid, manager):
    """Fetch Ensembl VCF file (available from release 71) and do tabix indexing.

    The URL is built from the manager's base URL, release number and
    organism.  For any section other than ``"standard"`` the ``variation/``
    path component is dropped and the file name is lower-cased.  Nothing is
    downloaded when the file already exists locally.

    ``gid`` is accepted for signature compatibility and is not used.
    """
    fname = "%s.vcf.gz" % (manager._organism)
    download_url = manager._base_url
    section = "variation/"
    # Compare by value: identity ("is") on strings depends on interning and
    # can misclassify a section name that equals "standard".
    if manager._section != "standard":
        section = ""
        fname = fname.lower()
    download_url += "release-%s/%svcf/%s/%s" % (manager._release_number, section,
                                                manager._organism.lower(), fname)
    if not env.safe_exists(fname):
        shared._remote_fetch(env, download_url)
        env.safe_run("tabix -f -p vcf %s" % fname)
def r_library_installer(config):
    """Install R libraries using CRAN and Bioconductor.

    Generates the install script inside a temporary directory, runs it with
    the Rscript located by ``fabutils.find_cmd`` (logging a warning when none
    is found) and removes the script afterwards.
    """
    with shared._make_tmp_dir() as tmp_dir:
        with cd(tmp_dir):
            # Create an Rscript file with install details.
            script_path = os.path.join(tmp_dir, "install_packages.R")
            _make_install_script(script_path, config)
            rscript = fabutils.find_cmd(env, "Rscript", "--version")
            if not rscript:
                env.logger.warn("Rscript not found; skipping install of R libraries.")
            else:
                env.safe_run("%s %s" % (rscript, script_path))
            # The script is single-use; remove it whether or not it ran.
            env.safe_run("rm -f %s" % script_path)
def _download_broad_bundle(gid, bundle_version, name, ext):
    """Fetch one VCF from the Broad resource bundle and return its local name.

    The local name is the bundle file name with the ``.<gid>`` and ``.sites``
    parts removed.  The download runs under ``warn_only`` so a failure only
    logs a warning; the local name is returned in every case.
    """
    broad_fname = "{name}.{gid}.vcf{ext}".format(gid=gid, name=name, ext=ext)
    fname = broad_fname.replace(".{0}".format(gid), "").replace(".sites", "")
    base_url = ("ftp://gsapubftp-anonymous:@ftp.broadinstitute.org/bundle/"
                + "{bundle}/{gid}/{fname}.gz".format(
                    bundle=bundle_version, fname=broad_fname, gid=gid))
    if env.safe_exists(fname):
        return fname
    with warn_only():
        dl = env.safe_run("wget -c %s" % base_url)
    if not dl.succeeded:
        env.logger.warn("dbSNP resources not available for %s" % gid)
        return fname
    env.safe_run("gunzip %s" % os.path.basename(base_url))
    env.safe_run("mv %s %s" % (broad_fname, fname))
    return fname
def _install_modules_configure_make(self, env):
    """Configure, build and install environment modules.

    Differences from standard _configure_make():
      - TODO: ./configure with destination modulefile directory on shared filesystem
      - add modules to profile for all users

    After ``make install`` the modules profile script is copied to
    ``/etc/profile.d/modules.sh`` and ``Modules/default`` under
    ``env.system_install`` is symlinked to the version named by
    ``env.environment_modules_version``.
    """
    # currently putting module files in directory structure under env.system_install
    # it would be better to store them on a filesystem shared with worker nodes; this is harder
    env.safe_run("export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:%s/lib/pkgconfig && "
                 "./configure --prefix=%s " % (env.system_install, env.system_install))
    # Go through the env wrapper like every other step instead of calling
    # the bare run() directly.
    env.safe_run("make")
    env.safe_sudo("make install")
    env.safe_sudo("cp etc/global/profile.modules /etc/profile.d/modules.sh")
    modules_root = os.path.join(env.system_install, "Modules")
    env.safe_sudo("ln -s {0} {1}".format(
        os.path.join(modules_root, env.environment_modules_version),
        os.path.join(modules_root, "default")))
def download_dbsnp(genomes, bundle_version, dbsnp_version):
    """Download and install dbSNP variation data for supplied genomes.

    Each ``(orgname, gid, manager)`` entry with ``dbsnp`` enabled in the
    manager config gets its variation directory created when missing.
    GRCh37/hg19 are handled by ``_dbsnp_human`` and mm10/canFam3 by
    ``_dbsnp_custom``; other builds get no download.
    """
    genome_dir = os.path.join(env.data_files, "genomes")
    enabled = [(o, g, m) for (o, g, m) in genomes if m.config.get("dbsnp", False)]
    for orgname, gid, manager in enabled:
        vrn_dir = os.path.join(genome_dir, orgname, gid, "variation")
        dir_missing = not env.safe_exists(vrn_dir)
        if dir_missing:
            env.safe_run('mkdir -p %s' % vrn_dir)
        with cd(vrn_dir):
            if gid in ("GRCh37", "hg19"):
                _dbsnp_human(env, gid, manager, bundle_version, dbsnp_version)
            elif gid in ("mm10", "canFam3"):
                _dbsnp_custom(env, gid)
def _download_lcrs(gid):
    """Retrieve low complexity regions from Heng Li's variant analysis paper.

    The hs37d5 BED file is downloaded, adjusted to the naming of ``gid``
    (``hg19`` or ``GRCh37``), re-compressed with bgzip into ``LCR.bed.gz``
    and indexed with tabix.  Skipped when ``LCR.bed.gz`` already exists.
    """
    lcr_url = "https://github.com/lh3/varcmp/raw/master/scripts/LCR-hs37d5.bed.gz"
    out_file = "LCR.bed.gz"
    if env.safe_exists(out_file):
        return

    def _fix_chrom_names(env, orig_file):
        if gid == "hg19":
            # Drop GL/NC/hs contigs, then add the chr prefix to the rest.
            convert_cmd = "| grep -v ^GL | grep -v ^NC | grep -v ^hs | sed 's/^/chr/'"
        else:
            assert gid == "GRCh37"
            convert_cmd = ""
        env.safe_run("zcat %s %s | bgzip -c > %s" % (orig_file, convert_cmd, out_file))
        return out_file

    shared._remote_fetch(env, lcr_url, fix_fn=_fix_chrom_names)
    # The file is BED, so index it with the bed preset; the vcf preset treats
    # column 2 as a 1-based position instead of a 0-based interval start.
    env.safe_run("tabix -p bed -f %s" % out_file)
def _download_ancestral(env, gid, gconfig):
    """Download ancestral genome sequence for loss of function evaluation.

    Used by LOFTEE VEP plugin: https://github.com/konradjk/loftee

    ``GRCh37`` downloads the sequence and its ``.fai`` index; ``hg19``
    symlinks to the GRCh37 copies.  Files already present are left alone and
    other genomes are ignored.  ``gconfig`` is not used.
    """
    base_url = "http://www.broadinstitute.org/~konradk/loftee/human_ancestor.fa.rz"
    if gid not in ("GRCh37", "hg19"):
        return
    local_base = os.path.basename(base_url)
    for ext in ["", ".fai"]:
        outfile = local_base + ext
        if env.safe_exists(outfile):
            continue
        if gid == "GRCh37":
            shared._remote_fetch(env, base_url + ext, samedir=True)
        else:
            # hg19: symlink to GRCh37 download
            env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
def _configure_and_install_native_packages(env, pkg_install):
    """Set up native package repositories and install the requested packages.

    A leading ``~`` in ``env.shell_config`` is first expanded using the
    output of ``echo $HOME``.  The package manager is then chosen from
    ``env.distribution``; Arch is accepted but installs nothing, and an
    unknown distribution raises ``NotImplementedError``.
    """
    home_dir = env.safe_run("echo $HOME")
    if home_dir and env.shell_config.startswith("~"):
        nonhome = env.shell_config.split("~/", 1)[-1]
        env.shell_config = os.path.join(home_dir, nonhome)
    distribution = env.distribution
    if distribution in ("debian", "ubuntu"):
        _setup_apt_sources()
        _setup_apt_automation()
        _add_apt_gpg_keys()
        _apt_packages(pkg_install)
    elif distribution in ("centos", "scientificlinux"):
        _setup_yum_sources()
        _yum_packages(pkg_install)
        if env.edition.short_name != "minimal":
            _setup_yum_bashrc()
    elif distribution == "arch":
        pass  # No package support for Arch yet
    elif distribution == "macosx":
        brew.install_packages(env, pkg_install)
    else:
        raise NotImplementedError("Unknown target distribution")
def _configure_and_install_native_packages(env, pkg_install):
    """Set up native package repositories and install the requested packages.

    A leading ``~`` in ``env.shell_config`` is first expanded using the
    output of ``echo $HOME``.  The package manager is then chosen from
    ``env.distribution``; Arch and SUSE are accepted but install nothing, and
    an unknown distribution raises ``NotImplementedError``.
    """
    # The Fabric ``env`` is deliberately not imported here: doing so would
    # rebind, and silently discard, the ``env`` argument passed by the caller.
    from cloudbio.package import brew
    from cloudbio.package.deb import (_apt_packages, _add_apt_gpg_keys,
                                      _setup_apt_automation, _setup_apt_sources)
    from cloudbio.package.rpm import (_yum_packages, _setup_yum_bashrc,
                                      _setup_yum_sources)

    home_dir = env.safe_run("echo $HOME")
    if home_dir:
        if env.shell_config.startswith("~"):
            nonhome = env.shell_config.split("~/", 1)[-1]
            env.shell_config = os.path.join(home_dir, nonhome)
    if env.distribution in ["debian", "ubuntu"]:
        _setup_apt_sources()
        _setup_apt_automation()
        _add_apt_gpg_keys()
        _apt_packages(pkg_install)
    elif env.distribution in ["centos", "scientificlinux"]:
        _setup_yum_sources()
        _yum_packages(pkg_install)
        _setup_yum_bashrc()
    elif env.distribution in ["arch", "suse"]:
        pass  # No package support for Arch, SUSE yet
    elif env.distribution == "macosx":
        brew.install_packages(env, pkg_install)
    else:
        raise NotImplementedError("Unknown target distribution")
def _configure_and_install_native_packages(env, pkg_install):
    """Set up native package repositories and install the requested packages.

    Logs the target distribution, expands a leading ``~`` in
    ``env.shell_config`` using the output of ``echo $HOME`` and then picks the
    package manager from ``env.distribution``.  Arch is accepted but installs
    nothing; an unknown distribution raises ``NotImplementedError``.
    """
    distribution = env.distribution
    env.logger.debug("Configure and install native packages for distribution: " + distribution)
    home_dir = env.safe_run("echo $HOME")
    if home_dir and env.shell_config.startswith("~"):
        nonhome = env.shell_config.split("~/", 1)[-1]
        env.shell_config = os.path.join(home_dir, nonhome)
    if distribution in ("debian", "ubuntu"):
        _setup_apt_sources()
        _setup_apt_automation()
        _add_apt_gpg_keys()
        _apt_packages(pkg_install)
    elif distribution in ("centos", "scientificlinux"):
        _setup_yum_sources()
        _yum_packages(pkg_install)
        if env.edition.short_name != "minimal":
            _setup_yum_bashrc()
    elif distribution == "arch":
        pass  # No package support for Arch yet
    elif distribution == "macosx":
        brew.install_packages(env, pkg_install)
    else:
        raise NotImplementedError("Unknown target distribution")
def _download_cosmic(gid):
    """Prepared versions of COSMIC, pre-sorted and indexed.

    utils/prepare_cosmic.py handles the work of creating the VCFs from
    standard COSMIC resources.

    Only hg19 and GRCh37 are supported; other genomes are ignored.  The
    gzipped VCF is fetched (unless already downloaded) and decompressed, and
    the matching ``.idx`` file is fetched when missing.
    """
    base_url = "https://s3.amazonaws.com/biodata/variants"
    version = "v67_20131024"
    supported = ["hg19", "GRCh37"]
    if gid not in supported:
        return
    url = "%s/cosmic-%s-%s.vcf.gz" % (base_url, version, gid)
    gzip_fname = os.path.basename(url)
    fname = os.path.splitext(gzip_fname)[0]
    if not env.safe_exists(fname):
        if not env.safe_exists(gzip_fname):
            shared._remote_fetch(env, url)
        # Decompress the archive that was downloaded, not the name of the
        # file it will produce.
        env.safe_run("gunzip %s" % gzip_fname)
    if not env.safe_exists(fname + ".idx"):
        shared._remote_fetch(env, url.replace(".gz", ".idx"))
def find_cmd(env, cmd, args):
    """Retrieve location of a command, checking in installation directory.

    The copy under ``<env.system_install>/bin`` is tried before the bare
    ``cmd``.  A candidate is accepted when running it with ``args`` succeeds;
    the accepted command string is returned, or ``None`` if both fail.
    """
    candidates = [os.path.join(env.system_install, "bin", cmd), cmd]
    for candidate in candidates:
        with quiet():
            probe = env.safe_run("%s %s" % (candidate, args))
        if probe.succeeded:
            return candidate
    return None
def local_sed(filename, before, after, limit='', use_sudo=False, backup='.bak', flags='', shell=False):
    """
    Run a search-and-replace on ``filename`` with given regex patterns.

    From main fabric contrib, modified to handle local.

    ``before`` and ``after`` become the two halves of a ``sed`` ``s///g``
    expression, ``limit`` an optional address prefix and ``flags`` extra
    substitution flags.  A copy of the file with the ``backup`` suffix is
    kept.  The command runs through ``env.safe_sudo`` when ``use_sudo`` is
    set, otherwise ``env.safe_run``, and that call's result is returned.
    """
    runner = env.safe_sudo if use_sudo else env.safe_run
    # Characters to be escaped in both
    for special in "/'":
        escaped = r'\%s' % special
        before = before.replace(special, escaped)
        after = after.replace(special, escaped)
    # Characters to be escaped in replacement only (they're useful in regexen
    # in the 'before' part)
    for paren in "()":
        after = after.replace(paren, r'\%s' % paren)
    if limit:
        limit = r'/%s/ ' % limit
    context = {}
    context['script'] = r"'%ss/%s/%s/%sg'" % (limit, before, after, flags)
    context['filename'] = '"$(echo %s)"' % filename
    context['backup'] = backup
    # Test the OS because of differences between sed versions
    with hide('running', 'stdout'):
        platform = env.safe_run("uname")
    if platform not in ('NetBSD', 'OpenBSD', 'QNX'):
        context['extended_regex'] = '-E' if platform == 'Darwin' else '-r'
        expr = r"sed -i%(backup)s %(extended_regex)s -e %(script)s %(filename)s"
    else:
        # Attempt to protect against failures/collisions
        hasher = hashlib.sha1()
        hasher.update(env.host_string)
        hasher.update(filename)
        context['tmp'] = "/tmp/%s" % hasher.hexdigest()
        # Use temp file to work around lack of -i
        expr = r"""cp -p %(filename)s %(tmp)s \
&& sed -r -e %(script)s %(filename)s > %(tmp)s \
&& cp -p %(filename)s %(filename)s%(backup)s \
&& mv %(tmp)s %(filename)s"""
    return runner(expr % context, shell=shell)
def r_library_installer(config):
    """Install R libraries using CRAN and Bioconductor.

    Writes ``install_packages.R`` in the current directory, runs it under
    sudo with the Rscript located by ``fabutils.find_cmd`` (logging a warning
    when none is found) and removes the script afterwards.  Packages install
    into ``<system_install>/lib/R/site-library``.
    """
    # Create an Rscript file with install details.
    out_file = "install_packages.R"
    if env.safe_exists(out_file):
        env.safe_run("rm -f %s" % out_file)
    env.safe_run("touch %s" % out_file)
    lib_loc = os.path.join(env.system_install, "lib", "R", "site-library")
    env.safe_sudo("mkdir -p %s" % lib_loc)
    repo_info = """
    .libPaths(c("%s"))
    library(methods)
    cran.repos <- getOption("repos")
    cran.repos["CRAN" ] <- "%s"
    options(repos=cran.repos)
    source("%s")
    """ % (lib_loc, config["cranrepo"], config["biocrepo"])
    env.safe_append(out_file, repo_info)
    install_fn = """
    repo.installer <- function(repos, install.fn) {
      %s
      maybe.install <- function(pname) {
        if (!(pname %%in%% installed.packages()))
          install.fn(pname)
      }
    }
    """
    update_str = "\n"
    if config.get("update_packages", True):
        update_str = """
      update.packages(lib.loc="%s", repos=repos, ask=FALSE)
      """ % lib_loc
    env.safe_append(out_file, install_fn % update_str)
    cran_names = ", ".join('"%s"' % p for p in config['cran'])
    std_install = """
    std.pkgs <- c(%s)
    std.installer = repo.installer(cran.repos, install.packages)
    lapply(std.pkgs, std.installer)
    """ % cran_names
    env.safe_append(out_file, std_install)
    if len(config.get("bioc", [])) > 0:
        bioc_names = ", ".join('"%s"' % p for p in config['bioc'])
        bioc_install = """
    bioc.pkgs <- c(%s)
    bioc.installer = repo.installer(biocinstallRepos(), biocLite)
    lapply(bioc.pkgs, bioc.installer)
    """ % bioc_names
        env.safe_append(out_file, bioc_install)
    # run the script and then get rid of it
    rscript = fabutils.find_cmd(env, "Rscript", "--version")
    if not rscript:
        env.logger.warn("Rscript not found; skipping install of R libraries.")
    else:
        env.safe_sudo("%s %s" % (rscript, out_file))
    env.safe_run("rm -f %s" % out_file)
def local_sed(filename, before, after, limit="", use_sudo=False, backup=".bak", flags="", shell=False):
    """
    Run a search-and-replace on ``filename`` with given regex patterns.

    From main fabric contrib, modified to handle local.

    Builds a ``sed`` ``s/before/after/g`` expression (optionally restricted
    by ``limit`` and extended with ``flags``), keeps a copy of the file with
    the ``backup`` suffix and returns the result of running the command via
    ``env.safe_sudo`` or ``env.safe_run``.
    """
    def _backslash(text, chars):
        # Prefix each listed character in ``text`` with a backslash.
        for char in chars:
            text = text.replace(char, r"\%s" % char)
        return text

    func = env.safe_sudo if use_sudo else env.safe_run
    # Characters to be escaped in both
    before = _backslash(before, "/'")
    # The replacement additionally gets its parentheses escaped (they're
    # useful in regexen in the 'before' part)
    after = _backslash(_backslash(after, "/'"), "()")
    if limit:
        limit = r"/%s/ " % limit
    context = {
        "script": r"'%ss/%s/%s/%sg'" % (limit, before, after, flags),
        "filename": '"$(echo %s)"' % filename,
        "backup": backup,
    }
    # Test the OS because of differences between sed versions
    with hide("running", "stdout"):
        platform = env.safe_run("uname")
    lacks_inplace = platform in ("NetBSD", "OpenBSD", "QNX")
    if lacks_inplace:
        # Attempt to protect against failures/collisions
        digest = hashlib.sha1()
        digest.update(env.host_string)
        digest.update(filename)
        context["tmp"] = "/tmp/%s" % digest.hexdigest()
        # Use temp file to work around lack of -i
        expr = r"""cp -p %(filename)s %(tmp)s \
&& sed -r -e %(script)s %(filename)s > %(tmp)s \
&& cp -p %(filename)s %(filename)s%(backup)s \
&& mv %(tmp)s %(filename)s"""
    else:
        if platform == "Darwin":
            context["extended_regex"] = "-E"
        else:
            context["extended_regex"] = "-r"
        expr = r"sed -i%(backup)s %(extended_regex)s -e %(script)s %(filename)s"
    command = expr % context
    return func(command, shell=shell)
def r_library_installer(config):
    """Install R libraries using CRAN and Bioconductor.

    Writes ``install_packages.R`` in the current directory, runs it with
    ``Rscript`` under sudo and removes it afterwards.  Each listed package is
    updated when already installed and installed otherwise; unless
    ``update_packages`` is set to a false value the script ends with a
    general ``update.packages`` pass.
    """
    # Create an Rscript file with install details.
    out_file = "install_packages.R"
    if env.safe_exists(out_file):
        env.safe_run("rm -f %s" % out_file)
    env.safe_run("touch %s" % out_file)
    repo_info = """
    cran.repos <- getOption("repos")
    cran.repos["CRAN" ] <- "%s"
    options(repos=cran.repos)
    source("%s")
    """ % (config["cranrepo"], config["biocrepo"])
    env.safe_append(out_file, repo_info)
    # Appended verbatim (no % formatting), so R's %in% is written unescaped.
    install_fn = """
    repo.installer <- function(repos, install.fn) {
      update.or.install <- function(pname) {
        if (pname %in% installed.packages())
          update.packages(lib.loc=c(pname), repos=repos, ask=FALSE)
        else
          install.fn(pname)
      }
    }
    """
    env.safe_append(out_file, install_fn)
    cran_names = ", ".join('"%s"' % p for p in config['cran'])
    std_install = """
    std.pkgs <- c(%s)
    std.installer = repo.installer(cran.repos, install.packages)
    lapply(std.pkgs, std.installer)
    """ % cran_names
    env.safe_append(out_file, std_install)
    if len(config.get("bioc", [])) > 0:
        bioc_names = ", ".join('"%s"' % p for p in config['bioc'])
        bioc_install = """
    bioc.pkgs <- c(%s)
    bioc.installer = repo.installer(biocinstallRepos(), biocLite)
    lapply(bioc.pkgs, bioc.installer)
    """ % bioc_names
        env.safe_append(out_file, bioc_install)
    if config.get("update_packages", True):
        final_update = """
    update.packages(repos=biocinstallRepos(), ask=FALSE)
    update.packages(ask=FALSE)
    """
        env.safe_append(out_file, final_update)
    # run the script and then get rid of it
    env.safe_sudo("Rscript %s" % out_file)
    env.safe_run("rm -f %s" % out_file)
def _download_dbnsfp(env, gid, gconfig):
    """Download and prepare dbNSFP functional prediction resources if configured.

    Feeds into VEP for annotating VCF files:
    https://sites.google.com/site/jpopgen/dbNSFP
    https://github.com/ensembl-variation/VEP_plugins/blob/master/dbNSFP.pm

    Nothing happens unless ``gconfig`` enables ``dbnsfp``.  ``GRCh37``
    downloads the zip archive, concatenates the per-chromosome files into one
    bgzipped file and indexes it with tabix; ``hg19`` symlinks to the GRCh37
    output and index.
    """
    version = "2.6"
    url = "http://dbnsfp.houstonbioinformatics.org/dbNSFPzip/dbNSFPv%s.zip" % version
    if not gconfig.get("dbnsfp"):
        return
    outfile = "dbNSFP_v%s.gz" % (version)
    index_file = outfile + ".tbi"
    if gid == "GRCh37":  # download and prepare bgzipped output file
        if not env.safe_exists(outfile):
            archive = shared._remote_fetch(env, url, samedir=True)
            outdir = "dbNSFPv%s" % version
            env.safe_run("mkdir -p %s" % outdir)
            env.safe_run("unzip %s -d %s" % (archive, outdir))
            env.safe_run("cat %s/dbNSFP*_variant.chr* | bgzip -c > %s" % (outdir, outfile))
            env.safe_run("rm -f %s/* && rmdir %s" % (outdir, outdir))
            env.safe_run("rm -f %s" % (archive))
        if not env.safe_exists(index_file):
            env.safe_run("tabix -s 1 -b 2 -e 2 -c '#' %s" % outfile)
    elif gid == "hg19":  # symlink to GRCh37 download
        if not env.safe_exists(outfile):
            env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
        if not env.safe_exists(index_file):
            env.safe_run("ln -sf ../../GRCh37/variation/%s.tbi %s.tbi" % (outfile, outfile))
def _download_broad_bundle(gid, bundle_version, name, ext):
    """Provide a bgzipped, tabix-indexed VCF from the Broad resource bundle.

    An existing uncompressed copy is compressed and indexed in place;
    otherwise the file is downloaded, re-compressed with bgzip and indexed.
    A failed download only logs a warning.  Leftover ``.vcf`` and
    ``.vcf.idx`` files are removed and the ``.vcf.gz`` name is returned in
    every case.
    """
    broad_fname = "{name}.{gid}.vcf{ext}".format(gid=gid, name=name, ext=ext)
    fname = broad_fname.replace(".{0}".format(gid), "").replace(".sites", "") + ".gz"
    base_url = ("ftp://gsapubftp-anonymous:@ftp.broadinstitute.org/bundle/"
                + "{bundle}/{gid}/{fname}.gz".format(
                    bundle=bundle_version, fname=broad_fname, gid=gid))
    index_cmd = "tabix -f -p vcf %s" % fname
    # compress and prepare existing uncompressed versions
    plain_vcf = fname.replace(".vcf.gz", ".vcf")
    if env.safe_exists(plain_vcf):
        env.safe_run("bgzip %s" % plain_vcf)
        env.safe_run(index_cmd)
    # otherwise, download and bgzip and tabix index
    if not env.safe_exists(fname):
        out_file = shared._remote_fetch(env, base_url, allow_fail=True)
        if not out_file:
            env.logger.warn("dbSNP resources not available for %s" % gid)
        else:
            env.safe_run("gunzip -c %s | bgzip -c > %s" % (out_file, fname))
            env.safe_run(index_cmd)
            env.safe_run("rm -f %s" % out_file)
    # clean up old files
    for old_suffix in [".vcf", ".vcf.idx"]:
        stale = fname.replace(".vcf.gz", old_suffix)
        if env.safe_exists(stale):
            env.safe_run("rm -f %s" % (stale))
    return fname
def _download_dbnsfp(env, gid, gconfig):
    """Download and prepare dbNSFP functional prediction resources if configured.

    Feeds into VEP for annotating VCF files:
    https://sites.google.com/site/jpopgen/dbNSFP
    https://github.com/ensembl-variation/VEP_plugins/blob/master/dbNSFP.pm

    Does nothing unless ``gconfig`` enables ``dbnsfp``.  For ``GRCh37`` the
    zip archive is downloaded and turned into a single bgzipped, tabix-indexed
    file; for ``hg19`` symlinks to the GRCh37 file and index are created.
    """
    version = "2.5"
    url = "http://dbnsfp.houstonbioinformatics.org/dbNSFPzip/dbNSFPv%s.zip" % version
    if not gconfig.get("dbnsfp"):
        return
    outfile = "dbNSFP_v%s.gz" % (version)
    have_data = lambda: env.safe_exists(outfile)
    have_index = lambda: env.safe_exists(outfile + ".tbi")
    if gid == "GRCh37":  # download and prepare bgzipped output file
        if not have_data():
            archive = shared._remote_fetch(env, url, samedir=True)
            outdir = "dbNSFPv%s" % version
            for cmd in ["mkdir -p %s" % outdir,
                        "unzip %s -d %s" % (archive, outdir),
                        "cat %s/dbNSFP*_variant.chr* | bgzip -c > %s" % (outdir, outfile),
                        "rm -f %s/* && rmdir %s" % (outdir, outdir),
                        "rm -f %s" % (archive)]:
                env.safe_run(cmd)
        if not have_index():
            env.safe_run("tabix -s 1 -b 2 -e 2 -c '#' %s" % outfile)
    elif gid == "hg19":  # symlink to GRCh37 download
        if not have_data():
            env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
        if not have_index():
            env.safe_run("ln -sf ../../GRCh37/variation/%s.tbi %s.tbi" % (outfile, outfile))
def _bgzip_file(env, orig_file):
    """Re-compress ``orig_file`` with bgzip and return the output file name.

    NOTE(review): ``out_file`` is not defined in this function; presumably it
    is supplied by an enclosing scope -- confirm.
    """
    recompress_cmd = "zcat %s | bgzip -c > %s" % (orig_file, out_file)
    env.safe_run(recompress_cmd)
    return out_file
def _download_qsignature(env, gid, gconfig):
    """Download qsignature position file to detect sample problems.

    For ``GRCh37`` the tarball is downloaded, its
    ``qsignature_positions.txt`` is moved to ``qsignature.vcf`` and the
    tarball and extraction directory are removed.  For ``hg19`` a symlink to
    the GRCh37 ``qsignature.vcf`` is created.  Other genomes are ignored.

    :param env: object providing ``safe_exists`` and ``safe_run``
    :param gid: str genome id
    :param gconfig: genome configuration (not used)
    :returns: None
    """
    base_url = "http://downloads.sourceforge.net/project/adamajava/qsignature.tar.bz2"
    outfile = "qsignature.vcf"
    if gid == "GRCh37":
        if not env.safe_exists(outfile):
            tarball = shared._remote_fetch(env, base_url, samedir=True)
            outdir = "qsignature"
            env.safe_run("mkdir -p %s" % outdir)
            env.safe_run("tar -jxf %s -C %s" % (tarball, outdir))
            env.safe_run("mv %s/qsignature_positions.txt %s" % (outdir, outfile))
            env.safe_run("rm -rf %s" % outdir)
            env.safe_run("rm -rf %s" % tarball)
    elif gid == "hg19":  # symlink to GRCh37 download
        # Link the prepared VCF: the tarball is deleted by the GRCh37 branch,
        # so a link to it would dangle.
        if not env.safe_exists(outfile):
            env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
def _download_broad_bundle(gid, bundle_version, name, ext):
    """Provide a bgzipped, tabix-indexed VCF from the Broad resource bundle.

    An existing uncompressed copy is compressed and indexed in place;
    otherwise the file is downloaded, re-compressed with bgzip and indexed.
    Leftover ``.vcf`` and ``.vcf.idx`` files are removed afterwards and the
    ``.vcf.gz`` name is returned.
    """
    # Broad bundle directories have uneven use of ".sites" in VCF files
    # only present in hg19 for non-dbSNP resources
    if gid == "hg19" and not name.startswith("dbsnp"):
        sites = ".sites"
    else:
        sites = ""
    broad_fname = "{name}.{gid}{sites}.vcf{ext}".format(gid=gid, name=name, sites=sites, ext=ext)
    fname = broad_fname.replace(".{0}".format(gid), "").replace(".sites", "") + ".gz"
    base_url = ("ftp://gsapubftp-anonymous:@ftp.broadinstitute.org/bundle/"
                + "{bundle}/{gid}/{fname}.gz".format(
                    bundle=bundle_version, fname=broad_fname, gid=gid))
    index_cmd = "tabix -f -p vcf %s" % fname
    # compress and prepare existing uncompressed versions
    plain_vcf = fname.replace(".vcf.gz", ".vcf")
    if env.safe_exists(plain_vcf):
        env.safe_run("bgzip %s" % plain_vcf)
        env.safe_run(index_cmd)
    # otherwise, download and bgzip and tabix index
    if not env.safe_exists(fname):
        downloaded = shared._remote_fetch(env, base_url)
        env.safe_run("gunzip -c %s | bgzip -c > %s" % (downloaded, fname))
        env.safe_run(index_cmd)
        env.safe_run("rm -f %s" % downloaded)
    # clean up old files
    for old_suffix in [".vcf", ".vcf.idx"]:
        stale = fname.replace(".vcf.gz", old_suffix)
        if env.safe_exists(stale):
            env.safe_run("rm -f %s" % (stale))
    return fname
def _download_qsignature(env, gid, gconfig):
    """Download qsignature position file to detect sample problems.

    For ``GRCh37`` the tarball is downloaded, its
    ``qsignature_positions.txt`` is moved to ``qsignature.vcf`` and the
    tarball and extraction directory are removed.  For ``hg19`` a symlink to
    the GRCh37 ``qsignature.vcf`` is created.  Other genomes are ignored.

    :param env: object providing ``safe_exists`` and ``safe_run``
    :param gid: str genome id
    :param gconfig: genome configuration (not used)
    :returns: None
    """
    base_url = "http://downloads.sourceforge.net/project/adamajava/qsignature.tar.bz2"
    outfile = "qsignature.vcf"
    if gid == "GRCh37":
        if not env.safe_exists(outfile):
            tarball = shared._remote_fetch(env, base_url, samedir=True)
            outdir = "qsignature"
            env.safe_run("mkdir -p %s" % outdir)
            env.safe_run("tar -jxf %s -C %s" % (tarball, outdir))
            env.safe_run("mv %s/qsignature_positions.txt %s" % (outdir, outfile))
            env.safe_run("rm -rf %s" % outdir)
            env.safe_run("rm -rf %s" % tarball)
    elif gid == "hg19":  # symlink to GRCh37 download
        # Link the prepared VCF: the tarball is deleted by the GRCh37 branch,
        # so a link to it would dangle.
        if not env.safe_exists(outfile):
            env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))