def submit_load_dataset_to_es_job_v02(
        dataproc_cluster_name,
        start_with_step=0,
        stop_after_step=None,
        other_load_dataset_to_es_args=(),
        use_seqr_loading_optimized_pipeline=False,  # accepted here because callers in main() pass it
        es_host='localhost',
        es_port='9200'):

    def abs_path(rel_path):
        # must use absolute paths because this script changes the directory all over the place :(
        return os.path.join(CUR_DIR, rel_path)

    pyfiles = ','.join([abs_path(f) for f in ['lib', '../hail_scripts']])
    files = abs_path('configs/luigi.cfg')
    executable = abs_path('seqr_loading.py')

    if stop_after_step == 1:
        task = 'SeqrVCFToMTTask'
    else:
        task = 'SeqrMTToESTask --es-host %(es_host)s --es-port %(es_port)s' % locals()

    # submit job
    run(" ".join(map(str, [
        "hailctl dataproc",
        "submit",
        "%(dataproc_cluster_name)s",
        "%(executable)s",
        "--pyfiles %(pyfiles)s",
        "--files %(files)s",
        "%(task)s --local-scheduler",
    ])) % locals())
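# Usage sketch for the v02 path (the cluster name and host below are hypothetical
# placeholder values, not values from this repo):
#
#   _create_dataproc_cluster_v02("my-loading-cluster", "38")
#   submit_load_dataset_to_es_job_v02(
#       "my-loading-cluster",
#       stop_after_step=1,  # stop after the SeqrVCFToMTTask step
#       es_host="10.4.0.5")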
def _create_dataproc_cluster(dataproc_cluster_name, genome_version, num_workers=2, num_preemptible_workers=12):
    run("python ./gcloud_dataproc/v01/create_cluster_GRCh%(genome_version)s.py %(dataproc_cluster_name)s %(num_workers)s %(num_preemptible_workers)s" % locals(),
        errors_to_ignore=["Already exists"])
def _create_dataproc_cluster_v02(dataproc_cluster_name, genome_version, num_workers=2, num_preemptible_workers=12):
    # note: this string uses %-style placeholders, so it must not carry an f-string prefix
    run(("hailctl dataproc start %(dataproc_cluster_name)s --num-workers %(num_workers)s --pkgs luigi,google-api-python-client "
         "--num-secondary-workers %(num_preemptible_workers)s --max-idle 30m --vep GRCh%(genome_version)s") % locals(),
        errors_to_ignore=["Already exists"])
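# A minimal lifecycle sketch (hypothetical cluster name; "hailctl dataproc stop"
# is the standard teardown counterpart to "hailctl dataproc start"):
#
#   _create_dataproc_cluster_v02("my-loading-cluster", "38")
#   ...  # submit jobs
#   run("hailctl dataproc stop my-loading-cluster")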
def download_and_import_latest_clinvar_vcf(hail_context, genome_version, subset=None):
    """Downloads the latest clinvar VCF from the NCBI FTP server, copies it to HDFS, imports it,
    and annotates the resulting VDS globals with the source URL and the clinvar release date
    that's specified in the VCF header.

    Args:
        hail_context (HailContext): Hail 0.1 context used to import the VCF
        genome_version (str): "37" or "38"
        subset (str): subset by interval (eg. "X:12345-54321") - useful for testing
    Returns:
        VariantDataset: the imported and annotated clinvar VDS
    """
    if genome_version not in ["37", "38"]:
        raise ValueError("Invalid genome_version: " + str(genome_version))

    # download vcf
    clinvar_url = CLINVAR_FTP_PATH.format(genome_version=genome_version)
    local_tmp_file_path = "/tmp/clinvar_grch{}.vcf.gz".format(genome_version)
    clinvar_vcf_hdfs_path = "/tmp/" + os.path.basename(local_tmp_file_path)

    print("\n==> downloading {}".format(clinvar_url))
    run("wget {} -O {}".format(clinvar_url, local_tmp_file_path))
    run("hdfs dfs -copyFromLocal -f file://{} {}".format(local_tmp_file_path, clinvar_vcf_hdfs_path))

    clinvar_release_date = _parse_clinvar_release_date(local_tmp_file_path)

    # import vcf
    vds = hail_context.import_vcf(
        clinvar_vcf_hdfs_path,
        force_bgz=True,
        min_partitions=10000,
        drop_samples=True)  # .filter_intervals(hail.Interval.parse("1-MT"))

    if subset:
        vds = vds.filter_intervals(hail.Interval.parse(subset))

    vds = vds.repartition(10000)  # because the min_partitions arg doesn't work in some cases
    vds = vds.annotate_global_expr('global.sourceFilePath = "{}"'.format(clinvar_url))
    vds = vds.annotate_global_expr('global.version = "{}"'.format(clinvar_release_date))

    # handle multi-allelics
    vds = vds.split_multi()

    # for some reason, this additional filter is necessary to avoid
    # IllegalArgumentException: requirement failed: called altAllele on a non-biallelic variant
    vds = vds.filter_variants_expr("v.isBiallelic()", keep=True)

    print("\n==> downloaded clinvar vcf: ")
    pprint(vds.globals._attrs)

    return vds
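# Usage sketch (assumes a Hail 0.1 HailContext; the subset interval is just an
# example value for a quick test run):
#
#   hc = hail.HailContext()
#   clinvar_vds = download_and_import_latest_clinvar_vcf(hc, "37", subset="X:12345-54321")
#   pprint(clinvar_vds.globals._attrs)  # includes the "version" (clinvar release date) annotation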
def _create_temp_es_loading_nodes(settings):
    # make sure k8s cluster exists
    #run(" ".join([
    #    "gcloud container clusters create %(k8s_cluster_name)s",
    #    "--machine-type %(CLUSTER_MACHINE_TYPE)s",
    #    "--num-nodes 1",
    #    "--scopes https://www.googleapis.com/auth/devstorage.read_write"
    #]) % locals(), errors_to_ignore=["Already exists"])

    _set_k8s_context(settings)

    # add loading nodes
    run(" ".join([
        "gcloud container node-pools create loading-cluster",
        "--cluster %(CLUSTER_NAME)s",
        "--machine-type %(CLUSTER_MACHINE_TYPE)s",
        "--num-nodes %(ES_DATA_NUM_PODS)s",
        "--local-ssd-count 1",
    ]) % settings, errors_to_ignore=["Already exists"])

    # deploy elasticsearch
    _process_kubernetes_configs("create", settings=settings,
        config_paths=[
            "./kubernetes/elasticsearch-sharded/es-data-stateless-local-ssd.yaml",
        ])

    _wait_for_data_nodes_state("create", settings)

    # get ip address of loading nodes
    elasticsearch_ip_address = run(
        "kubectl get endpoints elasticsearch -o jsonpath='{.subsets[0].addresses[0].ip}'")

    logger.info("elasticsearch loading cluster IP address: {}".format(elasticsearch_ip_address))
    if not elasticsearch_ip_address or not re.match(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", elasticsearch_ip_address):
        logger.error("Invalid elasticsearch IP address: '{}'".format(elasticsearch_ip_address))

    # add firewall rule to allow ingress
    firewall_rule_name = _compute_firewall_rule_name(settings["CLUSTER_NAME"])
    source_range = "%s.%s.0.0/16" % tuple(elasticsearch_ip_address.split(".")[0:2])
    for action in ["create", "update"]:
        run(("gcloud compute firewall-rules %(action)s %(firewall_rule_name)s "
             "--description='Allow any machine in the project-default network to connect to elasticsearch loading cluster ports 9200, 9300' "
             "--network=default "
             "--allow=tcp:9200,tcp:9300 "
             "--source-ranges=%(source_range)s ") % locals(),
            errors_to_ignore=["already exists"])

    return elasticsearch_ip_address
def _process_kubernetes_configs(action, config_paths, settings):
    for config_path in config_paths:
        # configure deployment dir
        output_dir = "/tmp/deployments/%(TIMESTAMP)s_%(CLUSTER_NAME)s" % settings
        process_jinja_template(".", config_path, settings, output_dir)

        config_path = os.path.join(output_dir, config_path)
        if action == "delete":
            run("kubectl delete -f %(config_path)s" % locals(), errors_to_ignore=["not found"])
        elif action == "create":
            run("kubectl apply -f %(config_path)s" % locals(), errors_to_ignore=["already exists", "already allocated"])
def _enable_cluster_routing_rebalance(enable, dataproc_cluster_name, host, port):
    logger.info("==> %s cluster.routing.rebalance", "enable" if enable else "disable")
    run(" ".join(map(str, [
        "./gcloud_dataproc/submit.py",
        "--hail-version 0.1",
        "--cluster", dataproc_cluster_name,
        "hail_scripts/elasticsearch_ops/cluster_routing_rebalance.py",
        "--host", host,
        "--port", port,
        "--enable" if enable else "--disable",
    ])))
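# Sketch of the intended call pattern (hypothetical cluster name and host):
# disable shard rebalancing while a dataproc job streams documents in, then
# re-enable it -- the same pattern as the commented-out calls in main() below:
#
#   _enable_cluster_routing_rebalance(False, "my-cluster", "10.4.0.5", "9200")
#   try:
#       submit_load_dataset_to_es_job("my-cluster", start_with_step=2)
#   finally:
#       _enable_cluster_routing_rebalance(True, "my-cluster", "10.4.0.5", "9200")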
def submit_load_dataset_to_es_job(
        dataproc_cluster_name,
        start_with_step=0,
        stop_after_step=None,
        other_load_dataset_to_es_args=()):

    # submit job
    run(" ".join(map(str, [
        "python3 ./gcloud_dataproc/submit.py",
        "--hail-version 0.1",
        "--cluster %(dataproc_cluster_name)s",
        "hail_scripts/v01/load_dataset_to_es.py",
        "--stop-after-step %(stop_after_step)s " if stop_after_step is not None else "",
        "--start-with-step %(start_with_step)s ",
    ] + list(other_load_dataset_to_es_args))) % locals())
def _create_persistent_es_nodes(settings):
    # make sure cluster exists - create cluster with 1 node
    run(" ".join([
        "gcloud container clusters create %(CLUSTER_NAME)s",
        "--machine-type %(CLUSTER_MACHINE_TYPE)s",
        "--num-nodes 1",
        #"--scopes https://www.googleapis.com/auth/devstorage.read_write"
    ]) % settings, errors_to_ignore=["Already exists"])

    _set_k8s_context(settings)

    # create additional nodes
    run(" ".join([
        "gcloud container node-pools create es-persistent-nodes",
        "--cluster %(CLUSTER_NAME)s",
        "--machine-type %(CLUSTER_MACHINE_TYPE)s",
        "--num-nodes " + str(int(settings.get("ES_DATA_NUM_PODS", 1)) - 1),
    ]) % settings, errors_to_ignore=["Already exists"])

    # deploy elasticsearch
    _process_kubernetes_configs("create", settings=settings,
        config_paths=[
            #"./gcloud_dataproc/utils/elasticsearch_cluster/es-configmap.yaml",
            "./kubernetes/elasticsearch-sharded/es-namespace.yaml",
            "./kubernetes/elasticsearch-sharded/es-discovery-svc.yaml",
            "./kubernetes/elasticsearch-sharded/es-master.yaml",
            "./kubernetes/elasticsearch-sharded/es-svc.yaml",
            "./kubernetes/elasticsearch-sharded/es-kibana.yaml",
        ])

    wait_until_pod_is_running("es-kibana")

    _process_kubernetes_configs("create", settings=settings,
        config_paths=[
            "./kubernetes/elasticsearch-sharded/es-client.yaml",
            "./kubernetes/elasticsearch-sharded/es-data-stateful.yaml",
            "./kubernetes/elasticsearch-sharded/es-data-svc.yaml",
        ])

    _wait_for_data_nodes_state("create", settings, data_node_name="es-data")
def get_gcloud_file_stats(gs_path):
    if gs_path.endswith(".vds"):
        # set path to a file inside the .vds directory because gsutil stat works only on files
        gs_path += "/metadata.json.gz"

    gsutil_stat_output = run("gsutil stat %(gs_path)s" % locals(),
        print_command=False, verbose=False, ignore_all_errors=True)
    """
    Example gsutil stat output:

    Creation time:          Fri, 09 Jun 2017 09:36:23 GMT
    Update time:            Fri, 09 Jun 2017 09:36:23 GMT
    Storage class:          REGIONAL
    Content-Length:         363620675
    Content-Type:           text/x-vcard
    Hash (crc32c):          SWOktA==
    Hash (md5):             fEdIumyOFR7HvULeAwXCwQ==
    ETag:                   CMae+J67sNQCEAE=
    Generation:             1497000983793478
    Metageneration:         1
    """

    if not gsutil_stat_output:
        return None

    EMPTY_MATCH_OBJ = re.match("()", "")
    DATE_FORMAT = '%a, %d %b %Y %H:%M:%S %Z'

    creation_time = (re.search(r"Creation.time:[\s]+(.+)", gsutil_stat_output, re.IGNORECASE) or EMPTY_MATCH_OBJ).group(1)
    update_time = (re.search(r"Update.time:[\s]+(.+)", gsutil_stat_output, re.IGNORECASE) or EMPTY_MATCH_OBJ).group(1)
    file_size = (re.search(r"Content-Length:[\s]+(.+)", gsutil_stat_output, re.IGNORECASE) or EMPTY_MATCH_OBJ).group(1)
    # the parentheses in "Hash (md5)" must be escaped so they aren't treated as a regex group
    file_md5 = (re.search(r"Hash \(md5\):[\s]+(.+)", gsutil_stat_output, re.IGNORECASE) or EMPTY_MATCH_OBJ).group(1)

    ctime = time.mktime(time.strptime(creation_time, DATE_FORMAT))
    mtime = time.mktime(time.strptime(update_time, DATE_FORMAT))
    return FileStats(ctime=ctime, mtime=mtime, size=file_size, md5=file_md5)
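# get_gcloud_file_stats returns a FileStats record, assumed to be a namedtuple
# along these lines (the real definition lives elsewhere in this repo):
#
#   FileStats = collections.namedtuple("FileStats", ["ctime", "mtime", "size", "md5"])
#
# Usage sketch (hypothetical bucket path):
#
#   stats = get_gcloud_file_stats("gs://my-bucket/my-dataset.vds")
#   if stats:
#       print(stats.size, stats.md5)  # size and md5 are strings parsed from gsutil stat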
def _set_k8s_context(settings):
    run("gcloud container clusters get-credentials %(CLUSTER_NAME)s" % settings)
    run("kubectl config set-context $(kubectl config current-context) --namespace=%(NAMESPACE)s" % settings)
"localhost")) p.add_argument("--port", help="Elastisearch port", default="9200") p.add_argument( "--k8s-cluster-name", help="Specifies the kubernetes cluster name that hosts elasticsearch.", required=True) args = p.parse_args() client = ElasticsearchClient(args.host, args.port) wait_for_loading_shards_transfer(client, num_attempts=1) settings = _get_es_node_settings(args.k8s_cluster_name, args.num_temp_loading_nodes) _set_k8s_context(settings) _process_kubernetes_configs( "delete", settings=settings, config_paths=[ "./kubernetes/elasticsearch-sharded/es-data-stateless-local-ssd.yaml", ]) _wait_for_data_nodes_state("delete", settings) run("echo Y | gcloud container node-pools delete --cluster {} loading-cluster". format(args.k8s_cluster_name)) # delete firewall rule firewall_rule_name = _compute_firewall_rule_name(args.k8s_cluster_name) run("echo Y | gcloud compute firewall-rules delete {}s".format( firewall_rule_name))
type=int, help="Number of es nodes to create.", default=3) p.add_argument( "--k8s-cluster-name", help="Specifies the kubernetes cluster name that hosts elasticsearch.", required=True) args = p.parse_args() settings = _get_es_node_settings(args.k8s_cluster_name, args.num_nodes) load_settings([], settings) # make sure cluster exists - create cluster with 1 node run(" ".join([ "gcloud container clusters create %(CLUSTER_NAME)s", "--machine-type %(CLUSTER_MACHINE_TYPE)s", "--num-nodes 1", # "--scopes https://www.googleapis.com/auth/devstorage.read_write" ]) % settings) _set_k8s_context(settings) # create additional nodes run(" ".join([ "gcloud container node-pools create es-persistent-nodes", "--cluster %(CLUSTER_NAME)s", "--machine-type %(CLUSTER_MACHINE_TYPE)s", "--num-nodes " + str(int(settings.get("ES_DATA_NUM_PODS", 1)) - 1), ]) % settings) # deploy elasticsearch _process_kubernetes_configs(
import os
import sys

from kubernetes.shell_utils import simple_run as run

if len(sys.argv) < 2:
    sys.exit("Must provide OMIM download key as command line arg (https://www.omim.org/downloads/)")

omim_download_key = sys.argv[1]

DOWNLOAD_PATH = "https://data.omim.org/downloads/%(omim_download_key)s/genemap2.txt" % locals()
GCLOUD_BUCKET_PATH = "gs://seqr-reference-data/omim"

filename = os.path.basename(DOWNLOAD_PATH)

run("wget -O {filename} {DOWNLOAD_PATH}".format(**locals()))

# move the "# Chromosome ... Genomic ..." header line to the top and drop the other comment lines
run("""/bin/bash -c "cat <(grep '^# Chromosome.*Genomic' {filename}) <(grep -v '^#' {filename}) > {filename}.temp" """.format(**locals()))
run("mv {filename}.temp {filename}".format(**locals()))

run("gsutil -m cp {filename} {GCLOUD_BUCKET_PATH}/{filename}".format(**locals()))

run(" ".join([
    "python gcloud_dataproc/v01/run_script.py",
    "--cluster omim",
    "hail_scripts/v01/convert_tsv_to_key_table.py",
    "--key-by 'Ensembl Gene ID'",
    "{GCLOUD_BUCKET_PATH}/{filename}",
]).format(**locals()))
#!/usr/bin/env python

from kubernetes.shell_utils import simple_run as run

for vcf_path in [
    "gs://seqr-reference-data/GRCh37/TopMed/bravo-dbsnp-all.removed_chr_prefix.liftunder_GRCh37.vcf.gz",
    "gs://seqr-reference-data/GRCh38/TopMed/bravo-dbsnp-all.vcf.gz",
]:
    run(" ".join([
        "python gcloud_dataproc/v01/run_script.py",
        "--cluster topmed",
        "hail_scripts/v01/convert_vcf_to_vds.py",
        "--sites-only",
        "{vcf_path}",
    ]).format(**locals()))
def main():
    os.chdir(os.path.join(os.path.dirname(__file__), ".."))

    # get command-line args
    args, unparsed_args = init_command_line_args()

    # forward unparsed and other args to the load_dataset_to_es.py script
    load_dataset_to_es_args = unparsed_args
    load_dataset_to_es_args.extend([
        "--host", args.host,
        "--port", args.port,
        "--genome-version", args.genome_version,
        "--project-guid", args.project_guid,
        "--use-temp-loading-nodes" if args.use_temp_loading_nodes else "",
        args.input_dataset,
    ])

    # download .fam file?
    is_fam_file_specified = "--fam-file" in unparsed_args
    is_subset_samples_file_specified = "--subset-samples" in unparsed_args

    if args.download_fam_file and (not is_fam_file_specified or not is_subset_samples_file_specified):
        input_dataset_directory = os.path.dirname(args.input_dataset) or "."

        # prompt for seqr username and password
        seqr_username = args.seqr_username or input("seqr username: ")
        seqr_password = args.seqr_password or getpass.getpass("seqr password: ")  # assumed; this line was scrubbed in the source

        # (several lines were scrubbed from the source here; they download the .fam and
        # subset-samples files from seqr, setting fam_file_path and subset_samples_file_path)

        # upload fam file to vcf_directory
        if not is_fam_file_specified:
            fam_file_gcloud_path = os.path.join(input_dataset_directory, os.path.basename(fam_file_path))
            run("gsutil cp %(fam_file_path)s %(fam_file_gcloud_path)s" % locals())
            load_dataset_to_es_args.extend(["--fam-file", fam_file_gcloud_path])

        # upload subset-samples to vcf_directory
        if not is_subset_samples_file_specified:
            subset_samples_file_gcloud_path = os.path.join(input_dataset_directory, os.path.basename(subset_samples_file_path))
            run("gsutil cp %(subset_samples_file_path)s %(subset_samples_file_gcloud_path)s" % locals())
            load_dataset_to_es_args.extend(["--subset-samples", subset_samples_file_gcloud_path])

    # run pipeline with or without using a temp elasticsearch cluster for loading
    if args.use_temp_loading_nodes and (args.stop_after_step is None or args.stop_after_step > 1):
        # make sure kubectl is installed
        run("kubectl version --client")

        # run vep and compute derived annotations before creating temp elasticsearch loading nodes
        if args.start_with_step <= 1:
            # make sure cluster exists
            _create_dataproc_cluster_v02(
                args.cluster_name,
                args.genome_version,
                num_workers=args.num_workers,
                num_preemptible_workers=args.num_preemptible_workers)

            submit_load_dataset_to_es_job_v02(
                args.cluster_name,
                start_with_step=args.start_with_step,
                stop_after_step=1,
                other_load_dataset_to_es_args=load_dataset_to_es_args,
                use_seqr_loading_optimized_pipeline=args.use_seqr_loading_optimized_pipeline)

        # create temp es nodes
        settings = _get_es_node_settings(args.k8s_cluster_name, args.num_temp_loading_nodes)
        ip_address = _create_es_nodes(settings)

        # _enable_cluster_routing_rebalance(False, args.cluster_name, ip_address, args.port)

        # make sure cluster exists
        _create_dataproc_cluster_v02(
            args.cluster_name,
            args.genome_version,
            num_workers=args.num_workers,
            num_preemptible_workers=args.num_preemptible_workers)

        # continue pipeline starting with the loading steps, streaming data to the new elasticsearch instance at ip_address
        submit_load_dataset_to_es_job_v02(
            args.cluster_name,
            start_with_step=max(2, args.start_with_step),  # start with step 2 or later
            stop_after_step=args.stop_after_step,
            other_load_dataset_to_es_args=load_dataset_to_es_args + ["--host %(ip_address)s" % locals()],
            es_host=ip_address,
            use_seqr_loading_optimized_pipeline=args.use_seqr_loading_optimized_pipeline)

        # _enable_cluster_routing_rebalance(True, args.cluster_name, ip_address, args.port)

    else:
        # make sure cluster exists
        _create_dataproc_cluster_v02(
            args.cluster_name,
            args.genome_version,
            num_workers=args.num_workers,
            num_preemptible_workers=args.num_preemptible_workers)

        submit_load_dataset_to_es_job_v02(
            args.cluster_name,
            start_with_step=args.start_with_step,
            stop_after_step=args.stop_after_step,
            other_load_dataset_to_es_args=load_dataset_to_es_args,
            use_seqr_loading_optimized_pipeline=args.use_seqr_loading_optimized_pipeline)
#!/usr/bin/env python

from kubernetes.shell_utils import simple_run as run

run(" ".join([
    "python gcloud_dataproc/v01/run_script.py",
    "--cluster gnomad-coverage",
    "download_and_create_reference_datasets/v01/hail_scripts/write_gnomad_coverage_vds.py",
]))
#!/usr/bin/env python

import os

from kubernetes.shell_utils import simple_run as run

DOWNLOAD_PATH = "ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3.1/functional_gene_constraint/fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt"
GCLOUD_BUCKET_PATH = "gs://seqr-reference-data/gene_constraint"

filename = os.path.basename(DOWNLOAD_PATH)

run("wget -O {filename} {DOWNLOAD_PATH}".format(**locals()))

# strip the version suffix from ensembl transcript ids (eg. "ENST00000263100.3" => "ENST00000263100")
run("""/bin/bash -c "cat {filename} | sed 's/\(ENST[0-9]*\)\.[0-9]/\\1/' > {filename}.temp" """.format(**locals()))
run("mv {filename}.temp {filename}".format(**locals()))

run("gsutil -m cp {filename} {GCLOUD_BUCKET_PATH}/{filename}".format(**locals()))

run(" ".join([
    "python gcloud_dataproc/v01/run_script.py",
    "--cluster gene-constraint",
    "hail_scripts/v01/convert_tsv_to_key_table.py",
    "--key-by 'transcript'",
    "{GCLOUD_BUCKET_PATH}/{filename}",
]).format(**locals()))
#!/usr/bin/env python

import argparse

from kubernetes.shell_utils import simple_run as run

p = argparse.ArgumentParser()
args, unparsed_args = p.parse_known_args()

script_args = " ".join(['"%s"' % arg for arg in unparsed_args])

run(" ".join([
    "python gcloud_dataproc/v01/run_script.py",
    "--cluster gnomad",
    "download_and_create_reference_datasets/v01/hail_scripts/write_gnomad_vds.py",
    "{script_args}",
]).format(**locals()))
#!/usr/bin/env python

from kubernetes.shell_utils import simple_run as run

for dbnsfp_gene_table_path in [
    "gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9_gene",
    "gs://seqr-reference-data/GRCh38/dbNSFP/v3.5/dbNSFP3.5_gene",
]:
    run(" ".join([
        "python gcloud_dataproc/v01/run_script.py",
        "--cluster dbnsfp",
        "hail_scripts/v01/convert_tsv_to_key_table.py",
        "{dbnsfp_gene_table_path}",
    ]).format(**locals()))
#!/usr/bin/env python3

from kubernetes.shell_utils import simple_run as run

for genome_version, vcf_path in [
    ("37", "gs://seqr-reference-data/GRCh37/MPC/fordist_constraint_official_mpc_values.vcf.gz"),
    ("38", "gs://seqr-reference-data/GRCh38/MPC/fordist_constraint_official_mpc_values.liftover.GRCh38.vcf.gz"),
]:
    run(("python3 gcloud_dataproc/v02/run_script.py "
         "--cluster create-ht-mpc "
         "hail_scripts/v02/convert_vcf_to_hail.py "
         "--output-sites-only-ht "
         f"--genome-version {genome_version} "
         f"{vcf_path}"))
#!/usr/bin/env python3

from kubernetes.shell_utils import simple_run as run

run(("python3 gcloud_dataproc/v02/run_script.py "
     "--cluster create-ht-cadd "
     "download_and_create_reference_datasets/v02/hail_scripts/write_cadd_ht.py"))
#!/usr/bin/env python3

import argparse

from kubernetes.shell_utils import simple_run as run

parser = argparse.ArgumentParser()
parser.add_argument('-b', '--build', help='Reference build, 37 or 38', choices=["37", "38"], required=True)
args = parser.parse_args()

run(("python3 gcloud_dataproc/v02/run_script.py "
     "--cluster create-ht-combined-reference-data "
     "download_and_create_reference_datasets/v02/hail_scripts/write_combined_reference_data_ht.py "
     f"--build {args.build}"))
#!/usr/bin/env python

from kubernetes.shell_utils import simple_run as run

run(" ".join([
    "python gcloud_dataproc/v01/run_script.py",
    "--cluster cadd",
    "download_and_create_reference_datasets/v01/hail_scripts/write_cadd_vds.py",
]))
import argparse
import os
import random
import sys

from kubernetes.shell_utils import simple_run as run

unique_id = random.randint(10**5, 10**6 - 1)
random_cluster_name = "without-vep-%s" % unique_id

p = argparse.ArgumentParser()
p.add_argument("-c", "--cluster", default=random_cluster_name)
p.add_argument("script")
args, unparsed_args = p.parse_known_args()

cluster_name = args.cluster
script = args.script
script_args = " ".join(['"%s"' % arg for arg in unparsed_args])

os.chdir(os.path.join(os.path.dirname(__file__), "../.."))

run("./gcloud_dataproc/v02/create_cluster_without_VEP.py %(cluster_name)s 2 12" % locals())

if "-h" in sys.argv or "--help" in sys.argv:
    run("python %(script)s -h" % locals())
    sys.exit(0)

run(("time ./gcloud_dataproc/submit.py "
     "--hail-version 0.2 "
     "--cluster %(cluster_name)s "
     "%(script)s %(script_args)s") % locals())
#!/usr/bin/env python """ This script creates snapshots of disks bound to PersistentVolumeClaims in the "current" kubernetes cluster. """ import argparse import time from kubernetes.shell_utils import run p = argparse.ArgumentParser() p.add_argument("--zone", help="gcloud zone", default="us-central1-b") args = p.parse_args() output = run("kubectl get pvc -o jsonpath='{.items[*].spec.volumeName}'") disk_names = output.split() timestamp = time.strftime("%Y%m%d-%H%M%S") snapshot_names = ["snap-%s--%s" % (timestamp, disk_name) for disk_name in disk_names] disk_names = " ".join(disk_names) snapshot_names = ",".join(snapshot_names) zone = args.zone run("gcloud compute disks snapshot %(disk_names)s --snapshot-names %(snapshot_names)s --zone=%(zone)s" % locals())
import argparse
import os
import random
import sys

from kubernetes.shell_utils import simple_run as run

unique_id = random.randint(10**5, 10**6 - 1)
random_cluster_name = "vep-grch37-%s" % unique_id

p = argparse.ArgumentParser()
p.add_argument("-c", "--cluster", default=random_cluster_name)
p.add_argument("script")
args, unparsed_args = p.parse_known_args()

cluster_name = args.cluster
script = args.script
script_args = " ".join(['"%s"' % arg for arg in unparsed_args])

os.chdir(os.path.join(os.path.dirname(__file__), "../.."))

run("python gcloud_dataproc/v01/create_cluster_GRCh37.py %(cluster_name)s 2 12" % locals())

if "-h" in sys.argv or "--help" in sys.argv:
    run("python %(script)s -h" % locals())
    sys.exit(0)

run(("time ./gcloud_dataproc/submit.py "
     "--hail-version 0.1 "
     "--cluster %(cluster_name)s "
     "%(script)s %(script_args)s") % locals())
#!/usr/bin/env python3

from kubernetes.shell_utils import simple_run as run

run(("python3 gcloud_dataproc/v02/run_script.py "
     "--cluster create-gnomad-38-hts "
     "download_and_create_reference_datasets/v02/hail_scripts/write_gnomad_38_hts.py"))
def _make_disks(settings, es_disk_snapshots=None):
    """Create persistent disks (from snapshots, if provided) and a PersistentVolume object for each one.

    Args:
        es_disk_snapshots (list): optional list of snapshot names
    """
    # create disks from snapshots
    created_disks = []
    if es_disk_snapshots:
        for i, snapshot_name in enumerate(es_disk_snapshots):
            disk_name = "es-data-%s--%d" % (settings["CLUSTER_NAME"], i)  # time.strftime("%y%m%d-%H%M%S") - make the timestamp year-month-day so a bunch of disks don't get created accidentally
            run(" ".join([
                "gcloud compute disks create " + disk_name,
                "--type pd-ssd",
                "--source-snapshot " + snapshot_name,
            ]) % settings, errors_to_ignore=["lready exists"])  # matches both "Already exists" and "already exists"

            disk_size = settings["ELASTICSEARCH_DISK_SIZE"]  # TODO GET SNAPSHOT DISK SIZE from gcloud compute disks describe ...
            created_disks.append((disk_name, disk_size))
    else:
        for i in range(settings["ES_NUM_PERSISTENT_NODES"]):
            disk_name = "es-data-%s--%d" % (settings["CLUSTER_NAME"], i)
            run(" ".join([
                "gcloud compute disks create " + disk_name,
                "--type pd-ssd",
                "--size %(ELASTICSEARCH_DISK_SIZE)s",
            ]) % settings, errors_to_ignore=["lready exists"])

            created_disks.append((disk_name, settings["ELASTICSEARCH_DISK_SIZE"]))

    # create a PersistentVolume object for each disk
    namespace = settings["NAMESPACE"]
    for i, (existing_disk_name, elasticsearch_disk_size) in enumerate(created_disks):
        with tempfile.NamedTemporaryFile("w") as f:
            f.write("""apiVersion: v1
kind: PersistentVolume
metadata:
  name: %(existing_disk_name)s
  namespace: %(namespace)s
spec:
  capacity:
    storage: %(elasticsearch_disk_size)s
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: ssd-storage-class
  gcePersistentDisk:
    fsType: ext4
    pdName: %(existing_disk_name)s
""" % locals())
            f.flush()
            file_path = f.name
            run("kubectl create -f %(file_path)s" % locals(),
                print_command=True, errors_to_ignore=["already exists"])
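# Usage sketch (hypothetical settings dict; only the keys _make_disks reads are
# shown). Note that ELASTICSEARCH_DISK_SIZE is passed both to
# "gcloud compute disks create --size" and to the PersistentVolume "storage:"
# capacity, so the value must suit both consumers:
#
#   _make_disks({
#       "CLUSTER_NAME": "es-cluster",
#       "NAMESPACE": "elasticsearch",
#       "ES_NUM_PERSISTENT_NODES": 3,
#       "ELASTICSEARCH_DISK_SIZE": "100Gi",
#   })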
#!/usr/bin/env python

import argparse

from kubernetes.shell_utils import simple_run as run

genome_versions = ['37', '38']

p = argparse.ArgumentParser()
p.add_argument("-g", "--genome-version", help="Genome build: 37 or 38", choices=genome_versions)
args, unparsed_args = p.parse_known_args()

script_args = " ".join(['"%s"' % arg for arg in unparsed_args])

cluster_name = 'create-all-reference-data-vds'
if args.genome_version:
    cluster_name += "-grch" + args.genome_version
    genome_versions = [args.genome_version]

for genome_version in genome_versions:
    run(" ".join([
        "python gcloud_dataproc/v01/run_script.py",
        "--cluster {cluster_name}",
        "download_and_create_reference_datasets/v01/hail_scripts/combine_all_variant_level_reference_data.py",
        "--genome-version {genome_version} {script_args}",
    ]).format(**locals()))