def stress():
    b = hb.Batch(name='stress',
                 backend=hb.ServiceBackend(billing_project='hail'),
                 default_image=DOCKER_ROOT_IMAGE)

    for i in range(100):
        j = b.new_job(name=f'parent_{i}')
        d = random.choice(range(4))
        if flip(0.2):
            j.command(f'sleep {d}; exit 1')
        else:
            j.command(f'sleep {d}; echo parent {i}')

        for k in range(10):
            d = random.choice(range(4))
            c = b.new_job(name=f'child_{i}_{k}').command(f'sleep {d}; echo child {i} {k}')
            c.depends_on(j)
            if flip(0.1):
                c._always_run = True
            if flip(0.01):
                c._machine_type = 'n1-standard-1'
            if flip(0.5):
                c._preemptible = False

    b.run(open=False, wait=False)
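# stress() above relies on a coin-flip helper `flip` and a `DOCKER_ROOT_IMAGE`
# constant that are defined elsewhere and not shown here. A minimal sketch of
# the helper, assuming it simply returns True with probability p:
import random


def flip(p: float) -> bool:
    """Return True with probability p (assumed behaviour, not the original)."""
    return random.random() <= p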
def main(df_x_path, df_y_path, output_path, python_image):
    backend = hb.ServiceBackend()
    # Pass the backend explicitly; otherwise the Batch falls back to a local backend
    # and the ServiceBackend created above is never used.
    b = hb.Batch(name='rf-loo', backend=backend, default_python_image=python_image)

    with hl.hadoop_open(df_y_path) as f:
        local_df_y = pd.read_table(f, header=0, index_col=0)

    df_x_input = b.read_input(df_x_path)
    df_y_input = b.read_input(df_y_path)

    results = []
    for window in local_df_y.index.to_list():
        checkpoint = checkpoint_path(window)
        if hl.hadoop_exists(checkpoint):
            result = b.read_input(checkpoint)
            results.append(result)
            continue

        j = b.new_python_job()
        result = j.call(random_forest, df_x_input, df_y_input, window)
        tsv_result = j.call(as_tsv, result)
        tsv_result = tsv_result.as_str()

        b.write_output(tsv_result, checkpoint)
        results.append(tsv_result)

    output = hb.concatenate(b, results)
    b.write_output(output, output_path)
    b.run(wait=False)
    backend.close()
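# The snippet above depends on helpers that are not shown here (`checkpoint_path`,
# `as_tsv`, `random_forest`). The sketches below are assumptions about their intent,
# not the original implementations: `checkpoint_path` maps a window to a per-window
# checkpoint file, and `as_tsv` flattens a result record into one tab-separated line.
def checkpoint_path(window: str) -> str:
    # Hypothetical checkpoint location; the real path scheme is not shown.
    return f'gs://my-bucket/checkpoints/{window}.tsv'


def as_tsv(result) -> str:
    # Join the fields of a result record into a single TSV line.
    return '\t'.join(str(field) for field in result)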
@contextlib.contextmanager  # needed so the documented `with run_batch(...)` usage works
def run_batch(args, batch_name=None):
    """Wrapper around creating, running, and then closing a Batch run.

    :param args: Parsed args from the ArgumentParser created via the init_arg_parser method
    :param batch_name: (optional) batch label which will show up in the Batch web UI

    Usage:
        with run_batch(args) as batch:
            ... batch job definitions ...
    """
    if args.local:
        backend = (hb.LocalBackend() if args.raw
                   else hb.LocalBackend(gsa_key_file=args.gsa_key_file))
    else:
        backend = hb.ServiceBackend(billing_project=args.batch_billing_project,
                                    bucket=args.batch_temp_bucket)

    try:
        batch = hb.Batch(backend=backend, name=batch_name)
        batch.batch_utils_temp_bucket = args.batch_temp_bucket
        yield batch  # returned to the `with ... as batch:` block

        # run on exit from the `with` block
        batch.run(dry_run=args.dry_run, verbose=args.verbose)
    finally:
        if isinstance(backend, hb.ServiceBackend):
            backend.close()
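# A minimal usage sketch for run_batch. The parser factory `init_arg_parser` is the
# one mentioned in the docstring and is assumed to be importable alongside run_batch;
# the job defined here is purely illustrative.
args = init_arg_parser().parse_args()

with run_batch(args, batch_name='example-batch') as batch:
    j = batch.new_job(name='say-hello')
    j.command('echo hello')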
def validate_all_objects_in_directory(gs_dir):
    """Validate files with MD5s in the provided gs directory"""
    backend = hb.ServiceBackend(
        billing_project=os.getenv('HAIL_BILLING_PROJECT'),
        bucket=os.getenv('HAIL_BUCKET'),
    )
    b = hb.Batch('validate_md5s', backend=backend)
    client = storage.Client()

    if not gs_dir.startswith('gs://'):
        raise ValueError(f'Expected GS directory, got: {gs_dir}')

    bucket_name, *components = gs_dir[5:].split('/')

    blobs = client.list_blobs(bucket_name, prefix='/'.join(components))
    files: Set[str] = {f'gs://{bucket_name}/{blob.name}' for blob in blobs}

    for obj in files:
        if obj.endswith('.md5'):
            continue
        if f'{obj}.md5' not in files:
            continue

        job = b.new_job(f'validate_{os.path.basename(obj)}')
        job.image(DRIVER_IMAGE)
        validate_md5(job, obj)

    b.run(wait=False)
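# `validate_md5` is not shown above. A minimal sketch, assuming the job should
# recompute the object's checksum and compare it with the stored `.md5` file;
# the exact commands used in the original are an assumption here.
def validate_md5(job, obj: str):
    # Authenticate inside the container, then compare computed vs. expected checksums.
    job.command('gcloud -q auth activate-service-account --key-file=/gsa-key/key.json')
    job.command(f'gsutil cat {obj} | md5sum | cut -d " " -f1 > computed.md5')
    job.command(f'gsutil cat {obj}.md5 | cut -d " " -f1 > expected.md5')
    job.command('diff computed.md5 expected.md5')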
def main():
    """
    Create a Hail Batch, use the analysis-runner helper to create a Dataproc
    cluster and add the job, then set off the batch.
    """
    service_backend = hb.ServiceBackend(
        billing_project=os.getenv('HAIL_BILLING_PROJECT'),
        bucket=os.getenv('HAIL_BUCKET'),
    )

    # create a hail batch
    batch = hb.Batch(name='cohort_mt_extraction', backend=service_backend)

    _my_job = dataproc.hail_dataproc_job(
        batch=batch,
        script=' '.join(sys.argv[1:]),
        max_age='4h',
        job_name='extract_from_cohort_mt',
        num_secondary_workers=4,
        cluster_name='cohort_mt_extraction with max-age=4h',
    )  # noqa: F841

    batch.run(wait=False)
def run_concat(backend: Union[hb.ServiceBackend, hb.LocalBackend] = None,
               input_vcf: str = None,
               output_type: str = 'vcf',
               cpu: int = 16,
               memory: str = 'standard',
               out_dir: str = None):
    print(f'\n2. CONCAT {input_vcf}\n')

    vcf_filebase = get_vcf_filebase(input_vcf)
    concat_b = hb.Batch(backend=backend, name=f'concat-imputed-chunks-{vcf_filebase}')

    # get the regions so we can map each file to its specific region
    regions = pd.read_csv(
        f'{out_dir}/GWASpy/{vcf_filebase}/Imputation/imputation.regions',
        sep='\t', names=['reg', 'ind'])
    regions_dict = pd.Series(regions.reg.values, index=regions.ind).to_dict()

    imputed_vcfs_chunks = hl.utils.hadoop_ls(
        f'{out_dir}/GWASpy/{vcf_filebase}/Imputation/imputed_chunks/*.bcf')

    for i in range(1, 24):
        chrom = 'chrX' if i == 23 else f'chr{i}'

        chrom_phased_files_to_concat = []

        for file in imputed_vcfs_chunks:
            f = file['path']
            vcf_basename = get_vcf_filebase(f)
            file_index = int(vcf_basename.split('.')[-4])
            file_region = regions_dict[file_index]
            map_chrom = file_region.split(':')[0]
            if map_chrom == chrom:
                chrom_phased_files_to_concat.append(f)

        # naturally sort the list of files to merge
        from gwaspy.utils.natural_sort import natural_keys
        chrom_phased_files_to_concat.sort(key=natural_keys)

        concat_vcfs(b=concat_b, vcfs_to_merge=chrom_phased_files_to_concat,
                    vcf_basename=vcf_filebase, output_type=output_type,
                    chrom=chrom, cpu=cpu, memory=memory, out_dir=out_dir)

    concat_b.run()
def main(script: str, mt: str):
    """
    Runs a script inside Dataproc to execute VEP.

    :param script: str, the path to the VEP main script
    :param mt: str, the path to the input MatrixTable
    """
    service_backend = hb.ServiceBackend(
        billing_project=os.getenv('HAIL_BILLING_PROJECT'),
        bucket=os.getenv('HAIL_BUCKET'),
    )

    # create a hail batch
    batch = hb.Batch(name='run_vep_in_dataproc_cluster', backend=service_backend)

    job = dataproc.hail_dataproc_job(
        batch=batch,
        worker_machine_type='n1-highmem-8',
        worker_boot_disk_size=200,
        secondary_worker_boot_disk_size=200,
        script=f'{script} --mt {mt}',
        max_age='12h',
        init=[
            'gs://cpg-reference/hail_dataproc/install_common.sh',
            'gs://cpg-reference/vep/vep-GRCh38.sh',
        ],
        job_name='run_vep',
        num_secondary_workers=20,
        num_workers=2,
        cluster_name='run vep',
    )
    job.cpu(2)
    job.memory('standard')
    job.storage('20G')

    batch.run(wait=False)
def main():
    use_tabix = True

    hl.init(log='/Users/nbaya/Downloads/get_chr_pos.log')

    backend = hb.ServiceBackend(billing_project='ukb_diverse_pops',
                                bucket='ukbb-diverse-temp-30day/nb-batch-tmp')

    b = hb.Batch(name='get_chr_pos',
                 backend=backend,
                 default_image='gcr.io/ukbb-diversepops-neale/nbaya_tabix:latest',
                 default_storage='2G',
                 default_cpu=1)

    paths = get_paths()

    for path in paths:
        print(path)
        annotate_chr_pos(b=b, path=path, use_tabix=use_tabix)

    b.run(open=True)

    backend.close()
to the gs://cpg-fewgenomes-main bucket.
"""

import os
import csv
import hailtop.batch as hb

# OUTPUT gets propagated from the analysis-runner cli to the server
output_bucket = os.getenv('OUTPUT')
assert output_bucket and output_bucket.startswith('gs://cpg-fewgenomes-main/')

# input CSV contains 3 columns with sample name, file type, full GCS file path
INPUT_FILELIST = './data/filtered65.csv'

ANALYSIS_RUNNER_IMAGE = 'australia-southeast1-docker.pkg.dev/analysis-runner/images/driver:45c3f8125e300cd70bb790e32d96816f003a7af2-hail-0.2.64.devcb1c44c7b529'

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

b = hb.Batch(backend=service_backend, name='copy-crams')

# Create Hail Batch jobs to copy CRAMs and indexes listed in CSV file to output bucket
with open(INPUT_FILELIST, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        j_copy = b.new_job(name=f'copy-{row["sample_name"]}-{row["ftype"]}')
        (j_copy.image(ANALYSIS_RUNNER_IMAGE)
            .command('gcloud -q auth activate-service-account --key-file=/gsa-key/key.json')
            .command(f'gsutil cp {row["fname"]} {output_bucket}'))

b.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET') ) batch = hb.Batch(name='nfe-pca-no-outliers', backend=service_backend) dataproc.hail_dataproc_job( batch, f'hgdp_1kg_tob_wgs_nfe_pca_no_outliers.py', max_age='4h', num_secondary_workers=20, init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name=f'nfe-pca-no-outliers', worker_boot_disk_size=200, ) batch.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='pca_combined_tob_snp_chip', backend=service_backend) dataproc.hail_dataproc_job( batch, 'plot_pca_tob_wgs_snp_chip_datasets.py', max_age='1h', packages=['click', 'selenium'], init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name=f'pca_combined_tob_snp_chip', ) batch.run()
import os

import hailtop.batch as hb

backend = hb.ServiceBackend(
    billing_project='leonhardgruenschloss-trial', bucket='leo-tmp-au')

b = hb.Batch(backend=backend, name='outer')
j = b.new_job(name='launch-inner')
j.image(f'gcr.io/{os.getenv("GCP_PROJECT")}/hail-batch-nested:latest')
j.command('python3 inner.py')
b.run()
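# inner.py itself is not shown above. A minimal sketch of what such a nested
# submission could look like, assuming the inner script reuses the same billing
# project and bucket (both assumptions; the real inner.py may differ):
import hailtop.batch as hb

backend = hb.ServiceBackend(
    billing_project='leonhardgruenschloss-trial', bucket='leo-tmp-au')

b = hb.Batch(backend=backend, name='inner')
j = b.new_job(name='inner-job')
j.command('echo hello from the nested batch')
b.run()
backend.close()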
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name=f'snp_chip_variants_pca', backend=service_backend) dataproc.hail_dataproc_job( batch, 'snp_chip_generate_pca.py', max_age='12h', num_secondary_workers=20, packages=['click'], init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name=f'snp_chip_variants_pca', ) batch.run()
def submit(hail_code: Commit, benchmark_code: Commit, test_names: Set[str], n_replicates: int, n_iters: int):
    sync_check_shell(benchmark_code.checkout_script())

    sys.path.insert(0, f'{benchmark_code.repo_dir()}/benchmark/python/benchmark_hail')

    importlib.invalidate_caches()
    from benchmark_hail.run.resources import all_resources  # pylint: disable=import-error, import-outside-toplevel
    from benchmark_hail.run.utils import list_benchmarks  # pylint: disable=import-error, import-outside-toplevel

    output_file = f'gs://hail-benchmarks-2/benchmark/{hail_code.sha}-{benchmark_code.sha}.json'

    b = hb.Batch(name=f'benchmark-{hail_code.sha}',
                 backend=hb.ServiceBackend(billing_project='hail'),
                 default_image=BENCHMARK_IMAGE,
                 default_cpu='2',
                 attributes={'output_file': output_file,
                             'n_replicates': str(n_replicates),
                             'n_iters': str(n_iters),
                             'image': str(BENCHMARK_IMAGE),
                             'hail_code': str(hail_code),
                             'benchmark_code': str(benchmark_code)})

    build_hail = b.new_job('build_hail_wheel')
    build_hail.command(f'''
        set -ex
        {hail_code.checkout_script()}
        cd hail
        time ./gradlew --version
        time make wheel
        time (cd python && zip -r hail.zip hail hailtop)
        (cd build/deploy/dist/ && tar -cvf wheel-container.tar hail-*-py3-none-any.whl)
        cp build/deploy/dist/hail-*-py3-none-any.whl {build_hail.wheel}
    ''')

    build_benchmark = b.new_job('build_benchmark_wheel')
    build_benchmark.command(f'''
        set -ex
        {benchmark_code.checkout_script()}
        make -C hail python/hail/hail_pip_version
        export HAIL_VERSION=$(cat hail/python/hail/hail_pip_version)
        export HAIL_BENCHMARK_VERSION=$HAIL_VERSION
        cd benchmark/python/ && python3 setup.py -q bdist_wheel
        python3 -m pip -q install dist/benchmark_hail-$HAIL_VERSION-py3-none-any.whl
        cp dist/benchmark_hail-$HAIL_VERSION-py3-none-any.whl {build_benchmark.wheel}
    ''')

    resource_jobs = {}
    for r in all_resources:
        j = b.new_job(f'create_resource_{r.name()}').cpu(4)
        j.command(f'mv {build_hail.wheel} hail--py3-none-any.whl')
        j.command('pip install hail--py3-none-any.whl')
        j.command(f'mv {build_benchmark.wheel} benchmark_hail-$HAIL_VERSION-py3-none-any.whl')
        j.command('pip install benchmark_hail-$HAIL_VERSION-py3-none-any.whl')
        j.command(f'hail-bench create-resources --data-dir benchmark-resources --group {r.name()}')
        j.command(f"time tar -cf {r.name()}.tar benchmark-resources/{r.name()} --exclude='*.crc'")
        j.command(f'ls -lh {r.name()}.tar')
        j.command(f'mv {r.name()}.tar {j.ofile}')
        resource_jobs[r] = j

    all_benchmarks = list_benchmarks()
    assert len(all_benchmarks) > 0

    all_output = []

    n_passed_filter = 0
    job_fs = []
    for benchmark in all_benchmarks:
        if benchmark.name in test_names:
            n_passed_filter += 1
            for replicate in range(n_replicates):
                job_fs.append((benchmark.name, replicate, benchmark.groups))

    log.info(f'generating {n_passed_filter} * {n_replicates} = {n_passed_filter * n_replicates} individual benchmark jobs')

    random.shuffle(job_fs)
    for name, replicate, groups in job_fs:
        j = b.new_job(name=f'{name}_{replicate}')
        j.command(f'mv {build_hail.wheel} hail--py3-none-any.whl')
        j.command('pip install hail--py3-none-any.whl')
        j.command(f'mv {build_benchmark.wheel} benchmark_hail--py3-none-any.whl')
        j.command('pip install benchmark_hail--py3-none-any.whl')
        j.command('mkdir -p benchmark-resources')
        for resource_group in groups:
            resource_job = resource_jobs[resource_group]
            j.command(f'mv {resource_job.ofile} benchmark-resources/{resource_group.name()}.tar')
            j.command(f'time tar -xf benchmark-resources/{resource_group.name()}.tar')
        # note the trailing spaces: without them the environment variables would be
        # concatenated into a single invalid token
        j.command(f'MKL_NUM_THREADS=1 '
                  f'OPENBLAS_NUM_THREADS=1 '
                  f'OMP_NUM_THREADS=1 '
                  f'VECLIB_MAXIMUM_THREADS=1 '
                  f'PYSPARK_SUBMIT_ARGS="--driver-memory 6G pyspark-shell" '
                  f'hail-bench run -o {j.ofile} -n {n_iters} --data-dir benchmark-resources -t {name}')
        all_output.append(j.ofile)

    combine_branch_factor = int(os.environ.get('BENCHMARK_BRANCH_FACTOR', 32))
    phase_i = 1
    while len(all_output) > combine_branch_factor:
        new_output = []

        job_i = 1
        i = 0
        while i < len(all_output):
            combine = b.new_job(f'combine_output_phase{phase_i}_job{job_i}')
            combine.command(f'mv {build_hail.wheel} hail--py3-none-any.whl')
            combine.command('pip install hail--py3-none-any.whl')
            combine.command(f'mv {build_benchmark.wheel} benchmark_hail--py3-none-any.whl')
            combine.command('pip install benchmark_hail--py3-none-any.whl')
            combine.command(
                f'hail-bench combine -o {combine.ofile} '
                + ' '.join(all_output[i:i + combine_branch_factor]))
            new_output.append(combine.ofile)
            i += combine_branch_factor
            job_i += 1

        phase_i += 1
        all_output = new_output

    combine = b.new_job('final_combine_output')
    combine.command(f'mv {build_hail.wheel} hail--py3-none-any.whl')
    combine.command('pip install hail--py3-none-any.whl')
    combine.command(f'mv {build_benchmark.wheel} benchmark_hail--py3-none-any.whl')
    combine.command('pip install benchmark_hail--py3-none-any.whl')
    combine.command(f'hail-bench combine -o {combine.ofile} ' + ' '.join(all_output))
    combine.command(f'cat {combine.ofile}')

    log.info(f'writing output to {output_file}')

    b.write_output(combine.ofile, output_file)
    b.run()
SHA = sys.argv[3]
N_REPLICATES = int(sys.argv[4])
N_ITERS = int(sys.argv[5])

labeled_sha = SHA
label = os.environ.get('BENCHMARK_LABEL')
if label:
    labeled_sha = f'{labeled_sha}-{label}'
output_file = os.path.join(BUCKET_BASE, f'{labeled_sha}.json')

b = hb.Batch(name=f'benchmark-{labeled_sha}',
             backend=hb.ServiceBackend(billing_project='hail'),
             default_image=BENCHMARK_IMAGE,
             default_storage='100G',
             default_memory='7G',
             default_cpu=2,
             attributes={
                 'output_file': output_file,
                 'n_replicates': str(N_REPLICATES),
                 'n_iters': str(N_ITERS),
                 'image': str(BENCHMARK_IMAGE)
             })

resource_tasks = {}
for r in all_resources:
    j = b.new_job(f'create_resource_{r.name()}').cpu(4)
    j.command(f'hail-bench create-resources --data-dir benchmark-resources --group {r.name()}')
    j.command(f"time tar -cf {r.name()}.tar benchmark-resources/{r.name()} --exclude='*.crc'")
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='new-variants-plot-pca', backend=service_backend) dataproc.hail_dataproc_job( batch, 'plot_pca_and_loadings.py', max_age='2h', packages=['selenium'], init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name='new-variants-plot-pca', ) batch.run()
parser.add_argument('--scatter-count', type=int, default=50)
parser.add_argument('--out-dir', default='gs://african-seq-data')

args = parser.parse_args()

if args.local:
    backend = hb.LocalBackend()
else:
    backend = hb.ServiceBackend(billing_project=args.billing_project,
                                bucket=args.bucket)

ref_fasta_size = bytes_to_gb(args.ref_fasta)
ref_dict_size = bytes_to_gb(args.ref_dict)
ref_ind_size = bytes_to_gb(args.ref_index)

scatter = hb.Batch(backend=backend, name='scatter-interval-list')
calling_interval_list = scatter.read_input(args.calling_interval_list)
scatter_intervals = scatter_interval_list(
    b=scatter,
    interval_list_file=calling_interval_list,
    out_dir=args.out_dir,
    scatter_count=args.scatter_count)
scatter.run()

interval_files = hl.utils.hadoop_ls(f'{args.out_dir}/scatter-intervals/**')

var_call = hb.Batch(backend=backend, name='variant-calling')
fasta = var_call.read_input_group(
    **{
        'fasta': args.ref_fasta,
        'fasta.fai': args.ref_index,
"https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh37/whole_genome_SNVs.tsv.gz", "indels_url": "https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh37/InDels.tsv.gz", "version": "v1.6" }, "GRCh38": { "snvs_url": "https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz", "indels_url": "https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/gnomad.genomes.r3.0.indel.tsv.gz", "version": "v1.6" } } backend = hb.ServiceBackend(billing_project="hail-datasets-api") batch = hb.Batch(backend=backend, name=name) for build in ["GRCh37", "GRCh38"]: snvs_url = builds[build]["snvs_url"] indels_url = builds[build]["indels_url"] version = builds[build]["version"] j = batch.new_job(name=f"{name}_{version}_{build}") j.image("gcr.io/broad-ctsa/datasets:050521") j.command( "gcloud -q auth activate-service-account --key-file=/gsa-key/key.json") j.command( f"wget -c -O - {snvs_url} {indels_url} | " "zcat | " "grep -v '^#' | " """awk -v FS=$'\t' -v OFS=$'\t' 'BEGIN {print "chromosome","position","ref","alt","raw_score","PHRED_score"} {print $0}' | """ "bgzip -c | "
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='king-nfe', backend=service_backend) dataproc.hail_dataproc_job( batch, f'king_nfe.py', max_age='12h', num_secondary_workers=20, init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name=f'king-nfe', worker_boot_disk_size=200, ) batch.run()
"""Entry point for the analysis runner.""" import os import hail as hl import hailtop.batch as hb from analysis_runner import dataproc OUTPUT = os.getenv('OUTPUT') assert OUTPUT hl.init(default_reference='GRCh38') service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET') ) batch = hb.Batch(name='densified pca all samples', backend=service_backend) dataproc.hail_dataproc_job( batch, f'hgdp_1kg_tob_wgs_plot_pca_all_samples.py --output={OUTPUT}', max_age='1h', packages=['click', 'selenium'], init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'], job_name=f'densified pca all samples', ) batch.run()
import os
import sys

import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

OUTPUT = os.getenv('OUTPUT')
assert OUTPUT

hl.init(default_reference='GRCh38')

POP = sys.argv[1] if len(sys.argv) > 1 else 'nfe'

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name=f'{POP} project snp-chip', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    f'project_snp_chip_data.py --output={OUTPUT} --pop {POP}',
    max_age='5h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'],
    job_name=f'{POP}-project-snp-chip',
)

batch.run()
# TODO: Fill in the location of your demo image in GCR
# Fill this in when running LD-clumping on the service.
# This should look something like gcr.io/atgu-training/batch-demo-<user>:latest
BATCH_DEMO_IMAGE = 'gcr.io/atgu-training/batch-demo-jigold:latest'

# TODO: Fill in the name of <YOUR_BILLING_PROJECT> and <YOUR_BUCKET>
# Fill this in when running LD-clumping on the service.
# The billing project for the workshop is 'atgu-welcome-workshop'.
# The bucket is the name of the bucket that you configured your service account
# to have access to. Do not include the gs:// prefix.
# In the future, you can use hailctl config to set defaults for these parameters:
#   `hailctl config set batch/billing_project my-billing-project`
#   `hailctl config set batch/bucket my-bucket`
backend = hb.ServiceBackend(billing_project='atgu-welcome-workshop', bucket='batch-tmp-jigold')
batch = hb.Batch(backend=backend, name='clumping-demo')

# Define inputs
vcf = batch.read_input(args.vcf)

# TODO: We want to read the input file for the phenotypes and make it an InputResourceFile.
# Look at the vcf file above for an example of creating an InputResourceFile. The phenotypes
# file is passed as `args.phenotypes`.
phenotypes = batch.read_input(args.phenotypes)

# QC and compute gwas assoc results
# TODO: Fill in the argument parameters to the `run_gwas` function.
# This will add a new job to the Batch `batch` that runs a GWAS in Hail
# and exports the dataset to PLINK format. It also takes as arguments the batch to use, the name
# of the Docker image to use, a VCF file and a file with the phenotypes.
gwas = run_gwas(batch, BATCH_DEMO_IMAGE, vcf, phenotypes)

# Run PLINK clumping once per chromosome
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv("HAIL_BILLING_PROJECT"), bucket=os.getenv("HAIL_BUCKET")) batch = hb.Batch(name="calculate-maf", backend=service_backend) dataproc.hail_dataproc_job( batch, f"calculate_maf.py", max_age="12h", num_secondary_workers=20, init=["gs://cpg-reference/hail_dataproc/install_common.sh"], job_name=f"calculate_maf", worker_boot_disk_size=200, ) batch.run()
"""Run gnomad_loadings_90k_liftover.py using the analysis runner.""" import os import hail as hl import hailtop.batch as hb from analysis_runner import dataproc OUTPUT = os.getenv('OUTPUT') assert OUTPUT hl.init(default_reference='GRCh38') service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='gnomad loadings liftover', backend=service_backend) dataproc.hail_dataproc_job( batch, f'gnomad_loadings_90k_liftover.py --output={OUTPUT}', max_age='1h', packages=['click'], job_name='gnomad-loadings-liftover', ) batch.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='plot_snp_chip_pca', backend=service_backend) dataproc.hail_dataproc_job( batch, 'plot_tob_snp_chip_pca_only.py', max_age='1h', packages=['selenium'], init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name=f'plot_snp_chip_pca', ) batch.run()
"""Entry point for the analysis runner.""" import os import hail as hl import hailtop.batch as hb from analysis_runner import dataproc OUTPUT = os.getenv('OUTPUT') assert OUTPUT hl.init(default_reference='GRCh38') service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='densified loadings nfe', backend=service_backend) dataproc.hail_dataproc_job( batch, f'hgdp_1kg_tob_wgs_plot_loadings_nfe.py --output={OUTPUT}', max_age='1h', packages=['click', 'selenium'], init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'], job_name=f'densified loadings nfe', ) batch.run()
    f = b.read_input(ss_path)
    j = b.new_job(name=fname.split('.')[0])
    # treat header (which begins with "chr") as a comment
    j.command(f'tabix -s 1 -b 2 -e 2 -c chr {f}')
    j.command(f'mv {f}.tbi {j.ofile}')
    b.write_output(j.ofile, f'{out_dir}/{fname}.tbi')


if __name__ == "__main__":
    hl.init(log='/Users/nbaya/Downloads/tabix_sumstats.log')

    backend = hb.ServiceBackend(billing_project='ukb_diverse_pops',
                                bucket='ukbb-diverse-temp-30day/nb-batch-tmp')

    b = hb.Batch(
        name='tabix',
        backend=backend,
        default_image='gcr.io/ukbb-diversepops-neale/nbaya_tabix:latest',
        default_storage='100M',  # works with 2G
        default_cpu=1)

    # sumstats_dir = f'{bucket}/sumstats_flat_files'
    # sumstats_dir = f'{ldprune_dir}/export_results/update'
    # sumstats_dir = f'{ldprune_dir}/loo/sumstats/batch1'
    sumstats_dir = f'{ldprune_dir}/variant_qc'
    print(f'\nUsing sumstats from {sumstats_dir}')

    ss_path_list = get_ss_path_list(sumstats_dir=sumstats_dir)

    out_dir = f'{sumstats_dir}_tabix'
    print(f'\nSaving tabix files to {out_dir}\n')

    for ss_path in ss_path_list:
"""Run check_genotype.py using the analysis runner.""" import os import hail as hl import hailtop.batch as hb from analysis_runner import dataproc OUTPUT = os.getenv('OUTPUT') assert OUTPUT hl.init(default_reference='GRCh38') service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='check sample genotype', backend=service_backend) dataproc.hail_dataproc_job( batch, f'check_genotype.py --output={OUTPUT}', max_age='5h', num_secondary_workers=50, packages=['click'], job_name='check sample genotype', ) batch.run()
#!/usr/bin/env python3

"""Demonstrates the use of the dataproc module."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='dataproc example', backend=service_backend)

cluster = dataproc.setup_dataproc(
    batch,
    max_age='1h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    cluster_name='My Cluster with max-age=1h',
)
cluster.add_job('query.py', job_name='example')

# Don't wait, which avoids resubmissions if this job gets preempted.
batch.run(wait=False)
import os
import sys

import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

OUTPUT = os.getenv('OUTPUT')
assert OUTPUT

hl.init(default_reference='GRCh38')

POP = sys.argv[1] if len(sys.argv) > 1 else 'nfe'

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name=f'{POP} kccg-reprocessed', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    f'project_reprocessed_kccg_samples.py --output={OUTPUT} --pop {POP}',
    max_age='5h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'],
    job_name=f'{POP}-kccg-reprocessed',
)

batch.run()