def main(): """ Create a Hail Batch analysis-runner helper creates a DataProc cluster, add the job Set off the batch """ service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET'), ) # create a hail batch batch = hb.Batch(name='cohort_mt_extraction', backend=service_backend) _my_job = dataproc.hail_dataproc_job( batch=batch, script=' '.join(sys.argv[1:]), max_age='4h', job_name='extract_from_cohort_mt', num_secondary_workers=4, cluster_name='cohort_mt_extraction with max-age=4h', ) # noqa: F841 batch.run(wait=False)
import os

import hailtop.batch as hb
from analysis_runner import dataproc


def main(script: str, mt: str):
    """
    Runs a script inside Dataproc to execute VEP

    :param script: str, the path to the VEP main script
    :param mt: str, the path to the input MatrixTable
    """
    service_backend = hb.ServiceBackend(
        billing_project=os.getenv('HAIL_BILLING_PROJECT'),
        bucket=os.getenv('HAIL_BUCKET'),
    )

    # create a hail batch
    batch = hb.Batch(name='run_vep_in_dataproc_cluster', backend=service_backend)

    job = dataproc.hail_dataproc_job(
        batch=batch,
        worker_machine_type='n1-highmem-8',
        worker_boot_disk_size=200,
        secondary_worker_boot_disk_size=200,
        script=f'{script} --mt {mt}',
        max_age='12h',
        init=[
            'gs://cpg-reference/hail_dataproc/install_common.sh',
            'gs://cpg-reference/vep/vep-GRCh38.sh',
        ],
        job_name='run_vep',
        num_secondary_workers=20,
        num_workers=2,
        cluster_name='run vep',
    )
    job.cpu(2)
    job.memory('standard')
    job.storage('20G')

    batch.run(wait=False)
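# A minimal sketch of a command-line entry point for main() above, assuming a
# click-based interface; the '--script' and '--mt' option names are
# hypothetical and not part of the original file.
import click


@click.command()
@click.option('--script', help='path to the VEP main script to run on Dataproc')
@click.option('--mt', help='path to the input MatrixTable')
def cli(script: str, mt: str):
    """Thin wrapper that forwards the CLI options to main()."""
    main(script=script, mt=mt)


if __name__ == '__main__':
    cli()  # pylint: disable=no-value-for-parameter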
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='variant selection', backend=service_backend) dataproc.hail_dataproc_job( batch, 'variant_selection_qc_histogram.py', max_age='3h', packages=['click', 'selenium'], init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name='variant-selection-histogram', ) batch.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv("HAIL_BILLING_PROJECT"), bucket=os.getenv("HAIL_BUCKET")) batch = hb.Batch(name="calculate-maf", backend=service_backend) dataproc.hail_dataproc_job( batch, f"calculate_maf.py", max_age="12h", num_secondary_workers=20, init=["gs://cpg-reference/hail_dataproc/install_common.sh"], job_name=f"calculate_maf", worker_boot_disk_size=200, ) batch.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='pca_combined_tob_snp_chip', backend=service_backend) dataproc.hail_dataproc_job( batch, 'plot_pca_tob_wgs_snp_chip_datasets.py', max_age='1h', packages=['click', 'selenium'], init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name=f'pca_combined_tob_snp_chip', ) batch.run()
"""Run gnomad_loadings_90k_liftover.py using the analysis runner.""" import os import hail as hl import hailtop.batch as hb from analysis_runner import dataproc OUTPUT = os.getenv('OUTPUT') assert OUTPUT hl.init(default_reference='GRCh38') service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='gnomad loadings liftover', backend=service_backend) dataproc.hail_dataproc_job( batch, f'gnomad_loadings_90k_liftover.py --output={OUTPUT}', max_age='1h', packages=['click'], job_name='gnomad-loadings-liftover', ) batch.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='king-nfe', backend=service_backend) dataproc.hail_dataproc_job( batch, f'king_nfe.py', max_age='12h', num_secondary_workers=20, init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name=f'king-nfe', worker_boot_disk_size=200, ) batch.run()
"""Run check_genotype.py using the analysis runner.""" import os import hail as hl import hailtop.batch as hb from analysis_runner import dataproc OUTPUT = os.getenv('OUTPUT') assert OUTPUT hl.init(default_reference='GRCh38') service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='check sample genotype', backend=service_backend) dataproc.hail_dataproc_job( batch, f'check_genotype.py --output={OUTPUT}', max_age='5h', num_secondary_workers=50, packages=['click'], job_name='check sample genotype', ) batch.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='export_plink', backend=service_backend) dataproc.hail_dataproc_job( batch, f'export_plink.py', max_age='4h', num_secondary_workers=20, init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name=f'export_plink', worker_boot_disk_size=200, ) batch.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET') ) batch = hb.Batch(name='related_samples-save', backend=service_backend) dataproc.hail_dataproc_job( batch, f'hgdp_1kg_tob_wgs_related_samples.py', max_age='2h', init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name=f'related_samples-save', ) batch.run()
"""Entry point for the analysis runner.""" import os import hail as hl import hailtop.batch as hb from analysis_runner import dataproc OUTPUT = os.getenv('OUTPUT') assert OUTPUT hl.init(default_reference='GRCh38') service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET') ) batch = hb.Batch(name=f'densify_tobwgs_pca', backend=service_backend) dataproc.hail_dataproc_job( batch, f'hgdp_1kg_tob_wgs_densified_pca.py --output={OUTPUT}', max_age='12h', num_secondary_workers=20, packages=['click'], job_name=f'densify_tobwgs_pca', worker_boot_disk_size=200, ) batch.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name=f'increase_partitions', backend=service_backend) dataproc.hail_dataproc_job( batch, 'increase_snp_chip_partitions.py', max_age='2h', num_workers=20, init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name=f'increase_partitions', ) batch.run()
import os
import sys
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

OUTPUT = os.getenv('OUTPUT')
assert OUTPUT

hl.init(default_reference='GRCh38')

# population label, taken from the first command-line argument (defaults to 'nfe')
POP = sys.argv[1] if len(sys.argv) > 1 else 'nfe'

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'),
)

batch = hb.Batch(name=f'{POP} pca', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    f'hgdp_1kg_tob_wgs_pop_pca_densified.py --output={OUTPUT} --pop {POP}',
    max_age='4h',
    num_secondary_workers=20,
    packages=['click'],
    job_name=f'{POP}-pca-densified',
)

batch.run()
import os
import sys
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

OUTPUT = os.getenv('OUTPUT')
assert OUTPUT

hl.init(default_reference='GRCh38')

# population label, taken from the first command-line argument (defaults to 'nfe')
POP = sys.argv[1] if len(sys.argv) > 1 else 'nfe'

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'),
)

batch = hb.Batch(name=f'{POP} pca', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    f'hgdp_1kg_tob_wgs_pop.py --output={OUTPUT} --pop {POP}',
    max_age='24h',
    num_workers=50,
    packages=['click'],
    job_name=f'{POP}-pca',
)

batch.run()
"""Run hgdp_1kg_tob_wgs_variant_selection.py using the analysis runner.""" import os import hail as hl import hailtop.batch as hb from analysis_runner import dataproc OUTPUT = os.getenv('OUTPUT') assert OUTPUT hl.init(default_reference='GRCh38') service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET') ) batch = hb.Batch(name='variant selection exploration', backend=service_backend) dataproc.hail_dataproc_job( batch, f'hgdp_1kg_tob_wgs_variant_selection_exploration.py --output={OUTPUT}', max_age='12h', num_secondary_workers=20, packages=['click', 'selenium'], init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'], job_name='variant-selection-exploration', ) batch.run()
import os
import sys
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

OUTPUT = os.getenv('OUTPUT')
assert OUTPUT

hl.init(default_reference='GRCh38')

# population label, taken from the first command-line argument (defaults to 'nfe')
POP = sys.argv[1] if len(sys.argv) > 1 else 'nfe'

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'),
)

batch = hb.Batch(name=f'{POP} project snp-chip', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    f'project_snp_chip_data.py --output={OUTPUT} --pop {POP}',
    max_age='5h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'],
    job_name=f'{POP}-project-snp-chip',
)

batch.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name=f'snp_chip_variants_pca', backend=service_backend) dataproc.hail_dataproc_job( batch, 'snp_chip_generate_pca.py', max_age='12h', num_secondary_workers=20, packages=['click'], init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name=f'snp_chip_variants_pca', ) batch.run()
"""Run hgdp_1kg_tob_wgs_variant_selection.py using the analysis runner.""" import os import hail as hl import hailtop.batch as hb from analysis_runner import dataproc OUTPUT = os.getenv('OUTPUT') assert OUTPUT hl.init(default_reference='GRCh38') service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='variant selection', backend=service_backend) dataproc.hail_dataproc_job( batch, f'hgdp_1kg_tob_wgs_variant_selection.py --output={OUTPUT}', max_age='12h', num_secondary_workers=20, packages=['click'], job_name='variant-selection', ) batch.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET') ) batch = hb.Batch(name='new-variants-plot-pca-nfe', backend=service_backend) dataproc.hail_dataproc_job( batch, f'hgdp_1kg_tob_wgs_plot_pca_nfe.py', max_age='1h', packages=['selenium'], init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name='new-variants-plot-pca-nfe', ) batch.run()
"""Entry point for the analysis runner.""" import os import sys import hailtop.batch as hb from analysis_runner import dataproc POP = sys.argv[1] if len(sys.argv) > 1 else 'nfe' service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name=f'{POP} pca', backend=service_backend) dataproc.hail_dataproc_job( batch, f'hgdp_1kg_tob_wgs_pop_pca_densified.py --pop {POP}', max_age='4h', num_secondary_workers=20, packages=['click'], init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name=f'{POP}-pca-new-variants', worker_boot_disk_size=200, ) batch.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='new-variants-plot-pca', backend=service_backend) dataproc.hail_dataproc_job( batch, 'plot_pca_and_loadings.py', max_age='2h', packages=['selenium'], init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name='new-variants-plot-pca', ) batch.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='tob-wgs-scree', backend=service_backend) dataproc.hail_dataproc_job( batch, 'tob_wgs_scree_plot.py', max_age='3h', packages=['selenium'], init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name='tob-wgs-scree', ) batch.run()
import os
import sys
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

OUTPUT = os.getenv('OUTPUT')
assert OUTPUT

hl.init(default_reference='GRCh38')

# population label, taken from the first command-line argument (defaults to 'nfe')
POP = sys.argv[1] if len(sys.argv) > 1 else 'nfe'

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'),
)

batch = hb.Batch(name=f'{POP} kccg-reprocessed', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    f'project_reprocessed_kccg_samples.py --output={OUTPUT} --pop {POP}',
    max_age='5h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'],
    job_name=f'{POP}-kccg-reprocessed',
)

batch.run()
"""Run hgdp_1kg_pca_10k_random.py using the analysis runner.""" import os import hail as hl import hailtop.batch as hb from analysis_runner import dataproc OUTPUT = os.getenv('OUTPUT') assert OUTPUT hl.init(default_reference='GRCh38') service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='generate PCA 10k', backend=service_backend) dataproc.hail_dataproc_job( batch, f'hgdp_1kg_pca_10k_random.py --output={OUTPUT}', max_age='15h', num_secondary_workers=100, packages=['click'], job_name='PCA-loadings', ) batch.run()
"""Entry point for the analysis runner.""" import os import hail as hl import hailtop.batch as hb from analysis_runner import dataproc OUTPUT = os.getenv('OUTPUT') assert OUTPUT hl.init(default_reference='GRCh38') service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='densified loadings nfe', backend=service_backend) dataproc.hail_dataproc_job( batch, f'hgdp_1kg_tob_wgs_plot_loadings_nfe.py --output={OUTPUT}', max_age='1h', packages=['click', 'selenium'], init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'], job_name=f'densified loadings nfe', ) batch.run()
"""Run hgdp_1kg_tob_wgs_pca.py using the analysis runner.""" import os import hail as hl import hailtop.batch as hb from analysis_runner import dataproc OUTPUT = os.getenv('OUTPUT') assert OUTPUT hl.init(default_reference='GRCh38') service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='hgdp1kg tobwgs pca', backend=service_backend) dataproc.hail_dataproc_job( batch, f'hgdp_1kg_tob_wgs_pca.py --output={OUTPUT}', max_age='24h', num_workers=50, packages=['click'], job_name='hgdp1kg-tobwgs-pca', ) batch.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET') ) batch = hb.Batch(name='nfe-pca-no-outliers', backend=service_backend) dataproc.hail_dataproc_job( batch, f'hgdp_1kg_tob_wgs_nfe_pca_no_outliers.py', max_age='4h', num_secondary_workers=20, init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name=f'nfe-pca-no-outliers', worker_boot_disk_size=200, ) batch.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='plot-loadings-nfe-no-outliers', backend=service_backend) dataproc.hail_dataproc_job( batch, f'hgdp_1kg_tob_wgs_plot_loadings_nfe_no_outliers.py', max_age='4h', num_secondary_workers=20, packages=['selenium'], init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name=f'plot-loadings-nfe-no-outliers', ) batch.run()
"""Entry point for the analysis runner.""" import os import hailtop.batch as hb from analysis_runner import dataproc service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='plot_snp_chip_pca', backend=service_backend) dataproc.hail_dataproc_job( batch, 'plot_tob_snp_chip_pca_only.py', max_age='1h', packages=['selenium'], init=['gs://cpg-reference/hail_dataproc/install_common.sh'], job_name=f'plot_snp_chip_pca', ) batch.run()
"""Run hgdp_1kg_ld_prune.py using the analysis runner.""" import os import hail as hl import hailtop.batch as hb from analysis_runner import dataproc OUTPUT = os.getenv('OUTPUT') assert OUTPUT hl.init(default_reference='GRCh38') service_backend = hb.ServiceBackend( billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')) batch = hb.Batch(name='ld pruning', backend=service_backend) dataproc.hail_dataproc_job( batch, f'hgdp_1kg_ld_prune.py --output={OUTPUT}', max_age='5h', num_secondary_workers=100, packages=['click', 'gnomad'], job_name='ld-prune', worker_boot_disk_size=200, ) batch.run()