示例#1
0
def main():
    """
    Run the script given on the command line as a Dataproc job in a Hail Batch.

    Creates a Hail Batch on a service backend configured from the
    HAIL_BILLING_PROJECT and HAIL_BUCKET environment variables, uses the
    analysis-runner Dataproc helper to add the job, then sets off the batch
    with ``wait=False``.
    """

    service_backend = hb.ServiceBackend(
        billing_project=os.getenv('HAIL_BILLING_PROJECT'),
        bucket=os.getenv('HAIL_BUCKET'),
    )

    # create a hail batch
    batch = hb.Batch(name='cohort_mt_extraction', backend=service_backend)

    # All command-line arguments after the program name are joined with
    # spaces and handed to the helper as the script string. The job returned
    # by the helper is not used afterwards, so it is not bound to a name.
    dataproc.hail_dataproc_job(
        batch=batch,
        script=' '.join(sys.argv[1:]),
        max_age='4h',
        job_name='extract_from_cohort_mt',
        num_secondary_workers=4,
        cluster_name='cohort_mt_extraction with max-age=4h',
    )

    batch.run(wait=False)
示例#2
0
def main(script: str, mt: str):
    """
    Run a VEP script as a Dataproc job inside a Hail Batch.

    :param script: str, the path to the VEP main script
    :param mt: str, value forwarded to the script through its ``--mt`` option
    """
    # Service backend settings are read from the environment.
    billing_project = os.getenv('HAIL_BILLING_PROJECT')
    bucket = os.getenv('HAIL_BUCKET')
    backend = hb.ServiceBackend(billing_project=billing_project, bucket=bucket)

    vep_batch = hb.Batch(name='run_vep_in_dataproc_cluster', backend=backend)

    # Initialisation scripts passed to the Dataproc helper.
    init_scripts = [
        'gs://cpg-reference/hail_dataproc/install_common.sh',
        'gs://cpg-reference/vep/vep-GRCh38.sh',
    ]
    command = f'{script} --mt {mt}'

    vep_job = dataproc.hail_dataproc_job(
        batch=vep_batch,
        worker_machine_type='n1-highmem-8',
        worker_boot_disk_size=200,
        secondary_worker_boot_disk_size=200,
        script=command,
        max_age='12h',
        init=init_scripts,
        job_name='run_vep',
        num_secondary_workers=20,
        num_workers=2,
        cluster_name='run vep',
    )

    # Resource settings applied to the job returned by the helper.
    vep_job.cpu(2)
    vep_job.memory('standard')
    vep_job.storage('20G')

    vep_batch.run(wait=False)
示例#3
0
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

# Hail Batch service settings are read from the environment.
billing_project = os.getenv('HAIL_BILLING_PROJECT')
bucket = os.getenv('HAIL_BUCKET')
service_backend = hb.ServiceBackend(billing_project=billing_project, bucket=bucket)

batch = hb.Batch(name='variant selection', backend=service_backend)

# Keyword options for the Dataproc job, kept in the original order.
job_options = {
    'max_age': '3h',
    'packages': ['click', 'selenium'],
    'init': ['gs://cpg-reference/hail_dataproc/install_common.sh'],
    'job_name': 'variant-selection-histogram',
}

# Add the Dataproc job that runs the QC histogram script, then run the batch.
dataproc.hail_dataproc_job(batch, 'variant_selection_qc_histogram.py', **job_options)

batch.run()
示例#4
0
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv("HAIL_BILLING_PROJECT"),
    bucket=os.getenv("HAIL_BUCKET"))

batch = hb.Batch(name="calculate-maf", backend=service_backend)

# Add a Dataproc job that runs calculate_maf.py.
dataproc.hail_dataproc_job(
    batch,
    "calculate_maf.py",
    max_age="12h",
    num_secondary_workers=20,
    init=["gs://cpg-reference/hail_dataproc/install_common.sh"],
    job_name="calculate_maf",
    worker_boot_disk_size=200,
)

batch.run()
示例#5
0
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='pca_combined_tob_snp_chip', backend=service_backend)

# Add a Dataproc job that runs plot_pca_tob_wgs_snp_chip_datasets.py.
dataproc.hail_dataproc_job(
    batch,
    'plot_pca_tob_wgs_snp_chip_datasets.py',
    max_age='1h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name='pca_combined_tob_snp_chip',
)

batch.run()
示例#6
0
"""Run gnomad_loadings_90k_liftover.py using the analysis runner."""

import os
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

# Output location forwarded to the Dataproc script through --output.
OUTPUT = os.getenv('OUTPUT')
if not OUTPUT:
    # Raise explicitly: an `assert` would be stripped under `python -O`.
    raise ValueError('OUTPUT environment variable must be set')

hl.init(default_reference='GRCh38')

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='gnomad loadings liftover', backend=service_backend)

# Add a Dataproc job that runs gnomad_loadings_90k_liftover.py.
dataproc.hail_dataproc_job(
    batch,
    f'gnomad_loadings_90k_liftover.py --output={OUTPUT}',
    max_age='1h',
    packages=['click'],
    job_name='gnomad-loadings-liftover',
)

batch.run()
示例#7
0
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='king-nfe', backend=service_backend)

# Add a Dataproc job that runs king_nfe.py.
dataproc.hail_dataproc_job(
    batch,
    'king_nfe.py',
    max_age='12h',
    num_secondary_workers=20,
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name='king-nfe',
    worker_boot_disk_size=200,
)

batch.run()
示例#8
0
"""Run check_genotype.py using the analysis runner."""

import os
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

# Output location forwarded to the Dataproc script through --output.
OUTPUT = os.getenv('OUTPUT')
if not OUTPUT:
    # Raise explicitly: an `assert` would be stripped under `python -O`.
    raise ValueError('OUTPUT environment variable must be set')

hl.init(default_reference='GRCh38')

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='check sample genotype', backend=service_backend)

# Add a Dataproc job that runs check_genotype.py.
dataproc.hail_dataproc_job(
    batch,
    f'check_genotype.py --output={OUTPUT}',
    max_age='5h',
    num_secondary_workers=50,
    packages=['click'],
    job_name='check sample genotype',
)

batch.run()
示例#9
0
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='export_plink', backend=service_backend)

# Add a Dataproc job that runs export_plink.py.
dataproc.hail_dataproc_job(
    batch,
    'export_plink.py',
    max_age='4h',
    num_secondary_workers=20,
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name='export_plink',
    worker_boot_disk_size=200,
)

batch.run()
示例#10
0
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')
)

batch = hb.Batch(name='related_samples-save', backend=service_backend)

# Add a Dataproc job that runs hgdp_1kg_tob_wgs_related_samples.py.
dataproc.hail_dataproc_job(
    batch,
    'hgdp_1kg_tob_wgs_related_samples.py',
    max_age='2h',
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name='related_samples-save',
)

batch.run()
示例#11
0
"""Entry point for the analysis runner."""

import os
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

# Output location forwarded to the Dataproc script through --output.
OUTPUT = os.getenv('OUTPUT')
if not OUTPUT:
    # Raise explicitly: an `assert` would be stripped under `python -O`.
    raise ValueError('OUTPUT environment variable must be set')

hl.init(default_reference='GRCh38')

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')
)

batch = hb.Batch(name='densify_tobwgs_pca', backend=service_backend)

# Add a Dataproc job that runs hgdp_1kg_tob_wgs_densified_pca.py.
dataproc.hail_dataproc_job(
    batch,
    f'hgdp_1kg_tob_wgs_densified_pca.py --output={OUTPUT}',
    max_age='12h',
    num_secondary_workers=20,
    packages=['click'],
    job_name='densify_tobwgs_pca',
    worker_boot_disk_size=200,
)

batch.run()
示例#12
0
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='increase_partitions', backend=service_backend)

# Add a Dataproc job that runs increase_snp_chip_partitions.py.
dataproc.hail_dataproc_job(
    batch,
    'increase_snp_chip_partitions.py',
    max_age='2h',
    num_workers=20,
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name='increase_partitions',
)

batch.run()
示例#13
0
import os
import sys
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

# Output location forwarded to the Dataproc script through --output.
OUTPUT = os.getenv('OUTPUT')
if not OUTPUT:
    # Raise explicitly: an `assert` would be stripped under `python -O`.
    raise ValueError('OUTPUT environment variable must be set')

hl.init(default_reference='GRCh38')

# Population label: first command-line argument, defaulting to 'nfe'.
POP = sys.argv[1] if len(sys.argv) > 1 else 'nfe'

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name=f'{POP} pca', backend=service_backend)

# Add a Dataproc job that runs hgdp_1kg_tob_wgs_pop_pca_densified.py.
dataproc.hail_dataproc_job(
    batch,
    f'hgdp_1kg_tob_wgs_pop_pca_densified.py --output={OUTPUT} --pop {POP}',
    max_age='4h',
    num_secondary_workers=20,
    packages=['click'],
    job_name=f'{POP}-pca-densified',
)

batch.run()
示例#14
0
import os
import sys
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

# Output location forwarded to the Dataproc script through --output.
OUTPUT = os.getenv('OUTPUT')
if not OUTPUT:
    # Raise explicitly: an `assert` would be stripped under `python -O`.
    raise ValueError('OUTPUT environment variable must be set')

hl.init(default_reference='GRCh38')

# Population label: first command-line argument, defaulting to 'nfe'.
POP = sys.argv[1] if len(sys.argv) > 1 else 'nfe'

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name=f'{POP} pca', backend=service_backend)

# Add a Dataproc job that runs hgdp_1kg_tob_wgs_pop.py.
dataproc.hail_dataproc_job(
    batch,
    f'hgdp_1kg_tob_wgs_pop.py --output={OUTPUT} --pop {POP}',
    max_age='24h',
    num_workers=50,
    packages=['click'],
    job_name=f'{POP}-pca',
)

batch.run()
示例#15
0
"""Run hgdp_1kg_tob_wgs_variant_selection_exploration.py using the analysis runner."""

import os
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

# Output location forwarded to the Dataproc script through --output.
OUTPUT = os.getenv('OUTPUT')
if not OUTPUT:
    # Raise explicitly: an `assert` would be stripped under `python -O`.
    raise ValueError('OUTPUT environment variable must be set')

hl.init(default_reference='GRCh38')

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')
)

batch = hb.Batch(name='variant selection exploration', backend=service_backend)

# Add a Dataproc job that runs
# hgdp_1kg_tob_wgs_variant_selection_exploration.py.
dataproc.hail_dataproc_job(
    batch,
    f'hgdp_1kg_tob_wgs_variant_selection_exploration.py --output={OUTPUT}',
    max_age='12h',
    num_secondary_workers=20,
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'],
    job_name='variant-selection-exploration',
)

batch.run()
示例#16
0
import os
import sys
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

# Output location forwarded to the Dataproc script through --output.
OUTPUT = os.getenv('OUTPUT')
if not OUTPUT:
    # Raise explicitly: an `assert` would be stripped under `python -O`.
    raise ValueError('OUTPUT environment variable must be set')

hl.init(default_reference='GRCh38')

# Population label: first command-line argument, defaulting to 'nfe'.
POP = sys.argv[1] if len(sys.argv) > 1 else 'nfe'

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name=f'{POP} project snp-chip', backend=service_backend)

# Add a Dataproc job that runs project_snp_chip_data.py.
dataproc.hail_dataproc_job(
    batch,
    f'project_snp_chip_data.py --output={OUTPUT} --pop {POP}',
    max_age='5h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'],
    job_name=f'{POP}-project-snp-chip',
)

batch.run()
示例#17
0
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='snp_chip_variants_pca', backend=service_backend)

# Add a Dataproc job that runs snp_chip_generate_pca.py.
dataproc.hail_dataproc_job(
    batch,
    'snp_chip_generate_pca.py',
    max_age='12h',
    num_secondary_workers=20,
    packages=['click'],
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name='snp_chip_variants_pca',
)

batch.run()
示例#18
0
"""Run hgdp_1kg_tob_wgs_variant_selection.py using the analysis runner."""

import os
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

# Output location forwarded to the Dataproc script through --output.
OUTPUT = os.getenv('OUTPUT')
if not OUTPUT:
    # Raise explicitly: an `assert` would be stripped under `python -O`.
    raise ValueError('OUTPUT environment variable must be set')

hl.init(default_reference='GRCh38')

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='variant selection', backend=service_backend)

# Add a Dataproc job that runs hgdp_1kg_tob_wgs_variant_selection.py.
dataproc.hail_dataproc_job(
    batch,
    f'hgdp_1kg_tob_wgs_variant_selection.py --output={OUTPUT}',
    max_age='12h',
    num_secondary_workers=20,
    packages=['click'],
    job_name='variant-selection',
)

batch.run()
示例#19
0
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')
)

batch = hb.Batch(name='new-variants-plot-pca-nfe', backend=service_backend)

# Add a Dataproc job that runs hgdp_1kg_tob_wgs_plot_pca_nfe.py.
dataproc.hail_dataproc_job(
    batch,
    'hgdp_1kg_tob_wgs_plot_pca_nfe.py',
    max_age='1h',
    packages=['selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name='new-variants-plot-pca-nfe',
)

batch.run()
示例#20
0
"""Entry point for the analysis runner."""

import os
import sys
import hailtop.batch as hb
from analysis_runner import dataproc

# Population label: taken from the first command-line argument when one is
# given, otherwise 'nfe'.
if len(sys.argv) > 1:
    POP = sys.argv[1]
else:
    POP = 'nfe'

# Hail Batch service settings are read from the environment.
billing_project = os.getenv('HAIL_BILLING_PROJECT')
bucket = os.getenv('HAIL_BUCKET')
service_backend = hb.ServiceBackend(billing_project=billing_project, bucket=bucket)

batch_name = f'{POP} pca'
batch = hb.Batch(name=batch_name, backend=service_backend)

# Script string and keyword options for the Dataproc job (original order kept).
script_command = f'hgdp_1kg_tob_wgs_pop_pca_densified.py --pop {POP}'
job_options = {
    'max_age': '4h',
    'num_secondary_workers': 20,
    'packages': ['click'],
    'init': ['gs://cpg-reference/hail_dataproc/install_common.sh'],
    'job_name': f'{POP}-pca-new-variants',
    'worker_boot_disk_size': 200,
}
dataproc.hail_dataproc_job(batch, script_command, **job_options)

batch.run()
示例#21
0
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

# Hail Batch service settings are read from the environment.
billing_project = os.getenv('HAIL_BILLING_PROJECT')
bucket = os.getenv('HAIL_BUCKET')
service_backend = hb.ServiceBackend(billing_project=billing_project, bucket=bucket)

batch = hb.Batch(name='new-variants-plot-pca', backend=service_backend)

# Keyword options for the Dataproc job, kept in the original order.
job_options = {
    'max_age': '2h',
    'packages': ['selenium'],
    'init': ['gs://cpg-reference/hail_dataproc/install_common.sh'],
    'job_name': 'new-variants-plot-pca',
}

# Add the Dataproc job that runs plot_pca_and_loadings.py, then run the batch.
dataproc.hail_dataproc_job(batch, 'plot_pca_and_loadings.py', **job_options)

batch.run()
示例#22
0
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

# Hail Batch service settings are read from the environment.
billing_project = os.getenv('HAIL_BILLING_PROJECT')
bucket = os.getenv('HAIL_BUCKET')
service_backend = hb.ServiceBackend(billing_project=billing_project, bucket=bucket)

batch = hb.Batch(name='tob-wgs-scree', backend=service_backend)

# Keyword options for the Dataproc job, kept in the original order.
job_options = {
    'max_age': '3h',
    'packages': ['selenium'],
    'init': ['gs://cpg-reference/hail_dataproc/install_common.sh'],
    'job_name': 'tob-wgs-scree',
}

# Add the Dataproc job that runs tob_wgs_scree_plot.py, then run the batch.
dataproc.hail_dataproc_job(batch, 'tob_wgs_scree_plot.py', **job_options)

batch.run()
示例#23
0
import os
import sys
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

# Output location forwarded to the Dataproc script through --output.
OUTPUT = os.getenv('OUTPUT')
if not OUTPUT:
    # Raise explicitly: an `assert` would be stripped under `python -O`.
    raise ValueError('OUTPUT environment variable must be set')

hl.init(default_reference='GRCh38')

# Population label: first command-line argument, defaulting to 'nfe'.
POP = sys.argv[1] if len(sys.argv) > 1 else 'nfe'

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name=f'{POP} kccg-reprocessed', backend=service_backend)

# Add a Dataproc job that runs project_reprocessed_kccg_samples.py.
dataproc.hail_dataproc_job(
    batch,
    f'project_reprocessed_kccg_samples.py --output={OUTPUT} --pop {POP}',
    max_age='5h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'],
    job_name=f'{POP}-kccg-reprocessed',
)

batch.run()
示例#24
0
"""Run hgdp_1kg_pca_10k_random.py using the analysis runner."""

import os
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

# Output location forwarded to the Dataproc script through --output.
OUTPUT = os.getenv('OUTPUT')
if not OUTPUT:
    # Raise explicitly: an `assert` would be stripped under `python -O`.
    raise ValueError('OUTPUT environment variable must be set')

hl.init(default_reference='GRCh38')

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='generate PCA 10k', backend=service_backend)

# Add a Dataproc job that runs hgdp_1kg_pca_10k_random.py.
dataproc.hail_dataproc_job(
    batch,
    f'hgdp_1kg_pca_10k_random.py --output={OUTPUT}',
    max_age='15h',
    num_secondary_workers=100,
    packages=['click'],
    job_name='PCA-loadings',
)

batch.run()
示例#25
0
"""Entry point for the analysis runner."""

import os
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

# Output location forwarded to the Dataproc script through --output.
OUTPUT = os.getenv('OUTPUT')
if not OUTPUT:
    # Raise explicitly: an `assert` would be stripped under `python -O`.
    raise ValueError('OUTPUT environment variable must be set')

hl.init(default_reference='GRCh38')

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='densified loadings nfe', backend=service_backend)

# Add a Dataproc job that runs hgdp_1kg_tob_wgs_plot_loadings_nfe.py.
dataproc.hail_dataproc_job(
    batch,
    f'hgdp_1kg_tob_wgs_plot_loadings_nfe.py --output={OUTPUT}',
    max_age='1h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'],
    job_name='densified loadings nfe',
)

batch.run()
示例#26
0
"""Run hgdp_1kg_tob_wgs_pca.py using the analysis runner."""

import os
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

# Output location forwarded to the Dataproc script through --output.
OUTPUT = os.getenv('OUTPUT')
if not OUTPUT:
    # Raise explicitly: an `assert` would be stripped under `python -O`.
    raise ValueError('OUTPUT environment variable must be set')

hl.init(default_reference='GRCh38')

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='hgdp1kg tobwgs pca', backend=service_backend)

# Add a Dataproc job that runs hgdp_1kg_tob_wgs_pca.py.
dataproc.hail_dataproc_job(
    batch,
    f'hgdp_1kg_tob_wgs_pca.py --output={OUTPUT}',
    max_age='24h',
    num_workers=50,
    packages=['click'],
    job_name='hgdp1kg-tobwgs-pca',
)

batch.run()
示例#27
0
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')
)

batch = hb.Batch(name='nfe-pca-no-outliers', backend=service_backend)

# Add a Dataproc job that runs hgdp_1kg_tob_wgs_nfe_pca_no_outliers.py.
dataproc.hail_dataproc_job(
    batch,
    'hgdp_1kg_tob_wgs_nfe_pca_no_outliers.py',
    max_age='4h',
    num_secondary_workers=20,
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name='nfe-pca-no-outliers',
    worker_boot_disk_size=200,
)

batch.run()
示例#28
0
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='plot-loadings-nfe-no-outliers', backend=service_backend)

# Add a Dataproc job that runs
# hgdp_1kg_tob_wgs_plot_loadings_nfe_no_outliers.py.
dataproc.hail_dataproc_job(
    batch,
    'hgdp_1kg_tob_wgs_plot_loadings_nfe_no_outliers.py',
    max_age='4h',
    num_secondary_workers=20,
    packages=['selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name='plot-loadings-nfe-no-outliers',
)

batch.run()
示例#29
0
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='plot_snp_chip_pca', backend=service_backend)

# Add a Dataproc job that runs plot_tob_snp_chip_pca_only.py.
dataproc.hail_dataproc_job(
    batch,
    'plot_tob_snp_chip_pca_only.py',
    max_age='1h',
    packages=['selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name='plot_snp_chip_pca',
)

batch.run()
示例#30
0
"""Run hgdp_1kg_ld_prune.py using the analysis runner."""

import os
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

# Output location forwarded to the Dataproc script through --output.
OUTPUT = os.getenv('OUTPUT')
if not OUTPUT:
    # Raise explicitly: an `assert` would be stripped under `python -O`.
    raise ValueError('OUTPUT environment variable must be set')

hl.init(default_reference='GRCh38')

# Hail Batch service backend, configured from environment variables.
service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='ld pruning', backend=service_backend)

# Add a Dataproc job that runs hgdp_1kg_ld_prune.py.
dataproc.hail_dataproc_job(
    batch,
    f'hgdp_1kg_ld_prune.py --output={OUTPUT}',
    max_age='5h',
    num_secondary_workers=100,
    packages=['click', 'gnomad'],
    job_name='ld-prune',
    worker_boot_disk_size=200,
)

batch.run()