Example #1
def stress():
    b = hb.Batch(name='stress',
                 backend=hb.ServiceBackend(billing_project='hail'),
                 default_image=DOCKER_ROOT_IMAGE)

    for i in range(100):
        j = b.new_job(name=f'parent_{i}')
        d = random.choice(range(4))
        if flip(0.2):
            j.command(f'sleep {d}; exit 1')
        else:
            j.command(f'sleep {d}; echo parent {i}')

        for k in range(10):
            d = random.choice(range(4))
            c = b.new_job(name=f'child_{i}_{k}').command(
                f'sleep {d}; echo child {i} {k}')
            c.depends_on(j)
            if flip(0.1):
                c._always_run = True
            if flip(0.01):
                c._machine_type = 'n1-standard-1'
                if flip(0.5):
                    c._preemptible = False

    b.run(open=False, wait=False)
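
The stress test above assumes a small `flip(p)` helper from the surrounding module (along with `random`, `hailtop.batch as hb`, and `DOCKER_ROOT_IMAGE`). A minimal sketch consistent with how it is called:

import random

def flip(p):
    # Hypothetical sketch: return True with probability p, as used by the stress test above.
    return random.random() <= p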
Example #2
def main(df_x_path, df_y_path, output_path, python_image):
    backend = hb.ServiceBackend()
    b = hb.Batch(name='rf-loo', backend=backend, default_python_image=python_image)

    with hl.hadoop_open(df_y_path) as f:
        local_df_y = pd.read_table(f, header=0, index_col=0)

    df_x_input = b.read_input(df_x_path)
    df_y_input = b.read_input(df_y_path)

    results = []

    for window in local_df_y.index.to_list():
        checkpoint = checkpoint_path(window)
        if hl.hadoop_exists(checkpoint):
            result = b.read_input(checkpoint)
            results.append(result)
            continue

        j = b.new_python_job()

        result = j.call(random_forest, df_x_input, df_y_input, window)
        tsv_result = j.call(as_tsv, result)
        tsv_result = tsv_result.as_str()

        b.write_output(tsv_result, checkpoint)
        results.append(tsv_result)

    output = hb.concatenate(b, results)
    b.write_output(output, output_path)

    b.run(wait=False)
    backend.close()
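
checkpoint_path, random_forest, and as_tsv are defined elsewhere in the original module. A minimal sketch of the two lightweight helpers, assuming random_forest returns an iterable of per-window values and using a placeholder bucket path:

def checkpoint_path(window):
    # Hypothetical sketch: GCS location where the per-window result is checkpointed.
    return f'gs://my-bucket/rf-loo/checkpoints/{window}.tsv'

def as_tsv(result):
    # Hypothetical sketch: serialise one window's result as a single tab-separated line.
    return '\t'.join(str(x) for x in result)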
Example #3
@contextlib.contextmanager  # needed for the `with run_batch(args) as batch:` usage shown in the docstring
def run_batch(args, batch_name=None):
    """Wrapper around creating, running, and then closing a Batch run.

    :param args: Parsed args from the ArgumentParser created via the init_arg_parser method
    :param batch_name: (optional) batch label which will show up in the Batch web UI

    Usage:
        with run_batch(args) as batch:
            ... batch job definitions ...
    """

    if args.local:
        backend = (hb.LocalBackend() if args.raw else hb.LocalBackend(
            gsa_key_file=args.gsa_key_file))
    else:
        backend = hb.ServiceBackend(billing_project=args.batch_billing_project,
                                    bucket=args.batch_temp_bucket)

    try:
        batch = hb.Batch(backend=backend, name=batch_name)

        batch.batch_utils_temp_bucket = args.batch_temp_bucket

        yield batch  # returned to with ... as batch:

        # run on end of with..: block
        batch.run(dry_run=args.dry_run, verbose=args.verbose)

    finally:
        if isinstance(backend, hb.ServiceBackend):
            backend.close()
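
With the contextlib.contextmanager decorator in place, the wrapper is used exactly as its docstring describes. A minimal usage sketch (the job itself is only illustrative):

with run_batch(args, batch_name='my-analysis') as batch:
    j = batch.new_job(name='say-hello')
    j.command('echo hello')
# batch.run(...) fires automatically when the with-block exits,
# and a ServiceBackend is closed in the finally clause.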
Example #4
def validate_all_objects_in_directory(gs_dir):
    """Validate files with MD5s in the provided gs directory"""
    backend = hb.ServiceBackend(
        billing_project=os.getenv('HAIL_BILLING_PROJECT'),
        bucket=os.getenv('HAIL_BUCKET'),
    )
    b = hb.Batch('validate_md5s', backend=backend)
    client = storage.Client()

    if not gs_dir.startswith('gs://'):
        raise ValueError(f'Expected GS directory, got: {gs_dir}')

    bucket_name, *components = gs_dir[5:].split('/')

    blobs = client.list_blobs(bucket_name, prefix='/'.join(components))
    files: Set[str] = {f'gs://{bucket_name}/{blob.name}' for blob in blobs}
    for obj in files:
        if obj.endswith('.md5'):
            continue
        if f'{obj}.md5' not in files:
            continue

        job = b.new_job(f'validate_{os.path.basename(obj)}')
        job.image(DRIVER_IMAGE)
        validate_md5(job, obj)

    b.run(wait=False)
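
validate_md5 comes from the surrounding module and is not shown. A hypothetical sketch of what such a helper might add to the job, assuming the .md5 object next to each file holds the expected hex digest:

def validate_md5(job, gs_path):
    # Hypothetical sketch: fail the job if the object's md5sum differs from the .md5 file.
    job.command('gcloud -q auth activate-service-account --key-file=/gsa-key/key.json')
    job.command(
        f'actual=$(gsutil cat {gs_path} | md5sum | cut -d " " -f 1); '
        f'expected=$(gsutil cat {gs_path}.md5 | cut -d " " -f 1); '
        f'test "$actual" = "$expected"'
    )
    return job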
Example #5
def main():
    """
    Create a Hail Batch
    analysis-runner helper creates a DataProc cluster, add the job
    Set off the batch
    """

    service_backend = hb.ServiceBackend(
        billing_project=os.getenv('HAIL_BILLING_PROJECT'),
        bucket=os.getenv('HAIL_BUCKET'),
    )

    # create a hail batch
    batch = hb.Batch(name='cohort_mt_extraction', backend=service_backend)

    _my_job = dataproc.hail_dataproc_job(
        batch=batch,
        script=' '.join(sys.argv[1:]),
        max_age='4h',
        job_name='extract_from_cohort_mt',
        num_secondary_workers=4,
        cluster_name='cohort_mt_extraction with max-age=4h',
    )  # noqa: F841

    batch.run(wait=False)
Example #6
def run_concat(backend: Union[hb.ServiceBackend, hb.LocalBackend] = None,
               input_vcf: str = None,
               output_type: str = 'vcf',
               cpu: int = 16,
               memory: str = 'standard',
               out_dir: str = None):

    print(f'\n2. CONCAT {input_vcf}\n')
    vcf_filebase = get_vcf_filebase(input_vcf)
    concat_b = hb.Batch(backend=backend, name=f'concat-imputed-chunks-{vcf_filebase}')

    # used below to naturally sort the per-chromosome chunk files before concatenation
    from gwaspy.utils.natural_sort import natural_keys

    # get the regions so we can map each file to its specific region
    regions = pd.read_csv(f'{out_dir}/GWASpy/{vcf_filebase}/Imputation/imputation.regions', sep='\t', names=['reg', 'ind'])
    regions_dict = pd.Series(regions.reg.values, index=regions.ind).to_dict()

    imputed_vcfs_chunks = hl.utils.hadoop_ls(f'{out_dir}/GWASpy/{vcf_filebase}/Imputation/imputed_chunks/*.bcf')

    for i in range(1, 24):
        if i == 23:
            chrom = 'chrX'
        else:
            chrom = f'chr{i}'

        chrom_phased_files_to_concat = []

        for file in imputed_vcfs_chunks:
            f = file['path']
            vcf_basename = get_vcf_filebase(f)
            file_index = int(vcf_basename.split('.')[-4])
            file_region = regions_dict[file_index]
            map_chrom = file_region.split(':')[0]
            if map_chrom == chrom:
                chrom_phased_files_to_concat.append(f)

        # naturally sort the list of files to merge
        chrom_phased_files_to_concat.sort(key=natural_keys)

        concat_vcfs(b=concat_b, vcfs_to_merge=chrom_phased_files_to_concat, vcf_basename=vcf_filebase,
                    output_type=output_type, chrom=chrom, cpu=cpu, memory=memory, out_dir=out_dir)

    concat_b.run()
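
concat_vcfs is part of GWASpy and is not shown here. A rough, hypothetical sketch of the kind of job it presumably adds, concatenating the naturally sorted chunks with bcftools (the image and output path are assumptions):

def concat_vcfs(b, vcfs_to_merge, vcf_basename, output_type, chrom, cpu, memory, out_dir):
    # Hypothetical sketch, not the GWASpy implementation.
    inputs = [b.read_input(v) for v in vcfs_to_merge]
    out_fmt, out_ext = ('z', 'vcf.gz') if output_type == 'vcf' else ('b', 'bcf')
    j = b.new_job(name=f'concat-{chrom}')
    j.cpu(cpu)
    j.memory(memory)
    j.image(BCFTOOLS_IMAGE)  # assumption: any image with bcftools installed
    j.command(f'bcftools concat --threads {cpu} -O{out_fmt} -o {j.ofile} '
              + ' '.join(str(i) for i in inputs))
    b.write_output(j.ofile, f'{out_dir}/GWASpy/{vcf_basename}/Imputation/{vcf_basename}.{chrom}.{out_ext}')
    return j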
Example #7
def main(script: str, mt: str):
    """
    runs a script inside dataproc to execute VEP
    :param script: str, the path to the VEP main script
    """

    service_backend = hb.ServiceBackend(
        billing_project=os.getenv('HAIL_BILLING_PROJECT'),
        bucket=os.getenv('HAIL_BUCKET'),
    )

    # create a hail batch
    batch = hb.Batch(name='run_vep_in_dataproc_cluster',
                     backend=service_backend)

    job = dataproc.hail_dataproc_job(
        batch=batch,
        worker_machine_type='n1-highmem-8',
        worker_boot_disk_size=200,
        secondary_worker_boot_disk_size=200,
        script=f'{script} --mt {mt}',
        max_age='12h',
        init=[
            'gs://cpg-reference/hail_dataproc/install_common.sh',
            'gs://cpg-reference/vep/vep-GRCh38.sh',
        ],
        job_name='run_vep',
        num_secondary_workers=20,
        num_workers=2,
        cluster_name='run vep',
    )
    job.cpu(2)
    job.memory('standard')
    job.storage('20G')

    batch.run(wait=False)
Example #8
def main():
    
    use_tabix = True
    
    hl.init(log='/Users/nbaya/Downloads/get_chr_pos.log')
    backend = hb.ServiceBackend(billing_project='ukb_diverse_pops',
                                bucket='ukbb-diverse-temp-30day/nb-batch-tmp')
    
    b = hb.Batch(name='get_chr_pos', backend=backend,
                 default_image='gcr.io/ukbb-diversepops-neale/nbaya_tabix:latest',
                 default_storage='2G', default_cpu=1)

    
    paths = get_paths()
    
    for path in paths:
        print(path)
        annotate_chr_pos(b=b,
                         path=path,
                         use_tabix=use_tabix)
    
    b.run(open=True)
    
    backend.close()
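
get_paths and annotate_chr_pos live elsewhere in the original script. By analogy with the tabix job in Example #27 below, annotate_chr_pos presumably adds one job per summary-statistics file; a heavily hedged sketch:

def annotate_chr_pos(b, path, use_tabix=True):
    # Hypothetical sketch only: extract the chromosome and position columns from a
    # bgzipped summary-statistics file, optionally building a tabix index first.
    f = b.read_input(path)
    fname = path.split('/')[-1]
    j = b.new_job(name=fname.split('.')[0])
    if use_tabix:
        j.command(f'tabix -s 1 -b 2 -e 2 -c chr {f}')
    j.command(f'zcat {f} | cut -f 1,2 | bgzip -c > {j.ofile}')
    b.write_output(j.ofile, f'{path}.chr_pos.tsv.bgz')  # output location is an assumption
    return j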
Example #9
"""Copy the CRAM files and their indexes listed in a CSV file
to the gs://cpg-fewgenomes-main bucket.
"""

import os
import csv
import hailtop.batch as hb

# OUTPUT gets propagated from the analysis-runner cli to the server
output_bucket = os.getenv('OUTPUT')
assert output_bucket and output_bucket.startswith('gs://cpg-fewgenomes-main/')

# input CSV contains 3 columns with sample name, file type, full GCS file path
INPUT_FILELIST = './data/filtered65.csv'
ANALYSIS_RUNNER_IMAGE = 'australia-southeast1-docker.pkg.dev/analysis-runner/images/driver:45c3f8125e300cd70bb790e32d96816f003a7af2-hail-0.2.64.devcb1c44c7b529'

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')
)
b = hb.Batch(backend=service_backend, name='copy-crams')

# Create Hail Batch jobs to copy CRAMs and indexes listed in CSV file to output bucket
with open(INPUT_FILELIST, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        j_copy = b.new_job(name=f'copy-{row["sample_name"]}-{row["ftype"]}')
        (j_copy.image(ANALYSIS_RUNNER_IMAGE)
               .command(f'gcloud -q auth activate-service-account --key-file=/gsa-key/key.json')
               .command(f'gsutil cp {row["fname"]} {output_bucket}'))

b.run()
Example #10
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')
)

batch = hb.Batch(name='nfe-pca-no-outliers', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    f'hgdp_1kg_tob_wgs_nfe_pca_no_outliers.py',
    max_age='4h',
    num_secondary_workers=20,
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name=f'nfe-pca-no-outliers',
    worker_boot_disk_size=200,
)

batch.run()
Example #11
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='pca_combined_tob_snp_chip', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    'plot_pca_tob_wgs_snp_chip_datasets.py',
    max_age='1h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name=f'pca_combined_tob_snp_chip',
)

batch.run()
Example #12
import os

import hailtop.batch as hb 

backend = hb.ServiceBackend(
    billing_project='leonhardgruenschloss-trial',
    bucket='leo-tmp-au')

b = hb.Batch(backend=backend, name='outer') 

j = b.new_job(name='launch-inner')
j.image(f'gcr.io/{os.getenv("GCP_PROJECT")}/hail-batch-nested:latest')
j.command('python3 inner.py')

b.run()
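
The launch-inner job runs a second script, inner.py, which is not shown. A minimal sketch of what it might look like, reusing the same billing project and bucket (an assumption):

# inner.py -- hypothetical sketch of the nested batch submitted by the job above
import hailtop.batch as hb

backend = hb.ServiceBackend(
    billing_project='leonhardgruenschloss-trial',
    bucket='leo-tmp-au')

b = hb.Batch(backend=backend, name='inner')
j = b.new_job(name='inner-job')
j.command('echo hello from the nested batch')
b.run()
backend.close()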
Example #13
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name=f'snp_chip_variants_pca', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    'snp_chip_generate_pca.py',
    max_age='12h',
    num_secondary_workers=20,
    packages=['click'],
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name=f'snp_chip_variants_pca',
)

batch.run()
Example #14
def submit(hail_code: Commit,
           benchmark_code: Commit,
           test_names: Set[str],
           n_replicates: int,
           n_iters: int):

    sync_check_shell(benchmark_code.checkout_script())

    sys.path.insert(0, f'{benchmark_code.repo_dir()}/benchmark/python/benchmark_hail')

    importlib.invalidate_caches()  # make the freshly added sys.path entry importable
    from benchmark_hail.run.resources import all_resources  # pylint: disable=import-error, import-outside-toplevel
    from benchmark_hail.run.utils import list_benchmarks  # pylint: disable=import-error, import-outside-toplevel

    output_file = f'gs://hail-benchmarks-2/benchmark/{hail_code.sha}-{benchmark_code.sha}.json'

    b = hb.Batch(name=f'benchmark-{hail_code.sha}',
                 backend=hb.ServiceBackend(billing_project='hail'),
                 default_image=BENCHMARK_IMAGE,
                 default_cpu='2',
                 attributes={'output_file': output_file,
                             'n_replicates': str(n_replicates),
                             'n_iters': str(n_iters),
                             'image': str(BENCHMARK_IMAGE),
                             'hail_code': str(hail_code),
                             'benchmark_code': str(benchmark_code)})

    build_hail = b.new_job('build_hail_wheel')
    build_hail.command(f'''
 set -ex
 { hail_code.checkout_script() }
 cd hail
 time ./gradlew --version
 time make wheel
 time (cd python && zip -r hail.zip hail hailtop)
 (cd build/deploy/dist/ && tar -cvf wheel-container.tar hail-*-py3-none-any.whl)
 cp build/deploy/dist/hail-*-py3-none-any.whl {build_hail.wheel}
''')

    build_benchmark = b.new_job('build_benchmark_wheel')
    build_benchmark.command(f'''
 set -ex
 {benchmark_code.checkout_script()}
 make -C hail python/hail/hail_pip_version
 export HAIL_VERSION=$(cat hail/python/hail/hail_pip_version)
 export HAIL_BENCHMARK_VERSION=$HAIL_VERSION
 cd benchmark/python/ && python3 setup.py -q bdist_wheel
 python3 -m pip -q install dist/benchmark_hail-$HAIL_VERSION-py3-none-any.whl
 cp dist/benchmark_hail-$HAIL_VERSION-py3-none-any.whl {build_benchmark.wheel}
''')
    resource_jobs = {}
    for r in all_resources:
        j = b.new_job(f'create_resource_{r.name()}').cpu(4)
        j.command(f'mv {build_hail.wheel} hail--py3-none-any.whl')
        j.command('pip install hail--py3-none-any.whl')
        j.command(f'mv {build_benchmark.wheel} benchmark_hail-$HAIL_VERSION-py3-none-any.whl')
        j.command('pip install benchmark_hail-$HAIL_VERSION-py3-none-any.whl')
        j.command(f'hail-bench create-resources --data-dir benchmark-resources --group {r.name()}')
        j.command(f"time tar -cf {r.name()}.tar benchmark-resources/{r.name()} --exclude='*.crc'")
        j.command(f'ls -lh {r.name()}.tar')
        j.command(f'mv {r.name()}.tar {j.ofile}')
        resource_jobs[r] = j

    all_benchmarks = list_benchmarks()
    assert len(all_benchmarks) > 0

    all_output = []

    n_passed_filter = 0
    job_fs = []
    for benchmark in all_benchmarks:
        if benchmark.name in test_names:
            n_passed_filter += 1
            for replicate in range(n_replicates):
                job_fs.append((benchmark.name, replicate, benchmark.groups))

    log.info(f'generating {n_passed_filter} * {n_replicates} = {n_passed_filter * n_replicates} individual benchmark jobs')

    random.shuffle(job_fs)
    for name, replicate, groups in job_fs:
        j = b.new_job(name=f'{name}_{replicate}')
        j.command(f'mv {build_hail.wheel} hail--py3-none-any.whl')
        j.command('pip install hail--py3-none-any.whl')
        j.command(f'mv {build_benchmark.wheel} benchmark_hail--py3-none-any.whl')
        j.command('pip install benchmark_hail--py3-none-any.whl')
        j.command('mkdir -p benchmark-resources')
        for resource_group in groups:
            resource_job = resource_jobs[resource_group]
            j.command(f'mv {resource_job.ofile} benchmark-resources/{resource_group.name()}.tar')
            j.command(f'time tar -xf benchmark-resources/{resource_group.name()}.tar')
        j.command(f'MKL_NUM_THREADS=1 '
                  f'OPENBLAS_NUM_THREADS=1 '
                  f'OMP_NUM_THREADS=1 '
                  f'VECLIB_MAXIMUM_THREADS=1 '
                  f'PYSPARK_SUBMIT_ARGS="--driver-memory 6G pyspark-shell" '
                  f'hail-bench run -o {j.ofile} -n {n_iters} --data-dir benchmark-resources -t {name}')
        all_output.append(j.ofile)

    combine_branch_factor = int(os.environ.get('BENCHMARK_BRANCH_FACTOR', 32))
    phase_i = 1
    while len(all_output) > combine_branch_factor:
        new_output = []

        job_i = 1
        i = 0
        while i < len(all_output):
            combine = b.new_job(f'combine_output_phase{phase_i}_job{job_i}')
            combine.command(f'mv {build_hail.wheel} hail--py3-none-any.whl')
            combine.command('pip install hail--py3-none-any.whl')
            combine.command(f'mv {build_benchmark.wheel} benchmark_hail--py3-none-any.whl')
            combine.command('pip install benchmark_hail--py3-none-any.whl')
            combine.command(
                f'hail-bench combine -o {combine.ofile} ' + ' '.join(all_output[i:i + combine_branch_factor]))
            new_output.append(combine.ofile)
            i += combine_branch_factor
            job_i += 1

        phase_i += 1
        all_output = new_output
    combine = b.new_job('final_combine_output')
    combine.command(f'mv {build_hail.wheel} hail--py3-none-any.whl')
    combine.command('pip install hail--py3-none-any.whl')
    combine.command(f'mv {build_benchmark.wheel} benchmark_hail--py3-none-any.whl')
    combine.command('pip install benchmark_hail--py3-none-any.whl')
    combine.command(f'hail-bench combine -o {combine.ofile} ' + ' '.join(all_output))
    combine.command(f'cat {combine.ofile}')

    log.info(f'writing output to {output_file}')

    b.write_output(combine.ofile, output_file)
    b.run()
Example #15
    SHA = sys.argv[3]
    N_REPLICATES = int(sys.argv[4])
    N_ITERS = int(sys.argv[5])

    labeled_sha = SHA
    label = os.environ.get('BENCHMARK_LABEL')
    if label:
        labeled_sha = f'{labeled_sha}-{label}'
    output_file = os.path.join(BUCKET_BASE, f'{labeled_sha}.json')

    b = hb.Batch(name=f'benchmark-{labeled_sha}',
                 backend=hb.ServiceBackend(billing_project='hail'),
                 default_image=BENCHMARK_IMAGE,
                 default_storage='100G',
                 default_memory='7G',
                 default_cpu=2,
                 attributes={
                     'output_file': output_file,
                     'n_replicates': str(N_REPLICATES),
                     'n_iters': str(N_ITERS),
                     'image': str(BENCHMARK_IMAGE)
                 })

    resource_tasks = {}
    for r in all_resources:
        j = b.new_job(f'create_resource_{r.name()}').cpu(4)
        j.command(
            f'hail-bench create-resources --data-dir benchmark-resources --group {r.name()}'
        )
        j.command(
            f"time tar -cf {r.name()}.tar benchmark-resources/{r.name()} --exclude='*.crc'"
        )
Example #16
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='new-variants-plot-pca', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    'plot_pca_and_loadings.py',
    max_age='2h',
    packages=['selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name='new-variants-plot-pca',
)

batch.run()
Example #17
    parser.add_argument('--scatter-count', type=int, default=50)
    parser.add_argument('--out-dir', default='gs://african-seq-data')

    args = parser.parse_args()

    if args.local:
        backend = hb.LocalBackend()
    else:
        backend = hb.ServiceBackend(billing_project=args.billing_project,
                                    bucket=args.bucket)

    ref_fasta_size = bytes_to_gb(args.ref_fasta)
    ref_dict_size = bytes_to_gb(args.ref_dict)
    ref_ind_size = bytes_to_gb(args.ref_index)

    scatter = hb.Batch(backend=backend, name='scatter-interval-list')
    calling_interval_list = scatter.read_input(args.calling_interval_list)
    scatter_intervals = scatter_interval_list(
        b=scatter,
        interval_list_file=calling_interval_list,
        out_dir=args.out_dir,
        scatter_count=args.scatter_count)
    scatter.run()

    interval_files = hl.utils.hadoop_ls(f'{args.out_dir}/scatter-intervals/**')

    var_call = hb.Batch(backend=backend, name='variant-calling')
    fasta = var_call.read_input_group(
        **{
            'fasta': args.ref_fasta,
            'fasta.fai': args.ref_index,
Example #18
        "https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh37/whole_genome_SNVs.tsv.gz",
        "indels_url":
        "https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh37/InDels.tsv.gz",
        "version": "v1.6"
    },
    "GRCh38": {
        "snvs_url":
        "https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz",
        "indels_url":
        "https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/gnomad.genomes.r3.0.indel.tsv.gz",
        "version": "v1.6"
    }
}

backend = hb.ServiceBackend(billing_project="hail-datasets-api")
batch = hb.Batch(backend=backend, name=name)
for build in ["GRCh37", "GRCh38"]:
    snvs_url = builds[build]["snvs_url"]
    indels_url = builds[build]["indels_url"]
    version = builds[build]["version"]

    j = batch.new_job(name=f"{name}_{version}_{build}")
    j.image("gcr.io/broad-ctsa/datasets:050521")
    j.command(
        "gcloud -q auth activate-service-account --key-file=/gsa-key/key.json")
    j.command(
        f"wget -c -O - {snvs_url} {indels_url} | "
        "zcat | "
        "grep -v '^#' | "
        """awk -v FS=$'\t' -v OFS=$'\t' 'BEGIN {print "chromosome","position","ref","alt","raw_score","PHRED_score"} {print $0}' | """
        "bgzip -c | "
Example #19
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='king-nfe', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    f'king_nfe.py',
    max_age='12h',
    num_secondary_workers=20,
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name=f'king-nfe',
    worker_boot_disk_size=200,
)

batch.run()
Example #20
"""Entry point for the analysis runner."""

import os
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

OUTPUT = os.getenv('OUTPUT')
assert OUTPUT

hl.init(default_reference='GRCh38')

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'), bucket=os.getenv('HAIL_BUCKET')
)

batch = hb.Batch(name='densified pca all samples', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    f'hgdp_1kg_tob_wgs_plot_pca_all_samples.py --output={OUTPUT}',
    max_age='1h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'],
    job_name=f'densified pca all samples',
)

batch.run()
Example #21
import os
import sys
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

OUTPUT = os.getenv('OUTPUT')
assert OUTPUT

hl.init(default_reference='GRCh38')

POP = sys.argv[1] if len(sys.argv) > 1 else 'nfe'

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name=f'{POP} project snp-chip', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    f'project_snp_chip_data.py --output={OUTPUT} --pop {POP}',
    max_age='5h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'],
    job_name=f'{POP}-project-snp-chip',
)

batch.run()
Example #22
        # TODO: Fill in the location of your demo image in GCR
        # Fill this in when running LD-clumping on the service
        # This should look something like gcr.io/atgu-training/batch-demo-<user>:latest
        BATCH_DEMO_IMAGE = 'gcr.io/atgu-training/batch-demo-jigold:latest'

        # TODO: Fill in the name of <YOUR_BILLING_PROJECT> and <YOUR_BUCKET>
        # Fill this in when running LD-clumping on the service        
        # The billing project for the workshop is 'atgu-welcome-workshop'.
        # The bucket is the name of the bucket that you configured your service account to have access to. Do not include the gs://
        # In the future, you can use hailctl config to set defaults for these parameters:
        # `hailctl config set batch/billing_project my-billing-project`
        # `hailctl config set batch/bucket my-bucket`
        backend = hb.ServiceBackend(billing_project='atgu-welcome-workshop',
                                    bucket='batch-tmp-jigold')

    batch = hb.Batch(backend=backend,
                     name='clumping-demo')
    # Define inputs
    vcf = batch.read_input(args.vcf)
    # TODO: We want to read the input file for the phenotypes and make it an InputResourceFile
    # look at the vcf file above for an example of creating an InputResourceFile. The phenotypes
    # file is passed as `args.phenotypes`
    phenotypes = batch.read_input(args.phenotypes)

    # QC and compute gwas assoc results
    # TODO: Fill in the argument parameters to the `run_gwas` function
    # This will add a new job to the Batch `batch` that runs a GWAS in Hail
    # and exports the dataset to PLINK format. It also takes as arguments the batch to use, the name
    # of the Docker image to use, a VCF file and a file with the phenotypes.
    gwas = run_gwas(batch, BATCH_DEMO_IMAGE, vcf, phenotypes)

    # Run PLINK clumping once per chromosome
Example #23
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

service_backend = hb.ServiceBackend(
    billing_project=os.getenv("HAIL_BILLING_PROJECT"),
    bucket=os.getenv("HAIL_BUCKET"))

batch = hb.Batch(name="calculate-maf", backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    f"calculate_maf.py",
    max_age="12h",
    num_secondary_workers=20,
    init=["gs://cpg-reference/hail_dataproc/install_common.sh"],
    job_name=f"calculate_maf",
    worker_boot_disk_size=200,
)

batch.run()
Example #24
"""Run gnomad_loadings_90k_liftover.py using the analysis runner."""

import os
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

OUTPUT = os.getenv('OUTPUT')
assert OUTPUT

hl.init(default_reference='GRCh38')

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='gnomad loadings liftover', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    f'gnomad_loadings_90k_liftover.py --output={OUTPUT}',
    max_age='1h',
    packages=['click'],
    job_name='gnomad-loadings-liftover',
)

batch.run()
Example #25
"""Entry point for the analysis runner."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='plot_snp_chip_pca', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    'plot_tob_snp_chip_pca_only.py',
    max_age='1h',
    packages=['selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    job_name=f'plot_snp_chip_pca',
)

batch.run()
Example #26
"""Entry point for the analysis runner."""

import os
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

OUTPUT = os.getenv('OUTPUT')
assert OUTPUT

hl.init(default_reference='GRCh38')

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='densified loadings nfe', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    f'hgdp_1kg_tob_wgs_plot_loadings_nfe.py --output={OUTPUT}',
    max_age='1h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'],
    job_name=f'densified loadings nfe',
)

batch.run()
Example #27
    f = b.read_input(ss_path)
    j = b.new_job(name=fname.split('.')[0])
    # treat the header line (which begins with "chr") as a comment
    j.command(f'tabix -s 1 -b 2 -e 2 -c chr {f}')
    j.command(f'mv {f}.tbi {j.ofile}')
    b.write_output(j.ofile, f'{out_dir}/{fname}.tbi')


if __name__ == "__main__":
    hl.init(log='/Users/nbaya/Downloads/tabix_sumstats.log')
    backend = hb.ServiceBackend(billing_project='ukb_diverse_pops',
                                bucket='ukbb-diverse-temp-30day/nb-batch-tmp')

    b = hb.Batch(
        name='tabix',
        backend=backend,
        default_image='gcr.io/ukbb-diversepops-neale/nbaya_tabix:latest',
        default_storage='100M',  # works with 2G
        default_cpu=1)

    #    sumstats_dir = f'{bucket}/sumstats_flat_files'
    #    sumstats_dir = f'{ldprune_dir}/export_results/update'
    #    sumstats_dir = f'{ldprune_dir}/loo/sumstats/batch1'
    sumstats_dir = f'{ldprune_dir}/variant_qc'
    print(f'\nUsing sumstats from {sumstats_dir}')

    ss_path_list = get_ss_path_list(sumstats_dir=sumstats_dir)

    out_dir = f'{sumstats_dir}_tabix'
    print(f'\nSaving tabix files to {out_dir}\n')

    for ss_path in ss_path_list:
Example #28
"""Run check_genotype.py using the analysis runner."""

import os
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

OUTPUT = os.getenv('OUTPUT')
assert OUTPUT

hl.init(default_reference='GRCh38')

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='check sample genotype', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    f'check_genotype.py --output={OUTPUT}',
    max_age='5h',
    num_secondary_workers=50,
    packages=['click'],
    job_name='check sample genotype',
)

batch.run()
Example #29
#!/usr/bin/env python3
"""Demonstrates the use of the dataproc module."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='dataproc example', backend=service_backend)

cluster = dataproc.setup_dataproc(
    batch,
    max_age='1h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    cluster_name='My Cluster with max-age=1h',
)
cluster.add_job('query.py', job_name='example')

# Don't wait, which avoids resubmissions if this job gets preempted.
batch.run(wait=False)
Example #30
import os
import sys
import hail as hl
import hailtop.batch as hb
from analysis_runner import dataproc

OUTPUT = os.getenv('OUTPUT')
assert OUTPUT

hl.init(default_reference='GRCh38')

POP = sys.argv[1] if len(sys.argv) > 1 else 'nfe'

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name=f'{POP} kccg-reprocessed', backend=service_backend)

dataproc.hail_dataproc_job(
    batch,
    f'project_reprocessed_kccg_samples.py --output={OUTPUT} --pop {POP}',
    max_age='5h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_phantomjs.sh'],
    job_name=f'{POP}-kccg-reprocessed',
)

batch.run()