Example #1
def main(df_x_path, df_y_path, output_path, python_image):
    backend = hb.ServiceBackend()
    b = hb.Batch(name='rf-loo', backend=backend, default_python_image=python_image)

    with hl.hadoop_open(df_y_path) as f:
        local_df_y = pd.read_table(f, header=0, index_col=0)

    df_x_input = b.read_input(df_x_path)
    df_y_input = b.read_input(df_y_path)

    results = []

    for window in local_df_y.index.to_list():
        checkpoint = checkpoint_path(window)
        if hl.hadoop_exists(checkpoint):
            result = b.read_input(checkpoint)
            results.append(result)
            continue

        j = b.new_python_job()

        result = j.call(random_forest, df_x_input, df_y_input, window)
        tsv_result = j.call(as_tsv, result)
        tsv_result = tsv_result.as_str()

        b.write_output(tsv_result, checkpoint)
        results.append(tsv_result)

    output = hb.concatenate(b, results)
    b.write_output(output, output_path)

    b.run(wait=False)
    backend.close()
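This example assumes helpers that are not shown: checkpoint_path, random_forest and as_tsv. A minimal sketch of the two small ones, with a placeholder bucket path (random_forest would be an ordinary Python function taking the two table paths and a window and returning a result tuple):

# Hypothetical helpers for the example above; names and the bucket path are placeholders.
CHECKPOINT_DIR = 'gs://my-bucket/rf-loo/checkpoints'

def checkpoint_path(window):
    """One checkpoint TSV per leave-one-out window."""
    return f'{CHECKPOINT_DIR}/{window}.tsv'

def as_tsv(result):
    """Serialise a result tuple as a single TSV line."""
    return '\t'.join(str(v) for v in result)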
Example #2
@contextlib.contextmanager  # requires `import contextlib`; enables the `with run_batch(args) as batch:` usage documented below
def run_batch(args, batch_name=None):
    """Wrapper around creating, running, and then closing a Batch run.

    :param args: Parsed args from the ArgumentParser created via the init_arg_parser method
    :param batch_name: (optional) batch label which will show up in the Batch web UI

    Usage:
        with run_batch(args) as batch:
            ... batch job definitions ...
    """

    if args.local:
        backend = (hb.LocalBackend() if args.raw else hb.LocalBackend(
            gsa_key_file=args.gsa_key_file))
    else:
        backend = hb.ServiceBackend(billing_project=args.batch_billing_project,
                                    bucket=args.batch_temp_bucket)

    try:
        batch = hb.Batch(backend=backend, name=batch_name)

        batch.batch_utils_temp_bucket = args.batch_temp_bucket

        yield batch  # returned to with ... as batch:

        # runs when the `with` block exits
        batch.run(dry_run=args.dry_run, verbose=args.verbose)

    finally:
        if isinstance(backend, hb.ServiceBackend):
            backend.close()
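A usage sketch for the wrapper above (assuming `args` comes from the ArgumentParser mentioned in the docstring):

with run_batch(args, batch_name='my-analysis') as batch:
    j = batch.new_job(name='say-hello')
    j.image('ubuntu:20.04')
    j.command('echo hello')
# batch.run(...) and backend.close() are handled when the block exits.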
Example #3
def stress():
    b = hb.Batch(name='stress',
                 backend=hb.ServiceBackend(billing_project='hail'),
                 default_image=DOCKER_ROOT_IMAGE)

    for i in range(100):
        j = b.new_job(name=f'parent_{i}')
        d = random.choice(range(4))
        if flip(0.2):
            j.command(f'sleep {d}; exit 1')
        else:
            j.command(f'sleep {d}; echo parent {i}')

        for k in range(10):
            d = random.choice(range(4))
            c = b.new_job(name=f'child_{i}_{k}').command(
                f'sleep {d}; echo child {i} {k}')
            c.depends_on(j)
            if flip(0.1):
                c._always_run = True
            if flip(0.01):
                c._machine_type = 'n1-standard-1'
                if flip(0.5):
                    c._preemptible = False

    b.run(open=False, wait=False)
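The stress test relies on a flip helper and a DOCKER_ROOT_IMAGE constant defined elsewhere in the file; a plausible sketch (the image name is a placeholder):

import random

DOCKER_ROOT_IMAGE = 'ubuntu:20.04'  # placeholder; the real test uses a project-specific image

def flip(p):
    """Return True with probability p."""
    return random.random() < p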
Example #4
def main():
    """
    Create a Hail Batch
    analysis-runner helper creates a DataProc cluster, add the job
    Set off the batch
    """

    service_backend = hb.ServiceBackend(
        billing_project=os.getenv('HAIL_BILLING_PROJECT'),
        bucket=os.getenv('HAIL_BUCKET'),
    )

    # create a hail batch
    batch = hb.Batch(name='cohort_mt_extraction', backend=service_backend)

    _my_job = dataproc.hail_dataproc_job(
        batch=batch,
        script=' '.join(sys.argv[1:]),
        max_age='4h',
        job_name='extract_from_cohort_mt',
        num_secondary_workers=4,
        cluster_name='cohort_mt_extraction with max-age=4h',
    )  # noqa: F841

    batch.run(wait=False)
Example #5
def validate_all_objects_in_directory(gs_dir):
    """Validate files with MD5s in the provided gs directory"""
    backend = hb.ServiceBackend(
        billing_project=os.getenv('HAIL_BILLING_PROJECT'),
        bucket=os.getenv('HAIL_BUCKET'),
    )
    b = hb.Batch('validate_md5s', backend=backend)
    client = storage.Client()

    if not gs_dir.startswith('gs://'):
        raise ValueError(f'Expected GS directory, got: {gs_dir}')

    bucket_name, *components = gs_dir[5:].split('/')

    blobs = client.list_blobs(bucket_name, prefix='/'.join(components))
    files: Set[str] = {f'gs://{bucket_name}/{blob.name}' for blob in blobs}
    for obj in files:
        if obj.endswith('.md5'):
            continue
        if f'{obj}.md5' not in files:
            continue

        job = b.new_job(f'validate_{os.path.basename(obj)}')
        job.image(DRIVER_IMAGE)
        validate_md5(job, obj)

    b.run(wait=False)
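validate_md5 is defined elsewhere; one plausible implementation recomputes the checksum and compares it to the stored .md5 object (the exact commands are assumptions, not taken from the original):

def validate_md5(job, obj):
    """Hypothetical sketch: compare a freshly computed MD5 of `obj` with the stored `{obj}.md5`."""
    job.command('gcloud -q auth activate-service-account --key-file=/gsa-key/key.json')
    job.command(f'gsutil cat {obj} | md5sum | cut -d" " -f1 > computed.md5')
    job.command(f'gsutil cat {obj}.md5 | cut -d" " -f1 > expected.md5')
    job.command('diff computed.md5 expected.md5')
    return job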
Example #6
def main():
    use_tabix = True

    hl.init(log='/Users/nbaya/Downloads/get_chr_pos.log')
    backend = hb.ServiceBackend(billing_project='ukb_diverse_pops',
                                bucket='ukbb-diverse-temp-30day/nb-batch-tmp')

    b = hb.Batch(name='get_chr_pos', backend=backend,
                 default_image='gcr.io/ukbb-diversepops-neale/nbaya_tabix:latest',
                 default_storage='2G', default_cpu=1)

    paths = get_paths()

    for path in paths:
        print(path)
        annotate_chr_pos(b=b,
                         path=path,
                         use_tabix=use_tabix)

    b.run(open=True)

    backend.close()
Example #7
def main(script: str, mt: str):
    """
    runs a script inside dataproc to execute VEP
    :param script: str, the path to the VEP main script
    """

    service_backend = hb.ServiceBackend(
        billing_project=os.getenv('HAIL_BILLING_PROJECT'),
        bucket=os.getenv('HAIL_BUCKET'),
    )

    # create a hail batch
    batch = hb.Batch(name='run_vep_in_dataproc_cluster',
                     backend=service_backend)

    job = dataproc.hail_dataproc_job(
        batch=batch,
        worker_machine_type='n1-highmem-8',
        worker_boot_disk_size=200,
        secondary_worker_boot_disk_size=200,
        script=f'{script} --mt {mt}',
        max_age='12h',
        init=[
            'gs://cpg-reference/hail_dataproc/install_common.sh',
            'gs://cpg-reference/vep/vep-GRCh38.sh',
        ],
        job_name='run_vep',
        num_secondary_workers=20,
        num_workers=2,
        cluster_name='run vep',
    )
    job.cpu(2)
    job.memory('standard')
    job.storage('20G')

    batch.run(wait=False)
Example #8
def merge(batch, results):
    """Merge per-chromosome clumping results into one output (signature reconstructed from the call `merge(batch, results)` below)."""
    merger = batch.new_job(name='merge-results')
    merger.image('ubuntu:18.04')
    if results:
        merger.command(f'''
head -n 1 {results[0]} > {merger.ofile}
for result in {" ".join(results)}
do
    tail -n +2 "$result" >> {merger.ofile}
done
sed -i '/^$/d' {merger.ofile}
''')
    return merger


if __name__ == '__main__':
    backend = hb.ServiceBackend()
    batch = hb.Batch(backend=backend, name='clumping')

    vcf = batch.read_input('gs://hail-tutorial/1kg.vcf.bgz')
    vcf.add_extension('.vcf.bgz')

    phenotypes = batch.read_input('gs://hail-tutorial/1kg_annotations.txt')

    g = gwas(batch, vcf, phenotypes)

    results = []
    for chr in range(1, 23):
        c = clump(batch, g.ofile, g.ofile.assoc, chr)
        results.append(c.clumped)

    m = merge(batch, results)
Example #9
    )
    parser.add_argument(
        '--dbsnp-vcf-ind',
        default='gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx'
    )
    parser.add_argument('--contamination', type=float, default=0.0)
    parser.add_argument('--scatter-count', type=int, default=50)
    parser.add_argument('--out-dir', default='gs://african-seq-data')

    args = parser.parse_args()

    if args.local:
        backend = hb.LocalBackend()
    else:
        backend = hb.ServiceBackend(billing_project=args.billing_project,
                                    bucket=args.bucket)

    ref_fasta_size = bytes_to_gb(args.ref_fasta)
    ref_dict_size = bytes_to_gb(args.ref_dict)
    ref_ind_size = bytes_to_gb(args.ref_index)

    scatter = hb.Batch(backend=backend, name='scatter-interval-list')
    calling_interval_list = scatter.read_input(args.calling_interval_list)
    scatter_intervals = scatter_interval_list(
        b=scatter,
        interval_list_file=calling_interval_list,
        out_dir=args.out_dir,
        scatter_count=args.scatter_count)
    scatter.run()

    interval_files = hl.utils.hadoop_ls(f'{args.out_dir}/scatter-intervals/**')
Example #10
def main():
    backend = hb.ServiceBackend(billing_project='ukb_diverse_pops',
                                bucket='ukbb-diverse-temp-30day/nb-batch-tmp')
    #    backend = batch.LocalBackend(tmp_dir='/tmp/batch/')

    high_quality = True
    not_pop = False
    max_pops = True

    paridx = 0
    parsplit = 1

    p = hb.Batch(name=(f'clump{"-hq" if high_quality else ""}'
                       f'{"-max_pops" if max_pops else ""}'
                       f'-{paridx}-{parsplit}'),
                 backend=backend,
                 default_image='gcr.io/ukbb-diversepops-neale/nbaya_plink:0.1',
                 default_storage='500Mi',
                 default_cpu=8)

    ## download hail script to VM
    hail_script = p.read_input(
        f'{ldprune_dir}/scripts/python/plink_clump_hail.py')

    ## get phenotype manifest
    pheno_manifest = hl.import_table(
        f'{ldprune_dir}/phenotype_manifest.tsv.bgz', impute=True)
    pheno_manifest = pheno_manifest.to_pandas()

    if max_pops:
        pheno_list = get_pheno_list(pheno_manifest=pheno_manifest,
                                    pop=None,
                                    not_pop=False,
                                    max_pops=True)
    else:
        pheno_list = []
        for not_pop in [True, False]:
            for pop in POPS:

                pheno_list.append(
                    get_pheno_list(pheno_manifest=pheno_manifest,
                                   pop=pop,
                                   not_pop=not_pop))

    idx_to_run = range(paridx, len(pheno_list), parsplit)
    pheno_list = [
        pheno_info for idx, pheno_info in enumerate(pheno_list)
        if idx in idx_to_run
    ]
    print(f'Number of phens: {len(pheno_list)}')

    for trait_type, phenocode, pheno_sex, coding, modifier, pops, pop, not_pop, max_pops in pheno_list:
        pheno_key_dict = get_pheno_key_dict(trait_type, phenocode, pheno_sex,
                                            coding, modifier, pops)
        pheno_id = get_pheno_id(trait_type, phenocode, pheno_sex, coding,
                                modifier)

        get_adj_betas(p=p,
                      pop=pop,
                      not_pop=not_pop,
                      max_pops=max_pops,
                      pheno_key_dict=pheno_key_dict,
                      high_quality=high_quality,
                      pheno_id=pheno_id,
                      hail_script=hail_script)

    p.run(open=True)

    #    if type(backend)==batch.ServiceBackend:
    #        print('running')
    #    else:
    #        p.run(verbose=True,
    #              delete_scratch_on_exit=True)

    backend.close()
Example #11
File: submit.py  Project: TileDB-Inc/hail
def submit(hail_code: Commit,
           benchmark_code: Commit,
           test_names: Set[str],
           n_replicates: int,
           n_iters: int):

    sync_check_shell(benchmark_code.checkout_script())

    sys.path.insert(0, f'{benchmark_code.repo_dir()}/benchmark/python/benchmark_hail')

    importlib.invalidate_caches()  # refresh import machinery after modifying sys.path
    from benchmark_hail.run.resources import all_resources  # pylint: disable=import-error, import-outside-toplevel
    from benchmark_hail.run.utils import list_benchmarks  # pylint: disable=import-error, import-outside-toplevel

    output_file = f'gs://hail-benchmarks-2/benchmark/{hail_code.sha}-{benchmark_code.sha}.json'

    b = hb.Batch(name=f'benchmark-{hail_code.sha}',
                 backend=hb.ServiceBackend(billing_project='hail'),
                 default_image=BENCHMARK_IMAGE,
                 default_cpu='2',
                 attributes={'output_file': output_file,
                             'n_replicates': str(n_replicates),
                             'n_iters': str(n_iters),
                             'image': str(BENCHMARK_IMAGE),
                             'hail_code': str(hail_code),
                             'benchmark_code': str(benchmark_code)})

    build_hail = b.new_job('build_hail_wheel')
    build_hail.command(f'''
 set -ex
 { hail_code.checkout_script() }
 cd hail
 time ./gradlew --version
 time make wheel
 time (cd python && zip -r hail.zip hail hailtop)
 (cd build/deploy/dist/ && tar -cvf wheel-container.tar hail-*-py3-none-any.whl)
 cp build/deploy/dist/hail-*-py3-none-any.whl {build_hail.wheel}
''')

    build_benchmark = b.new_job('build_benchmark_wheel')
    build_benchmark.command(f'''
 set -ex
 {benchmark_code.checkout_script()}
 make -C hail python/hail/hail_pip_version
 export HAIL_VERSION=$(cat hail/python/hail/hail_pip_version)
 export HAIL_BENCHMARK_VERSION=$HAIL_VERSION
 cd benchmark/python/ && python3 setup.py -q bdist_wheel
 python3 -m pip -q install dist/benchmark_hail-$HAIL_VERSION-py3-none-any.whl
 cp dist/benchmark_hail-$HAIL_VERSION-py3-none-any.whl {build_benchmark.wheel}
''')
    resource_jobs = {}
    for r in all_resources:
        j = b.new_job(f'create_resource_{r.name()}').cpu(4)
        j.command(f'mv {build_hail.wheel} hail--py3-none-any.whl')
        j.command('pip install hail--py3-none-any.whl')
        j.command(f'mv {build_benchmark.wheel} benchmark_hail-$HAIL_VERSION-py3-none-any.whl')
        j.command('pip install benchmark_hail-$HAIL_VERSION-py3-none-any.whl')
        j.command(f'hail-bench create-resources --data-dir benchmark-resources --group {r.name()}')
        j.command(f"time tar -cf {r.name()}.tar benchmark-resources/{r.name()} --exclude='*.crc'")
        j.command(f'ls -lh {r.name()}.tar')
        j.command(f'mv {r.name()}.tar {j.ofile}')
        resource_jobs[r] = j

    all_benchmarks = list_benchmarks()
    assert len(all_benchmarks) > 0

    all_output = []

    n_passed_filter = 0
    job_fs = []
    for benchmark in all_benchmarks:
        if benchmark.name in test_names:
            n_passed_filter += 1
            for replicate in range(n_replicates):
                job_fs.append((benchmark.name, replicate, benchmark.groups))

    log.info(f'generating {n_passed_filter} * {n_replicates} = {n_passed_filter * n_replicates} individual benchmark jobs')

    random.shuffle(job_fs)
    for name, replicate, groups in job_fs:
        j = b.new_job(name=f'{name}_{replicate}')
        j.command(f'mv {build_hail.wheel} hail--py3-none-any.whl')
        j.command('pip install hail--py3-none-any.whl')
        j.command(f'mv {build_benchmark.wheel} benchmark_hail--py3-none-any.whl')
        j.command('pip install benchmark_hail--py3-none-any.whl')
        j.command('mkdir -p benchmark-resources')
        for resource_group in groups:
            resource_job = resource_jobs[resource_group]
            j.command(f'mv {resource_job.ofile} benchmark-resources/{resource_group.name()}.tar')
            j.command(f'time tar -xf benchmark-resources/{resource_group.name()}.tar')
        j.command(f'MKL_NUM_THREADS=1 '
                  f'OPENBLAS_NUM_THREADS=1 '
                  f'OMP_NUM_THREADS=1 '
                  f'VECLIB_MAXIMUM_THREADS=1 '
                  f'PYSPARK_SUBMIT_ARGS="--driver-memory 6G pyspark-shell" '
                  f'hail-bench run -o {j.ofile} -n {n_iters} --data-dir benchmark-resources -t {name}')
        all_output.append(j.ofile)

    combine_branch_factor = int(os.environ.get('BENCHMARK_BRANCH_FACTOR', 32))
    phase_i = 1
    while len(all_output) > combine_branch_factor:
        new_output = []

        job_i = 1
        i = 0
        while i < len(all_output):
            combine = b.new_job(f'combine_output_phase{phase_i}_job{job_i}')
            combine.command(f'mv {build_hail.wheel} hail--py3-none-any.whl')
            combine.command('pip install hail--py3-none-any.whl')
            combine.command(f'mv {build_benchmark.wheel} benchmark_hail--py3-none-any.whl')
            combine.command('pip install benchmark_hail--py3-none-any.whl')
            combine.command(
                f'hail-bench combine -o {combine.ofile} ' + ' '.join(all_output[i:i + combine_branch_factor]))
            new_output.append(combine.ofile)
            i += combine_branch_factor
            job_i += 1

        phase_i += 1
        all_output = new_output
    combine = b.new_job('final_combine_output')
    combine.command(f'mv {build_hail.wheel} hail--py3-none-any.whl')
    combine.command('pip install hail--py3-none-any.whl')
    combine.command(f'mv {build_benchmark.wheel} benchmark_hail--py3-none-any.whl')
    combine.command('pip install benchmark_hail--py3-none-any.whl')
    combine.command(f'hail-bench combine -o {combine.ofile} ' + ' '.join(all_output))
    combine.command(f'cat {combine.ofile}')

    log.info(f'writing output to {output_file}')

    b.write_output(combine.ofile, output_file)
    b.run()
Example #12
import hailtop.batch as hb
import sys

# backend = hb.ServiceBackend('cpg-ci', 'cpg-ci')
backend = hb.ServiceBackend('vladislavsavelyev-trial', 'playground-au')

b = hb.Batch(backend=backend, name='test_ci')

j = b.new_job(name='hello')
j.image(sys.argv[1])
j.command('''
gcloud -q auth activate-service-account --key-file=/gsa-key/key.json
python install-gcs-connector -k /gsa-key/key.json

BUCKET=gs://playground-au/test_ci
vcf2mt "${BUCKET}/toy.g.vcf.bgz" "${BUCKET}/toy.mt"
gsutil ls "${BUCKET}/toy.mt/_SUCCESS"
''')

b.run(open=True)

Example #13
parser.add_argument(
    'n',
    type=int,
    help='the max number of machines that can be provisioned for this batch')
parser.add_argument(
    'duration',
    type=int,
    help='how long the batch driver will schedule jobs (in sec)')

args = parser.parse_args()
msr = args.msr
n = args.n
duration = args.duration

# Need to set hailctl remote_tmpdir
backend = hb.ServiceBackend('test')
b = hb.Batch(backend=backend, name='load-test')

# the number of quarter-core jobs that can run concurrently on n machines (assuming each machine has 16 cores)
max_concurrent_quarter_core_jobs = 64 * n
# the amount of time for which each job sleeps
sleep_time = math.floor(max_concurrent_quarter_core_jobs / msr)
# the number of jobs is the max scheduling rate times the scheduling duration
n_jobs = msr * duration

for idx in range(n_jobs):
    j = b.new_job(name=f'job_{idx}')
    j.cpu('250m')
    j.command(f'sleep {sleep_time}')

b.run()
Example #14
def main():
    rnaseq_sample_metadata_df = get_joined_metadata_df()

    p = argparse.ArgumentParser()
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument("--local", action="store_true", help="Batch: run locally")
    grp.add_argument("--cluster",
                     action="store_true",
                     help="Batch: submit to cluster")
    p.add_argument(
        "--batch-billing-project",
        default="tgg-rare-disease",
        help="Batch: billing project. Required if submitting to cluster.")
    p.add_argument("--batch-job-name", help="Batch: (optional) job name")

    p.add_argument(
        "-f",
        "--force",
        action="store_true",
        help="Recompute and overwrite cached or previously computed data")
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument("-b",
                     "--rnaseq-batch-name",
                     nargs="*",
                     help="RNA-seq batch names to process",
                     choices=set(
                         rnaseq_sample_metadata_df['star_pipeline_batch']))
    grp.add_argument("-s",
                     "--rnaseq-sample-id",
                     nargs="*",
                     help="RNA-seq sample IDs to process",
                     choices=set(rnaseq_sample_metadata_df['sample_id']))
    args = p.parse_args()

    #logger.info("\n".join(df.columns))

    if args.rnaseq_batch_name:
        batch_names = args.rnaseq_batch_name
        sample_ids = rnaseq_sample_metadata_df[rnaseq_sample_metadata_df[
            'star_pipeline_batch'].isin(batch_names)].sample_id
    elif args.rnaseq_sample_id:
        sample_ids = args.rnaseq_sample_id

    logger.info(
        f"Processing {len(sample_ids)} sample ids: {', '.join(sample_ids)}")

    # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details
    if args.local:
        backend = hb.LocalBackend(gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    else:
        backend = hb.ServiceBackend(args.batch_billing_project)

    b = hb.Batch(backend=backend, name=args.batch_job_name)

    # define workflow inputs
    if args.local:
        genes_gtf = b.read_input("gencode.v26.annotation.gff3",
                                 extension=".gff3")
    else:
        genes_gtf = b.read_input(
            "gs://macarthurlab-rnaseq/ref/gencode.v26.annotation.GRCh38.gff3",
            extension=".gff3")

    # define parallel execution for samples
    for sample_id in sample_ids:
        metadata_row = rnaseq_sample_metadata_df.loc[sample_id]
        batch_name = metadata_row['star_pipeline_batch']

        # set job inputs & outputs
        input_read_data = b.read_input_group(
            bam=metadata_row['star_bam'],
            bai=metadata_row['star_bai'],
        )

        output_dir = f"gs://macarthurlab-rnaseq/{batch_name}/majiq_build/"
        output_file_path = os.path.join(output_dir,
                                        f"majiq_build_{sample_id}.tar.gz")

        # check if output file already exists
        if hl.hadoop_is_file(output_file_path) and not args.force:
            logger.info(
                f"{sample_id} output file already exists: {output_file_path}. Skipping..."
            )
            continue

        file_stats = hl.hadoop_stat(metadata_row['star_bam'])
        bam_size = int(round(file_stats['size_bytes'] / 10.**9))

        # define majiq build commands for this sample
        j = b.new_job(name=args.batch_job_name)
        j.image("weisburd/majiq:latest")
        j.storage(f'{bam_size*3}Gi')
        j.cpu(1)  # default: 1
        j.memory("15G")  # default: 3.75G
        logger.info(
            f'Requesting: {j._storage or "default"} storage, {j._cpu or "default"} CPU, {j._memory or "default"} memory'
        )

        # switch to user account
        j.command(
            f"gcloud auth activate-service-account --key-file /gsa-key/key.json"
        )
        j.command(
            f"gsutil -m cp -r {GCLOUD_CREDENTIALS_LOCATION}/.config /tmp/")
        j.command(f"rm -rf ~/.config")
        j.command(f"mv /tmp/.config ~/")
        j.command(f"gcloud config set account {GCLOUD_USER_ACCOUNT}")
        j.command(f"gcloud config set project {GCLOUD_PROJECT}")

        # run majiq build
        #j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp gs://gtex-resources/GENCODE/gencode.v26.GRCh38.ERCC.genes.collapsed_only.gtf .")
        j.command(f"mv {genes_gtf} gencode.gff3")
        j.command(f"mv {input_read_data.bam} {sample_id}.bam")
        j.command(f"mv {input_read_data.bai} {sample_id}.bam.bai")

        j.command(f"echo '[info]' >> majiq_build.cfg")
        j.command(
            f"echo 'readlen={metadata_row['read length (rnaseqc)']}' >> majiq_build.cfg"
        )
        j.command(f"echo 'bamdirs=.' >> majiq_build.cfg")
        j.command(f"echo 'genome=hg38' >> majiq_build.cfg")
        j.command(
            f"echo 'strandness={'None' if metadata_row['stranded? (rnaseqc)'] == 'no' else 'reverse'}' >> majiq_build.cfg"
        )
        j.command(f"echo '[experiments]' >> majiq_build.cfg")
        j.command(f"echo '{sample_id}={sample_id}' >> majiq_build.cfg")

        j.command(f"cat majiq_build.cfg >> {j.logfile}")
        j.command(
            f"majiq build gencode.gff3 -c majiq_build.cfg -j 1 -o majiq_build_{sample_id} >> {j.logfile}"
        )

        j.command(
            f"tar czf majiq_build_{sample_id}.tar.gz majiq_build_{sample_id}")
        j.command(f"cp majiq_build_{sample_id}.tar.gz {j.output_tar_gz}")

        #j.command(f"ls -lh . >> {j.logfile}")
        #j.command(f"echo ls majiq_build_{sample_id} >> {j.logfile}")
        #j.command(f"ls -1 majiq_build_{sample_id} >> {j.logfile}")
        j.command(f"echo --- done  {output_file_path} >> {j.logfile}")

        # copy output
        b.write_output(j.output_tar_gz, output_file_path)
        b.write_output(
            j.logfile, os.path.join(output_dir,
                                    f"majiq_build_{sample_id}.log"))

    b.run()

    if isinstance(backend, hb.ServiceBackend):
        backend.close()
Example #15
    async def cromwell(request):  # pylint: disable=too-many-locals
        """
        Checks out a repo, and POSTs the designated workflow to the cromwell server.
        Returns a hail batch link, eg: 'batch.hail.populationgenomics.org.au/batches/{batch}'
        ---
        :param output: string
        :param dataset: string
        :param accessLevel: string
        :param repo: string
        :param commit: string
        :param cwd: string (to set the working directory, relative to the repo root)
        :param description: string (Description of the workflow to run)
        :param workflow: string (the relative path of the workflow (from the cwd))
        :param input_json_paths: List[string] (the relative path to an inputs.json (from the cwd). Currently only supports one inputs.json)
        :param dependencies: List[string] (An array of directories (/ files) to zip for the '-p / --tools' input to "search for workflow imports")
        :param wait: boolean (Wait for workflow to complete before returning, could yield long response times)
        """
        email = get_email_from_request(request)
        # When accessing a missing entry in the params dict, the resulting KeyError
        # exception gets translated to a Bad Request error in the try block below.
        params = await request.json()

        dataset = params['dataset']
        access_level = params['accessLevel']
        server_config = get_server_config()
        output_dir = validate_output_dir(params['output'])
        check_dataset_and_group(server_config, dataset, email)
        repo = params['repo']
        check_allowed_repos(server_config, dataset, repo)
        labels = params.get('labels')

        ds_config = server_config[dataset]
        project = ds_config.get('projectId')
        hail_token = ds_config.get(f'{access_level}Token')

        if not hail_token:
            raise web.HTTPBadRequest(
                reason=f"Invalid access level '{access_level}', couldn't find corresponding hail token"
            )

        # use the email specified by the service_account_json again

        hail_bucket = f'cpg-{dataset}-hail'

        commit = params['commit']
        if not commit or commit == 'HEAD':
            raise web.HTTPBadRequest(reason='Invalid commit parameter')

        libs = params.get('dependencies')
        if not isinstance(libs, list):
            raise web.HTTPBadRequest(reason='Expected "dependencies" to be a list')
        cwd = params['cwd']
        wf = params['workflow']
        if not wf:
            raise web.HTTPBadRequest(reason='Invalid script parameter')

        input_jsons = params.get('input_json_paths') or []
        input_dict = params.get('inputs_dict')

        if access_level == 'test':
            workflow_output_dir = f'gs://cpg-{dataset}-test/{output_dir}'
        else:
            workflow_output_dir = f'gs://cpg-{dataset}-main/{output_dir}'

        # This metadata dictionary gets stored at the output_dir location.
        timestamp = datetime.now().astimezone().isoformat()
        metadata = get_analysis_runner_metadata(
            timestamp=timestamp,
            dataset=dataset,
            user=email,
            access_level=access_level,
            repo=repo,
            commit=commit,
            script=wf,
            description=params['description'],
            output_suffix=workflow_output_dir,
            driver_image=DRIVER_IMAGE,
            cwd=cwd,
            mode='cromwell',
        )

        user_name = email.split('@')[0]
        batch_name = f'{user_name} {repo}:{commit}/cromwell/{wf}'
        backend = hb.ServiceBackend(
            billing_project=dataset,
            bucket=hail_bucket,
            token=hail_token,
        )

        batch = hb.Batch(
            backend=backend, name=batch_name, requester_pays_project=project
        )

        job = batch.new_job(name='driver')
        job = prepare_git_job(
            job=job,
            repo_name=repo,
            commit=commit,
            print_all_statements=False,
            is_test=access_level == 'test',
        )

        write_metadata_to_bucket(
            job,
            access_level=access_level,
            dataset=dataset,
            output_suffix=output_dir,
            metadata_str=json.dumps(metadata),
        )
        job.image(DRIVER_IMAGE)

        job.env('DRIVER_IMAGE', DRIVER_IMAGE)
        job.env('DATASET', dataset)
        job.env('ACCESS_LEVEL', access_level)
        job.env('OUTPUT', output_dir)

        run_cromwell_workflow(
            job=job,
            dataset=dataset,
            access_level=access_level,
            workflow=wf,
            cwd=cwd,
            libs=libs,
            labels=labels,
            output_suffix=output_dir,
            input_dict=input_dict,
            input_paths=input_jsons,
            project=project,
        )

        url = run_batch_job_and_print_url(batch, wait=params.get('wait', False))

        # Publish the metadata to Pub/Sub.
        metadata['batch_url'] = url
        publisher.publish(PUBSUB_TOPIC, json.dumps(metadata).encode('utf-8')).result()

        return web.Response(text=f'{url}\n')
Example #16
def haplotype_phasing(input_vcf: str = None,
                      vcf_ref: str = None,
                      local: bool = False,
                      billing_project: str = None,
                      software: str = 'shapeit',
                      reference: str = 'GRCh38',
                      max_win_size_cm: float = 10.0,
                      overlap_size_cm: float = 2.0,
                      scatter_memory: int = 26,
                      cpu: int = 8,
                      threads: int = 7,
                      stages: str = 'scatter,phase,concat',
                      output_type: str = 'bcf',
                      out_dir: str = None):
    # Error handling
    if not out_dir:
        raise SystemExit(
            'Output directory not specified. Specify it using --out_dir if running from the command line or '
            'the out_dir argument if running inside a Python script')

    steps_list = stages.split(',')
    steps = [x.lower() for x in steps_list]
    unknown_steps = [
        i for i in steps if i not in ['scatter', 'phase', 'concat']
    ]

    if len(unknown_steps) > 0:
        raise SystemExit(
            f'Incorrect process(es) {unknown_steps} selected. Options are [scatter, phase, concat]'
        )

    if output_type.lower() not in ['bcf', 'vcf']:
        raise SystemExit(
            f'Incorrect output type {output_type} selected. Options are [bcf, vcf]'
        )

    if local:
        backend = hb.LocalBackend()
    else:
        backend = hb.ServiceBackend(billing_project=billing_project,
                                    remote_tmpdir=f'{out_dir}/tmp/')

    # Scatter VCF/BCF file(s)
    if 'scatter' in steps:
        from gwaspy.phasing.scatter_vcf import run_scatter
        run_scatter(backend=backend,
                    input_vcf=input_vcf,
                    reference=reference,
                    max_win_size_cm=max_win_size_cm,
                    overlap_size_cm=overlap_size_cm,
                    scatter_memory=scatter_memory,
                    out_dir=out_dir)

    # Phase scattered chunks
    if 'phase' in steps:
        from gwaspy.phasing.phase_vcf import run_phase
        run_phase(backend=backend,
                  input_vcf=input_vcf,
                  vcf_ref_path=vcf_ref,
                  software=software,
                  reference=reference,
                  cpu=cpu,
                  threads=threads,
                  out_dir=out_dir)

    # Concatenate phased chunks
    if 'concat' in steps:
        from gwaspy.phasing.concat_vcfs import run_concat
        run_concat(backend=backend,
                   input_vcf=input_vcf,
                   output_type=output_type,
                   reference=reference,
                   software=software,
                   out_dir=out_dir)
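An invocation might look like the following (every path and the billing project are placeholders):

haplotype_phasing(
    input_vcf='gs://my-bucket/genotypes.vcf.bgz',   # placeholder path
    billing_project='my-billing-project',           # placeholder billing project
    software='shapeit',
    reference='GRCh38',
    stages='scatter,phase,concat',
    output_type='bcf',
    out_dir='gs://my-bucket/phasing',               # placeholder output directory
)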
Example #17
async def index(request):
    """Main entry point, responds to the web root."""

    email = get_email_from_request(request)
    # When accessing a missing entry in the params dict, the resulting KeyError
    # exception gets translated to a Bad Request error in the try block below.
    params = await request.json()

    server_config = get_server_config()
    output_suffix = validate_output_dir(params['output'])
    dataset = params['dataset']
    check_dataset_and_group(server_config, dataset, email)
    repo = params['repo']
    check_allowed_repos(server_config, dataset, repo)
    environment_variables = params.get('environmentVariables')

    access_level = params['accessLevel']
    hail_token = server_config[dataset].get(f'{access_level}Token')
    if not hail_token:
        raise web.HTTPBadRequest(reason=f'Invalid access level "{access_level}"')

    hail_bucket = f'cpg-{dataset}-hail'
    backend = hb.ServiceBackend(
        billing_project=dataset,
        bucket=hail_bucket,
        token=hail_token,
    )

    commit = params['commit']
    if not commit or commit == 'HEAD':
        raise web.HTTPBadRequest(reason='Invalid commit parameter')

    cwd = params['cwd']
    script = params['script']
    if not script:
        raise web.HTTPBadRequest(reason='Invalid script parameter')

    if not isinstance(script, list):
        raise web.HTTPBadRequest(reason='Script parameter expects an array')

    # This metadata dictionary gets stored in the metadata bucket, at the output_dir location.
    hail_version = await _get_hail_version()
    timestamp = datetime.datetime.now().astimezone().isoformat()
    metadata = get_analysis_runner_metadata(
        timestamp=timestamp,
        dataset=dataset,
        user=email,
        access_level=access_level,
        repo=repo,
        commit=commit,
        script=' '.join(script),
        description=params['description'],
        output_suffix=output_suffix,
        hailVersion=hail_version,
        driver_image=DRIVER_IMAGE,
        cwd=cwd,
    )

    user_name = email.split('@')[0]
    batch_name = f'{user_name} {repo}:{commit}/{" ".join(script)}'

    dataset_gcp_project = server_config[dataset]['projectId']
    batch = hb.Batch(
        backend=backend, name=batch_name, requester_pays_project=dataset_gcp_project
    )

    job = batch.new_job(name='driver')
    job = prepare_git_job(
        job=job, repo_name=repo, commit=commit, is_test=access_level == 'test'
    )
    write_metadata_to_bucket(
        job,
        access_level=access_level,
        dataset=dataset,
        output_suffix=output_suffix,
        metadata_str=json.dumps(metadata),
    )
    job.image(DRIVER_IMAGE)
    job.env('DRIVER_IMAGE', DRIVER_IMAGE)
    job.env('DATASET', dataset)
    job.env('ACCESS_LEVEL', access_level)
    job.env('HAIL_BUCKET', hail_bucket)
    job.env('HAIL_BILLING_PROJECT', dataset)
    job.env('DATASET_GCP_PROJECT', dataset_gcp_project)
    job.env('OUTPUT', output_suffix)

    if environment_variables:
        if not isinstance(environment_variables, dict):
            raise ValueError('Expected environment_variables to be dictionary')

        invalid_env_vars = [
            f'{k}={v}'
            for k, v in environment_variables.items()
            if not isinstance(v, str)
        ]

        if len(invalid_env_vars) > 0:
            raise ValueError(
                'Some environment_variables values were not strings, got '
                + ', '.join(invalid_env_vars)
            )

        for k, v in environment_variables.items():
            job.env(k, v)

    if cwd:
        job.command(f'cd {quote(cwd)}')

    job.command(f'which {quote(script[0])} || chmod +x {quote(script[0])}')

    # Finally, run the script.
    escaped_script = ' '.join(quote(s) for s in script if s)
    job.command(escaped_script)

    url = run_batch_job_and_print_url(batch, wait=params.get('wait', False))

    # Publish the metadata to Pub/Sub.
    metadata['batch_url'] = url
    publisher.publish(PUBSUB_TOPIC, json.dumps(metadata).encode('utf-8')).result()

    return web.Response(text=f'{url}\n')
Example #18
"""
import os
import hailtop.batch as hb
from analysis_runner.cromwell import (
    run_cromwell_workflow_from_repo_and_get_outputs,
    CromwellOutputType,
)

OUTPUT_SUFFIX = 'mfranklin/analysis-runner-test/out/'
DATASET = os.getenv('DATASET')
BUCKET = os.getenv('HAIL_BUCKET')
OUTPUT_PATH = os.path.join(f'gs://{BUCKET}', OUTPUT_SUFFIX)
BILLING_PROJECT = os.getenv('HAIL_BILLING_PROJECT')
ACCESS_LEVEL = os.getenv('ACCESS_LEVEL')

sb = hb.ServiceBackend(billing_project=BILLING_PROJECT, bucket=BUCKET)
b = hb.Batch(backend=sb, default_image=os.getenv('DRIVER_IMAGE'))

inputs = ['Hello, analysis-runner ;)', 'Hello, second output!']

workflow_outputs = run_cromwell_workflow_from_repo_and_get_outputs(
    b=b,
    job_prefix='hello',
    workflow='hello_all_in_one_file.wdl',
    cwd='examples/cromwell',
    input_dict={'hello.inps': inputs},
    outputs_to_collect={
        'joined_out':
        CromwellOutputType.single('hello.joined_out'),
        'outs':
        CromwellOutputType.array('hello.outs', len(inputs)),
Example #19
import os

import hailtop.batch as hb 

backend = hb.ServiceBackend(
    billing_project='leonhardgruenschloss-trial',
    bucket='leo-tmp-au')

b = hb.Batch(backend=backend, name='outer') 

j = b.new_job(name='launch-inner')
j.image(f'gcr.io/{os.getenv("GCP_PROJECT")}/hail-batch-nested:latest')
j.command('python3 inner.py')

b.run()
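The inner.py script launched by this outer batch is not shown; a minimal sketch of a nested submission, reusing the same billing project and bucket purely for illustration:

# inner.py -- hypothetical nested submission
import hailtop.batch as hb

backend = hb.ServiceBackend(
    billing_project='leonhardgruenschloss-trial',
    bucket='leo-tmp-au')

b = hb.Batch(backend=backend, name='inner')
j = b.new_job(name='hello-from-inner')
j.command('echo hello from the nested batch')
b.run()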
Example #20
        BATCH_DEMO_IMAGE = 'batch-demo:latest'
        backend = hb.LocalBackend()
    else:
        # TODO: Fill in the location of your demo image in GCR
        # Fill this in when running LD-clumping on the service
        # This should look something like gcr.io/atgu-training/batch-demo-<user>:latest
        BATCH_DEMO_IMAGE = 'gcr.io/atgu-training/batch-demo-jigold:latest'

        # TODO: Fill in the name of <YOUR_BILLING_PROJECT> and <YOUR_BUCKET>
        # Fill this in when running LD-clumping on the service        
        # The billing project for the workshop is 'atgu-welcome-workshop'.
        # The bucket is the name of the bucket that you configured your service account to have access to. Do not include the gs://
        # In the future, you can use hailctl config to set defaults for these parameters
        # `hailctl config set batch/billing_project my-billing-project`
        # `hailctl config set batch/bucket my-bucket`
        backend = hb.ServiceBackend(billing_project='atgu-welcome-workshop',
                                    bucket='batch-tmp-jigold')

    batch = hb.Batch(backend=backend,
                     name='clumping-demo')
    # Define inputs
    vcf = batch.read_input(args.vcf)
    # TODO: We want to read the input file for the phenotypes and make it an InputResourceFile
    # look at the vcf file above for an example of creating an InputResourceFile. The phenotypes
    # file is passed as `args.phenotypes`
    phenotypes = batch.read_input(args.phenotypes)

    # QC and compute gwas assoc results
    # TODO: Fill in the argument parameters to the `run_gwas` function
    # This will add a new job to the Batch `batch` that runs a GWAS in Hail
    # and exports the dataset to PLINK format. It also takes as arguments the batch to use, the name
    # of the Docker image to use, a VCF file and a file with the phenotypes.
Example #21
            f'usage: <script.py> DOCKER_IMAGE_URL BUCKET_BASE SHA N_REPLICATES N_ITERS'
        )
    BENCHMARK_IMAGE = sys.argv[1]
    BUCKET_BASE = sys.argv[2]
    SHA = sys.argv[3]
    N_REPLICATES = int(sys.argv[4])
    N_ITERS = int(sys.argv[5])

    labeled_sha = SHA
    label = os.environ.get('BENCHMARK_LABEL')
    if label:
        labeled_sha = f'{labeled_sha}-{label}'
    output_file = os.path.join(BUCKET_BASE, f'{labeled_sha}.json')

    b = hb.Batch(name=f'benchmark-{labeled_sha}',
                 backend=hb.ServiceBackend(billing_project='hail'),
                 default_image=BENCHMARK_IMAGE,
                 default_storage='100G',
                 default_memory='7G',
                 default_cpu=2,
                 attributes={
                     'output_file': output_file,
                     'n_replicates': str(N_REPLICATES),
                     'n_iters': str(N_ITERS),
                     'image': str(BENCHMARK_IMAGE)
                 })

    resource_tasks = {}
    for r in all_resources:
        j = b.new_job(f'create_resource_{r.name()}').cpu(4)
        j.command(
Example #22
def batch_split_by_chrom(args):
    r'''
    Splits bfiles by chromosome, for later use by plink_clump.py
    About $0.06 per population set
    '''

    hl.init(default_reference='GRCh38',
            spark_conf={
                'spark.hadoop.fs.gs.requester.pays.mode':
                'AUTO',
                'spark.hadoop.fs.gs.requester.pays.project.id':
                'ukbb-diversepops-neale'
            })

    pops_list = get_pops_list(args)

    n_max = 5000  # maximum number of samples in subset (equal to final sample size if there are sufficient samples for each population)
    subsets_dir = f'{bucket}/ld_prune/subsets_{round(n_max/1e3)}k'

    backend = hb.ServiceBackend(billing_project='ukb_diverse_pops',
                                bucket='ukbb-diverse-temp-30day/nb-batch-tmp')
    #    backend = batch.LocalBackend(tmp_dir='/tmp/batch/')

    b = hb.batch.Batch(
        name='split_by_chrom',
        backend=backend,
        default_image='gcr.io/ukbb-diversepops-neale/nbaya_plink:0.1',
        default_storage='30G',
        default_cpu=8)

    for pops in pops_list:
        pops_str = '-'.join(pops)
        bfile_prefix = f'{subsets_dir}/{pops_str}/{pops_str}'
        master_bfile_paths = [
            f'{bfile_prefix}.{suffix}' for suffix in ['bed', 'bim', 'fam']
        ]
        master_fam_path = f'{bfile_prefix}.fam'
        bfile_chr_paths = [
            f'{get_bfile_chr_path(bfile_prefix, chrom)}.{suffix}'
            for chrom in chroms for suffix in ['bed', 'bim']
        ]
        if not args.overwrite_plink and all(
                list(
                    map(hl.hadoop_is_file,
                        [master_fam_path] + bfile_chr_paths))):
            print(f'\nAll per-chrom PLINK files created for {pops_str}')
        else:
            if not all(map(hl.hadoop_is_file, master_bfile_paths)):
                print(
                    f'\nWARNING: Insufficient files for {pops_str} to split into per-chrom bed/bim files, skipping\n'
                )
                continue
            else:
                print(
                    f'\n... Running bfile per-chrom split for {pops_str} ...')
                prefix = f'{subsets_dir}/{pops_str}/{pops_str}'
                bfile = b.read_input_group(
                    **{
                        suffix: f'{prefix}.{suffix}'
                        for suffix in ['bed', 'bim', 'fam']
                    })
                split = b.new_job(name=f'split_by_chrom_{pops_str}')
                for chrom in chroms:
                    split.declare_resource_group(
                        **{
                            f'ofile_{chrom}': {
                                'bed': '{root}.bed',
                                'bim': '{root}.bim'
                            }
                        })  # exclude fam file to avoid redundancy
                    split.command(f'''
                        plink \\
                        --bfile {bfile} \\
                        --chr {chrom} \\
                        --output-chr M \\
                        --make-bed \\
                        --out {split[f"ofile_{chrom}"]}
                        ''')
                    # print(f"saving to {get_bfile_chr_path(bfile_prefix, chrom)}")
                    b.write_output(split[f'ofile_{chrom}'],
                                   get_bfile_chr_path(bfile_prefix, chrom))

    b.run(open=True)
    backend.close()
Example #23
File: imputation.py  Project: atgu/GWASpy
def genotype_imputation(input_vcf: str = None,
                        females_file: str = None,
                        n_samples: int = None,
                        n_panel_samples: int = 4099,
                        buffer_region: int = 250,
                        phasing_software: str = None,
                        local: bool = False,
                        billing_project: str = None,
                        memory: str = 'highmem',
                        cpu: int = 16,
                        stages: str = 'impute,concat',
                        output_type: str = 'bcf',
                        out_dir: str = None):
    # Error handling
    if not out_dir:
        raise SystemExit(
            'Output directory not specified. Specify it using --out_dir if running from the command line or '
            'the out_dir argument if running inside a Python script')

    steps_list = stages.split(',')
    steps = [x.lower() for x in steps_list]
    unknown_steps = [i for i in steps if i not in ['impute', 'concat']]

    if len(unknown_steps) > 0:
        raise SystemExit(
            f'Incorrect process(es) {unknown_steps} selected. Options are [impute, concat]'
        )

    if output_type.lower() not in ['bcf', 'vcf']:
        raise SystemExit(
            f'Incorrect output type {output_type} selected. Options are [bcf, vcf]'
        )

    if memory.lower() not in ['lowmem', 'standard', 'highmem']:
        raise SystemExit(
            f'Incorrect memory type {memory} selected. Options are [lowmem, standard, highmem]'
        )

    if not n_samples:
        raise SystemExit(
            'Number of samples in the input data not specified. Specify how many samples (an integer) are in the '
            'input data, using --n-samples if running from the command line or '
            'the n_samples argument if running inside a Python script'
        )

    if local:
        backend = hb.LocalBackend()
    else:
        backend = hb.ServiceBackend(billing_project=billing_project,
                                    remote_tmpdir=f'{out_dir}/tmp/')

    # impute genotypes
    if 'impute' in steps:
        from gwaspy.imputation.sex_aut_imp import run_impute
        run_impute(backend=backend,
                   input_vcf=input_vcf,
                   females_file=females_file,
                   n_samples=n_samples,
                   n_panel_samples=n_panel_samples,
                   phasing_software=phasing_software,
                   memory=memory,
                   buffer_region=buffer_region,
                   out_dir=out_dir)

    # Concatenate imputed chunks
    if 'concat' in steps:
        from gwaspy.imputation.concat_vcfs import run_concat
        run_concat(backend=backend,
                   input_vcf=input_vcf,
                   output_type=output_type,
                   cpu=cpu,
                   memory=memory,
                   out_dir=out_dir)
Example #24
def tabix(b, ss_path, out_dir):
    r'''
    Tabix-indexes the bgzipped file at GCS path `ss_path` using Batch `b`.
    '''
    fname = ss_path.split('/')[-1]
    f = b.read_input(ss_path)
    j = b.new_job(name=fname.split('.')[0])
    # treat the header (which begins with "chr") as a comment
    j.command(f'tabix -s 1 -b 2 -e 2 -c chr {f}')
    j.command(f'mv {f}.tbi {j.ofile}')
    b.write_output(j.ofile, f'{out_dir}/{fname}.tbi')


if __name__ == "__main__":
    hl.init(log='/Users/nbaya/Downloads/tabix_sumstats.log')
    backend = hb.ServiceBackend(billing_project='ukb_diverse_pops',
                                bucket='ukbb-diverse-temp-30day/nb-batch-tmp')

    b = hb.Batch(
        name='tabix',
        backend=backend,
        default_image='gcr.io/ukbb-diversepops-neale/nbaya_tabix:latest',
        default_storage='100M',  # works with 2G
        default_cpu=1)

    #    sumstats_dir = f'{bucket}/sumstats_flat_files'
    #    sumstats_dir = f'{ldprune_dir}/export_results/update'
    #    sumstats_dir = f'{ldprune_dir}/loo/sumstats/batch1'
    sumstats_dir = f'{ldprune_dir}/variant_qc'
    print(f'\nUsing sumstats from {sumstats_dir}')

    ss_path_list = get_ss_path_list(sumstats_dir=sumstats_dir)
Example #25
        "snvs_url":
        "https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh37/whole_genome_SNVs.tsv.gz",
        "indels_url":
        "https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh37/InDels.tsv.gz",
        "version": "v1.6"
    },
    "GRCh38": {
        "snvs_url":
        "https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz",
        "indels_url":
        "https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/gnomad.genomes.r3.0.indel.tsv.gz",
        "version": "v1.6"
    }
}

backend = hb.ServiceBackend(billing_project="hail-datasets-api")
batch = hb.Batch(backend=backend, name=name)
for build in ["GRCh37", "GRCh38"]:
    snvs_url = builds[build]["snvs_url"]
    indels_url = builds[build]["indels_url"]
    version = builds[build]["version"]

    j = batch.new_job(name=f"{name}_{version}_{build}")
    j.image("gcr.io/broad-ctsa/datasets:050521")
    j.command(
        "gcloud -q auth activate-service-account --key-file=/gsa-key/key.json")
    j.command(
        f"wget -c -O - {snvs_url} {indels_url} | "
        "zcat | "
        "grep -v '^#' | "
        """awk -v FS=$'\t' -v OFS=$'\t' 'BEGIN {print "chromosome","position","ref","alt","raw_score","PHRED_score"} {print $0}' | """
Example #26
#!/usr/bin/env python3
"""Demonstrates the use of the dataproc module."""

import os
import hailtop.batch as hb
from analysis_runner import dataproc

service_backend = hb.ServiceBackend(
    billing_project=os.getenv('HAIL_BILLING_PROJECT'),
    bucket=os.getenv('HAIL_BUCKET'))

batch = hb.Batch(name='dataproc example', backend=service_backend)

cluster = dataproc.setup_dataproc(
    batch,
    max_age='1h',
    packages=['click', 'selenium'],
    init=['gs://cpg-reference/hail_dataproc/install_common.sh'],
    cluster_name='My Cluster with max-age=1h',
)
cluster.add_job('query.py', job_name='example')

# Don't wait, which avoids resubmissions if this job gets preempted.
batch.run(wait=False)
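query.py is not shown; a minimal sketch of a Hail Query script that could run on the cluster (the table path is a placeholder):

# query.py -- hypothetical Hail Query script submitted to the Dataproc cluster
import hail as hl

hl.init(default_reference='GRCh38')
mt = hl.read_matrix_table('gs://my-bucket/my-dataset.mt')  # placeholder path
print(mt.count())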
Example #27
        BATCH_DEMO_IMAGE = 'batch-demo:latest'
        backend = hb.LocalBackend()
    else:
        # TODO: Fill in the location of your demo image in GCR
        # Fill this in when running LD-clumping on the service
        # This should look something like gcr.io/atgu-training/batch-demo-<user>:latest
        BATCH_DEMO_IMAGE = ...

        # TODO: Fill in the name of <YOUR_BILLING_PROJECT> and <YOUR_BUCKET>
        # Fill this in when running LD-clumping on the service        
        # The billing project for the workshop is 'atgu-welcome-workshop'.
        # The bucket is the name of the bucket that you configured your service account to have access to. Do not include the gs://
        # In the future, you can use hailctl config to set defaults for these parameters
        # `hailctl config set batch/billing_project my-billing-project`
        # `hailctl config set batch/bucket my-bucket`
        backend = hb.ServiceBackend(billing_project=...,
                                    bucket=...)

    batch = hb.Batch(backend=backend,
                     name='clumping-demo')
    # Define inputs
    vcf = batch.read_input(args.vcf)
    # TODO: We want to read the input file for the phenotypes and make it an InputResourceFile
    # look at the vcf file above for an example of creating an InputResourceFile. The phenotypes
    # file is passed as `args.phenotypes`
    phenotypes = ...

    # QC and compute gwas assoc results
    # TODO: Fill in the argument parameters to the `run_gwas` function
    # This will add a new job to the Batch `batch` that runs a GWAS in Hail
    # and exports the dataset to PLINK format. It also takes as arguments the batch to use, the name
    # of the Docker image to use, a VCF file and a file with the phenotypes.