Example #1
File: helpers.py  Project: tpoterba/hail
def startTestHailContext():
    global _initialized
    if not _initialized:
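        # If HAIL_TEST_SERVICE_BACKEND_URL is set, run the tests against the service backend; otherwise use local Spark.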
        url = os.environ.get('HAIL_TEST_SERVICE_BACKEND_URL')
        if url:
            hl.init(master='local[2]', min_block_size=0, quiet=True, _backend=hl.backend.ServiceBackend(url))
        else:
            hl.init(master='local[2]', min_block_size=0, quiet=True)
        _initialized = True
Example #2
File: utils.py  Project: jigold/hail
def initialize(cores, log, n_iter):
    assert not _initialized
    hl.init(master=f'local[{cores}]', quiet=True, log=log)

    global _n_iter
    _n_iter = n_iter

    download_data()

    # make JVM do something to ensure that it is fresh
    hl.utils.range_table(1)._force_count()
Example #3
def main():
    parser = argparse.ArgumentParser(description="Driver for hail's gVCF combiner")
    parser.add_argument('--sample-map', help='path to the sample map (must be filesystem local)',
                        required=True)
    parser.add_argument('--sample-file', help='path to a file containing a line-separated list '
                                              'of samples to combine (must be filesystem local)')
    parser.add_argument('--tmp-path', help='path to folder for temp output (can be a cloud bucket)',
                        default='/tmp')
    parser.add_argument('--out-file', '-o', help='path to final combiner output', required=True)
    parser.add_argument('--summarize', help='if defined, run summarize, placing the rows table '
                                            'of the output at the argument value')
    parser.add_argument('--json', help='json to use for the import of the gVCFs '
                                       '(must be filesystem local)', required=True)
    args = parser.parse_args()
    samples = build_sample_list(args.sample_map, args.sample_file)
    with open(args.json) as j:
        json = j.read()
    hl.init(default_reference=DEFAULT_REF,
            log='/hail-joint-caller-' + time.strftime('%Y%m%d-%H%M') + '.log')
    run_combiner(samples, json, args.out_file, args.tmp_path, summary_path=args.summarize,
                 overwrite=True)
Example #4
def main():
    hl.init()
    data = hl.import_vcf(
        os.path.join(PROJECT_DIR, 'data/chr22_1000_missing.vcf'))
    labels = hl.import_table(os.path.join(PROJECT_DIR,
                                          'data/chr22-labels.csv'),
                             delimiter=',',
                             types={
                                 '22_16050408': 'float64'
                             }).key_by('sample')

    mt = data.annotate_cols(pheno=labels[data.s])
    y = mt.pheno['22_16050408']
    x = mt.GT.n_alt_alleles()
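    # matrix_table_source and check_entry_indexed are internal Hail helpers: they recover the
    # MatrixTable behind the entry expression and verify that it is entry-indexed.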
    mt = matrix_table_source('random_forest_model/x', x)
    check_entry_indexed('random_forest_model/x', x)
    mts = mt._select_all(col_exprs=dict(y=y),
                         row_exprs=dict(),
                         col_key=[],
                         entry_exprs=dict(e=x))

    mts.write(os.path.join(
        PROJECT_DIR, 'src/test/data/hail/chr22_1000_missing-22_16050408.vds'),
              overwrite=True)
Example #5
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    gnomad_loadings_path = f'{output}/gnomad_loadings_90k_liftover.ht'

    # liftover and get variants
    ht_gnomad_loadings = hl.read_table(GNOMAD_V2_LOADINGS)
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)
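    # Lift each GRCh37 locus over to GRCh38 (keeping the original locus) and re-key the table by the new locus.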
    ht_gnomad_loadings_liftover = ht_gnomad_loadings.annotate(
        liftover=hl.liftover(ht_gnomad_loadings.locus,
                             'GRCh38',
                             include_strand=False),
        old_locus=ht_gnomad_loadings.locus,
    )
    ht_gnomad_loadings_liftover = ht_gnomad_loadings_liftover.key_by(
        locus=ht_gnomad_loadings_liftover.liftover)

    # save gnomad loadings
    ht_gnomad_loadings_liftover.write(gnomad_loadings_path, overwrite=True)
Example #6
def main(args):
    hl.init(
        log=f"/variant_filter.log",
        tmp_dir="gs://ccdg-30day-temp/",
        default_reference="GRCh38",
    )
    # TODO: This flag can be removed if this error is no longer relevant: log4j:ERROR Failed to flush writer,
    #  java.io.IOException: No space left on device when trying to write a densified MT from VDS
    hl._set_flags(distributed_scan_comb_op="1")

    if args.update_ccdg_exome_interval_table:
        ccdg_interval_qc_ht(args.pct_samples_defined, overwrite=True)

    determine_pca_variants(
        autosomes_only=not args.not_autosomes_only,
        bi_allelic_only=not args.not_bi_allelic_only,
        adj_only=not args.not_adj_only,
        snv_only=not args.not_snv_only,
        min_gnomad_v3_ac=args.gnomad_v3_ac_filter,
        high_qual_ccdg_exome_interval_only=not args.not_high_qual_ccdg_interval_only,
        high_qual_ukbb_exome_interval_only=not args.not_high_qual_ukbb_interval_only,
        filter_lcr=not args.not_filter_lcr,
        filter_segdup=not args.not_filter_segdup,
        min_joint_af=args.min_af,
        min_joint_callrate=args.min_callrate,
        min_ccdg_exome_callrate=args.ccdg_exome_callrate_cutoff,
        min_ukbb_exome_callrate=args.ukbb_exome_callrate_cutoff,
        ld_pruning=not args.not_ld_pruning,
        ld_pruning_dataset=args.ld_pruning_dataset,
        ld_r2=args.ld_r2,
        read_per_dataset_checkpoint_if_exists=args.read_per_dataset_checkpoint_if_exists,
        read_pre_ld_prune_ht_checkpoint_if_exists=args.read_pre_ld_prune_ht_checkpoint_if_exists,
        read_pre_ld_prune_mt_checkpoint_if_exists=args.read_pre_ld_prune_mt_checkpoint_if_exists,
        overwrite=args.overwrite,
        filter_washu=args.filter_washu,
    )
Example #7
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    loadings_ht = hl.read_table(LOADINGS)
    number_of_pcs = hl.len(loadings_ht.loadings).take(1)[0]
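    # One Manhattan-style plot of loading magnitudes per principal component.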
    for i in range(0, (number_of_pcs)):
        pc = i + 1
        p = manhattan_loadings(
            pvals=hl.abs(loadings_ht.loadings[i]),
            locus=loadings_ht.locus,
            title='Loadings of PC ' + str(pc),
            collect_all=True,
        )
        plot_filename = f'{output}/loadings_manhattan_plot_pc' + str(
            pc) + '.png'
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(p).save(f, format='PNG')
        plot_filename_html = 'loadings_pc' + str(pc) + '.html'
        output_file(plot_filename_html)
        save(p)
        subprocess.run(['gsutil', 'cp', plot_filename_html, output],
                       check=False)
Example #8
def main():
    
    use_tabix = True
    
    hl.init(log='/Users/nbaya/Downloads/get_chr_pos.log')
    backend = hb.ServiceBackend(billing_project='ukb_diverse_pops',
                                bucket='ukbb-diverse-temp-30day/nb-batch-tmp')
    
    b = hb.Batch(name='get_chr_pos', backend=backend,
                 default_image='gcr.io/ukbb-diversepops-neale/nbaya_tabix:latest',
                 default_storage='2G', default_cpu=1)

    
    paths = get_paths()
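    # One Batch job per sumstats file; annotate_chr_pos is assumed to add chromosome/position columns (via tabix when use_tabix is True).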
    
    for path in paths:
        print(path)
        annotate_chr_pos(b=b,
                         path=path,
                         use_tabix=use_tabix)
    
    b.run(open=True)
    
    backend.close()
Example #9
import hail as hl
import logging
import os
import pandas as pd
import re
import subprocess

logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

hl.init(log="/dev/null")

#%%
ht = hl.read_table(
    "gs://gnomad/metadata/genomes_v3.1/gnomad_v3.1_sample_qc_metadata.ht")
ht = ht.filter(ht.release)
release_samples = ht.s.collect()

#%%

sample_ids_gnomad_v3 = hl.hadoop_open(
    "gs://gnomad-bw2/sample_ids_gnomad_v3__20210131.txt").read().split("\n")
release_sample_ids_gnomad_v3 = list(
    set(sample_ids_gnomad_v3) & set(release_samples))  # 39285 samples

sample_ids_gnomad_v3_1 = hl.hadoop_open(
    "gs://gnomad-bw2/sample_ids_gnomad_v3_1__20210131.txt").read().split("\n")
release_sample_ids_gnomad_v3_1 = list(
    set(sample_ids_gnomad_v3_1) & set(release_samples))  # 3526 samples
Example #10
File: apiserver.py  Project: jigold/hail
import functools as ft
import json
import os
import uvloop
from aiohttp import web

import jwt
import hail as hl
from hail.utils import FatalError
from hail.utils.java import Env, info, scala_object
import hailjwt as hj

uvloop.install()

master = os.environ.get('HAIL_APISERVER_SPARK_MASTER')
hl.init(master=master, min_block_size=0)

app = web.Application()
routes = web.RouteTableDef()


with open(os.environ.get('HAIL_JWT_SECRET_KEY_FILE') or '/jwt-secret/secret-key') as f:
    jwtclient = hj.JWTClient(f.read())


def authenticated_users_only(fun):
    @ft.wraps(fun)
    def wrapped(request, *args, **kwargs):
        encoded_token = request.cookies.get('user')
        if encoded_token is not None:
            try:
Example #11
def main(args):
    # Init Hail
    hl.init(default_reference=args.default_ref_genome)

    # Import VEPed VCF file as MatrixTable and get VCF file meta-data
    # vcf_path = args.vcf_vep_path
    mt = hl.import_vcf(path=get_vep_vqsr_vcf_path(), force_bgz=args.force_bgz)

    # getting annotated VEP fields names from VCF-header
    vep_fields = get_vep_fields(vcf_path=get_vep_vqsr_vcf_path(),
                                vep_csq_field=args.csq_field)

    if args.split_multi_allelic:
        # split multi-allelic variants
        mt = hl.split_multi_hts(mt)

        # split/annotate fields in the info field (use allele index )
        mt = mt.annotate_rows(info=mt.info.annotate(
            **{field: mt.info[field][mt.a_index - 1]
               for field in INFO_FIELDS}))

    # parse/annotate the CSQ field in a different structure
    tb_csq = mt.rows()
    tb_csq = (tb_csq.annotate(csq_raw=tb_csq.info[args.csq_field]))

    # Convert/annotate all transcripts per variants with a structure of type array<dict<str, str>>.
    # The transcript(s) are represented as a dict<k,v>, where keys are the field names extracted from the VCF header and
    # the values are the current annotated values in the CSQ field.
    tb_csq = (tb_csq.annotate(csq_raw=tb_csq.csq_raw.map(
        lambda x: hl.dict(hl.zip(vep_fields, x.split('[|]'))))))

    # Keep transcript(s) matching the allele index (only used if variants were split with split_multi_hts)
    # It requires having the flag "ALLELE_NUM" annotated by VEP
    # Apply only where the alleles were split.
    # TODO: Handle exception when the flag "ALLELE_NUM" is not present
    if all(
        [x in list(tb_csq._fields.keys()) for x in ['was_split', 'a_index']]):
        tb_csq = (tb_csq.annotate(csq_raw=hl.cond(
            tb_csq.was_split,
            tb_csq.csq_raw.filter(lambda x: (hl.int(x["ALLELE_NUM"]) == tb_csq.
                                             a_index)), tb_csq.csq_raw)))

    # select and annotate one transcript per variant based on pre-defined rules
    tb_csq = pick_transcript(
        ht=tb_csq,
        csq_array='csq_raw',
    )

    # Expand selected transcript (dict) annotations adding independent fields.
    tb_csq = annotate_from_dict(ht=tb_csq, dict_field='tx', output_filed='vep')

    # Parse the "Consequence" field. Keep only the more severe consequence.
    # Avoid the notation "consequence_1&consequence_2"
    tb_csq = (tb_csq.annotate(vep=tb_csq.vep.annotate(
        Consequence=tb_csq.vep.Consequence.split('&')[0])))

    # Parse the protein DOMAIN field
    if 'DOMAINS' in vep_fields:
        tb_csq = (tb_csq.annotate(vep=tb_csq.vep.annotate(
            DOMAINS=vep_protein_domain_ann_expr(tb_csq.vep['DOMAINS']))))

    # drop redundant/temp fields
    tb_csq = (tb_csq.drop('csq_raw', 'tx').repartition(500))

    # print fields overview
    tb_csq.describe()

    # write table as HailTable to disk
    # (tb_csq
    # .write(output=args.tb_output_path,
    #        overwrite=args.overwrite)
    # )

    output_path = get_variant_qc_ht_path(part='vep_vqsr',
                                         split=args.split_multi_allelic)
    tb_csq = (tb_csq.checkpoint(output=output_path, overwrite=args.overwrite))

    if args.write_to_file:
        # write table to disk as a BGZ-compressed TSV file
        (tb_csq.export(f'{output_path}.tsv.bgz'))

    # Stop Hail
    hl.stop()
Example #12
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    scores = hl.read_table(SCORES)
    scores = scores.annotate(cohort_sample_codes=hl.if_else(
        scores.s.contains('snp_chip'), 'snp_chip', 'tob_wgs'))
    labels = scores.cohort_sample_codes
    hover_fields = dict([('s', scores.s)])

    # get percent variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
        p = hl.plot.scatter(
            scores.scores[pc1],
            scores.scores[pc2],
            label=labels,
            title='TOB-WGS + TOB SNP Chip',
            xlabel='PC' + str(pc1 + 1) + ' (' + str(variance[pc1]) + '%)',
            ylabel='PC' + str(pc2 + 1) + ' (' + str(variance[pc2]) + '%)',
            hover_fields=hover_fields,
        )
        plot_filename = output_path('pc' + str(pc2) + '.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(p).save(f, format='PNG')
        html = file_html(p, CDN, 'my plot')
        plot_filename_html = output_path(f'pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # Get partner sample information
    sample_names = scores.s.collect()

    def sample_type(sample_name):
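        # Classify each sample as SNP-chip or WGS and prefix 'dual_' when its partner sample is also in the cohort.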
        if sample_name.endswith('snp_chip'):
            partner_name = re.sub('_snp_chip', '', sample_name)
            tech = 'snp'
        else:
            partner_name = sample_name + '_snp_chip'
            tech = 'wgs'

        if partner_name in sample_names:
            prefix = 'dual_'
        else:
            prefix = ''

        return prefix + tech

    # save as html
    labels = list(map(sample_type, sample_names))
    html = pd.DataFrame({
        'sample_name': sample_names,
        'sample_tech': labels
    }).to_html()
    plot_filename_html = output_path(f'sample_technology.html', 'web')
    with hl.hadoop_open(plot_filename_html, 'w') as f:
        f.write(html)

    # plot
    cohort_sample_codes = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Reprocessed Sample Projection',
            x_axis_label='PC' + str(pc1 + 1) + ' (' + str(variance[pc1]) +
            '%)',
            y_axis_label='PC' + str(pc2 + 1) + ' (' + str(variance[pc2]) +
            '%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=8,
            color=factor_cmap('label', Dark2[len(cohort_sample_codes)],
                              cohort_sample_codes),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path('technology_type_pc' + str(pc2) + '.png',
                                    'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'technology_type_pc{pc2}.html',
                                         'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
Example #13
#!/usr/bin/env python3

import hail as hl
hl.init(tmp_dir='/net/scratch/people/plggosborcz', spark_conf={'spark.driver.memory': '90G', 'spark.executor.memory': '90G'}, default_reference='GRCh38') 

europeans = hl.import_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/1kg/europeans', delimiter = "\t", no_header = True)
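# Collect the sample IDs (column f0) of the European individuals to keep.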
to_keep = europeans['f0'].collect()



controls = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/1kg/gnomad.genomes.v3.1.2.hgdp_1kg_subset_sparse.mt')
controls = controls.filter_cols(hl.literal(to_keep).contains(controls.s))
controls.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/1kg/1kg-europeans-sparse.ht')
Example #14
def main(args):
    hl.init(default_reference="GRCh38", log="/variant_histograms.log")

    logger.info("Loading ANNOTATIONS_HISTS dictionary...")
    if not file_exists(annotation_hists_path()):
        raise DataException(
            "Annotation hists JSON file not found. Need to create this JSON before running script!"
        )

    with hl.hadoop_open(annotation_hists_path()) as a:
        ANNOTATIONS_HISTS = json.loads(a.read())

    # NOTE: histogram aggregations on these metrics are done on the entire callset (not just PASS variants), on raw data
    ht = hl.read_table(release_ht_path(public=False))
    ht = ht.select(freq=ht.freq, info=ht.info.select(*ANNOTATIONS_HISTS))

    inbreeding_bin_ranges = ANNOTATIONS_HISTS["InbreedingCoeff"]

    # Remove InbreedingCoeff from ANNOTATIONS_HISTS. It requires different ranges by allele frequency and needs to be
    # handled differently. It is stored as a dictionary in annotation_hists_path
    del ANNOTATIONS_HISTS["InbreedingCoeff"]

    logger.info("Getting info annotation histograms...")
    hist_ranges_expr = get_annotations_hists(ht, ANNOTATIONS_HISTS, LOG10_ANNOTATIONS)

    # Evaluate minimum and maximum values for each metric of interest to help determine the bounds of the hists
    # NOTE: Run this first, then update values in annotation_hists_path JSON as necessary
    if args.determine_bounds:
        logger.info(
            "Evaluating minimum and maximum values for each metric of interest. Maximum values capped at 1e10."
        )
        minmax_dict = {}
        for metric in ANNOTATIONS_HISTS:
            minmax_dict[metric] = hl.struct(
                min=hl.agg.min(ht.info[metric]),
                max=hl.if_else(
                    hl.agg.max(ht.info[metric]) < 1e10,
                    hl.agg.max(ht.info[metric]),
                    1e10,
                ),
            )
        minmax = ht.aggregate(hl.struct(**minmax_dict))
        logger.info(f"Metrics bounds: {minmax}")
    else:
        logger.info(
            "Aggregating hists over ranges defined in the annotation_hists_path JSON file. --determine_bounds can "
            "be used to help define these ranges..."
        )
        hists = ht.aggregate(
            hl.array(
                [
                    hist_expr.annotate(metric=hist_metric)
                    for hist_metric, hist_expr in hist_ranges_expr.items()
                ]
            )
            .extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(
                            hl.log10(ht.info.QUALapprox),
                            *ANNOTATIONS_HISTS["QUALapprox"],
                        ),
                    )
                ).map(lambda x: x[1].annotate(metric="QUALapprox-" + x[0]))
            )
            .extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(
                            hl.log10(ht.info.AS_QUALapprox),
                            *ANNOTATIONS_HISTS["AS_QUALapprox"],
                        ),
                    )
                ).map(lambda x: x[1].annotate(metric="AS_QUALapprox-" + x[0]))
            ),
            _localize=False,
        )

        # Defining hist range and bins for allele frequency groups because they needed different ranges
        ht = ht.annotate(af_bin=create_frequency_bins_expr_inbreeding(AF=ht.freq[1].AF))
        inbreeding_hists = [
            ht.aggregate(
                hl.agg.filter(
                    ht.af_bin == x,
                    hl.agg.hist(ht.info.InbreedingCoeff, *inbreeding_bin_ranges[x],),
                )
            ).annotate(metric="InbreedingCoeff" + "-" + x)
            for x in inbreeding_bin_ranges
        ]

        hists = hl.eval(hl.json(hists))
        inbreeding_hists = hl.eval(hl.json(inbreeding_hists))

        # Note: The following removes '}' from the JSON stored in hists and '{' from the JSON stored in
        # inbreeding_hists then joins them together to be written out as a single JSON
        hists = hists[:-1] + "," + inbreeding_hists[1:]

        logger.info("Writing output")
        with hl.hadoop_open(qual_hists_json_path(), "w") as f:
            f.write(hists)
Example #15
        #ht_out = ht_out.annotate(**covs[ht_out.key])
        ht_comb = ht_out.select(*p_max.keys(),
                                age=ht_out.phenotypes.age,
                                sex=ht_out.phenotypes.sex,
                                pheno=ht_out.phenotypes[pheno])

        output_location = args.ss_clump_prefix + pheno + '_apcdr_PRS'
        #ht_comb.describe()
        #ht_comb.write(output_location + '.ht', overwrite=args.overwrite)
        #ht_comb = hl.read_table(output_location + '.ht')
        ht_comb.export(output_location + '.txt.bgz')

    end = time.time()
    print("Success! Job was completed in %s" %
          time.strftime("%H:%M:%S", time.gmtime(end - start)))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--ss_clump_prefix',
        default='gs://apcdr/prs_sumstats_clump/apcdr_ukb_10k_eur_holdout_meta/'
    )
    parser.add_argument('--ss_suffix', default='.meta.bgz')
    parser.add_argument('--chr_pos_ref_alt_p_beta',
                        default='CHR,POS,A1,A2,P,BETA')
    parser.add_argument('--overwrite', action='store_true')
    args = parser.parse_args()

    hl.init(log='/prs.log')
    main(args)
Example #16
    pca_mt.write(f"{args.output_dir}/mt_pca.mt", overwrite=True)
    p = hl.plot.scatter(pca_mt.scores[0],
                        pca_mt.scores[1],
                        title='PCA',
                        xlabel='PC1',
                        ylabel='PC2')
    output_file(f"{args.plot_dir}/pca.html")
    save(p)


if __name__ == "__main__":
    # need to create spark cluster first before initialising hail
    sc = pyspark.SparkContext()
    # Define the hail persistent storage directory
    hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38")
    # s3 credentials are required for the user to access the datasets in the farm flexible compute s3 environment
    # you may use your own here from your .s3cfg file in your home directory
    hadoop_config = sc._jsc.hadoopConfiguration()

    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    #####################################################################
    ###################### INPUT DATA  ##############################
    #####################################################################
    parser = argparse.ArgumentParser()
    # Read the matrixtable, chrX and chrY should be included
    input_params = parser.add_argument_group("Input parameters")
    input_params.add_argument(
        "--matrixtable",
Example #17
def hailthread(cond1, q, cond2, qcm, inputDir, outputDir, qaws_size):

    #Load id_conversion file
    #table_idconv=hl.import_table('id_conversion')

    #Load markers files
    #table_makers_pos=hl.import_table('800k_to_extract_indexed2.txt',delimiter=':',no_header=True,impute=True)
    #table_markers_all=hl.import_table('800k_to_extract_indexed_alleles_gt2.txt',delimiter=':',no_header=True,impute=True)

    #cut -f 1 -d',' 800k_to_extract_indexed2.txt > interval_table
    #awk -F':' '{print $1"\t"$2"\t"$2}' interval_table > interval_table2

    hl.init()
    cond1.acquire()
    while not an_item_is_available(q):
        #print("Thread hail to sleep")
        #time.sleep(300)
        print("Thread hail to wait")

        cond1.wait()

    file = get_an_available_item(q)
    print("Thread hail get item " + file)
    qaws_size = qaws_size - 1
    cond1.release()

    interval_table = hl.import_locus_intervals('interval_table2',
                                               reference_genome='GRCh38')

    while file != "END":
        fileParts = file.split("/")[-1]
        fileName = fileParts.replace(".vcf.gz", "").replace(".gvcf.gz", "")
        chrName = fileName.split("_")[-3]
        #myFNAL=fileName.split("\\.")
        #myTempId=myFNAL[0]
        #Load gVCF file
        #data=hl.import_vcf("/mnt/vol1/java/gel_test.vcf",force_bgz=True,reference_genome='GRCh38')
        #data=hl.import_vcf("/mnt/vol1/java/gel_mainProgramme_aggV2_chr10_129040437_131178399.vcf.gz",force_bgz=True,reference_genome='GRCh38')
        try:

            #Extract INFO fields

            data = hl.import_vcf(inputDir + "/" + fileParts,
                                 force_bgz=True,
                                 reference_genome='GRCh38',
                                 drop_samples=True)
            #Filters PASS
            if chrName != "chrY":
                data = data.filter_rows(data.filters.size() > 0, keep=False)
            #Multiallelic
            data = hl.split_multi_hts(data)
            #Join with markers
            data_filtered = data.filter_rows(
                hl.is_defined(interval_table[data.locus]))

            data_sr = data_filtered.select_rows(
                data_filtered.info.medianDepthAll,
                data_filtered.info.medianDepthNonMiss,
                data_filtered.info.medianGQ, data_filtered.info.missingness,
                data_filtered.info.completeGTRatio, data_filtered.info.ABratio,
                data_filtered.info.MendelSite, data_filtered.info.AN,
                data_filtered.info.AC, data_filtered.info.AC_Hom,
                data_filtered.info.AC_Het)

            ht = data_sr.make_table()
            ht.export(outputDir + "/" + fileName + "_INFO.tsv")
            os.system("sed -i 's/\[//g' " + outputDir + "/" + fileName +
                      "_INFO.tsv")
            os.system("sed -i 's/]//g' " + outputDir + "/" + fileName +
                      "_INFO.tsv")
            os.system("cat " + outputDir + "/" + fileName +
                      "_INFO.tsv | grep -v locus " + " >> " + outputDir +
                      "/INFO_" + chrName)
            os.system("rm " + inputDir + "/" + fileParts)

            cond2.acquire()
            print("Thread hail make item available " + fileName)
            make_an_item_available(qcm, file)
            cond2.notify_all()
            cond2.release()
        except FatalError as e:
            print("Exception2 in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)

        except AssertionError as e:
            print("Exception3 in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)

        except Exception as e:
            print("Exception in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)

            #raise Exception
        cond1.acquire()
        while not an_item_is_available(q):
            #print("Thread hail to sleep")
            #time.sleep(300)
            print("Thread hail to wait")
            cond1.wait()

        file = get_an_available_item(q)
        print("Thread hail get item " + file)
        qaws_size = qaws_size - 1
        cond1.release()
    time.sleep(300)
    cond2.acquire()
    print("Thread hail make END available")
    make_an_item_available(qcm, "END")
    cond2.notify_all()
    cond2.release()
Example #18
File: helpers.py  Project: danking/hail
def startTestHailContext():
    global _initialized
    if not _initialized:
        hail.init(master='local[2]', min_block_size=0, quiet=True)
        _initialized = True
Example #19
import hail as hl

from hail.utils.java import Env, info

import logging
import flask

hl.init()

app = flask.Flask('hail-apiserver')

@app.route('/execute', methods=['POST'])
def execute():
    code = flask.request.json
    
    info(f'execute: {code}')
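    # Parse the posted IR text with Hail's internal IRParser and evaluate it on the JVM (internal APIs, subject to change).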
    
    jir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
    
    typ = hl.HailType._from_java(jir.typ())
    value = Env.hail().expr.ir.Interpret.interpretPyIR(code, {}, {})

    result = {
        'type': str(typ),
        'value': value
    }
    
    info(f'result: {result}')
    
    return flask.jsonify(result)
Example #20
import json
import hail as hl

gvcfs = ['gs://hail-ci/gvcfs/HG00096.g.vcf.gz',
         'gs://hail-ci/gvcfs/HG00268.g.vcf.gz']
hl.init(default_reference='GRCh38')
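# Explicit chr20 intervals; import_vcfs uses them as the partitioning of the imported gVCFs.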
parts = [
    {'start': {'locus': {'contig': 'chr20', 'position': 17821257}},
     'end': {'locus': {'contig': 'chr20', 'position': 18708366}},
     'includeStart': True,
     'includeEnd': True},
    {'start': {'locus': {'contig': 'chr20', 'position': 18708367}},
     'end': {'locus': {'contig': 'chr20', 'position': 19776611}},
     'includeStart': True,
     'includeEnd': True},
    {'start': {'locus': {'contig': 'chr20', 'position': 19776612}},
     'end': {'locus': {'contig': 'chr20', 'position': 21144633}},
     'includeStart': True,
     'includeEnd': True},
]
parts_str = json.dumps(parts)
vcfs = hl.import_vcfs(gvcfs, parts_str)
Example #21
File: utils.py  Project: saponas/hail
def handler(signum, frame):
    global _timeout_state
    _timeout_state = True
    hl.stop()
    hl.init(**_init_args)
    raise BenchmarkTimeoutError()
Example #22
def main():

    # # Args (local)
    # chrom = 11
    # chain_file = '/Users/em21/Projects/ot_genetics/genetics-sumstats_data/extras/prepare_uk_biobank_gwas_catalog/sitelist/input_data/grch37_to_grch38.over.chain.gz'
    # in_bgen = 'example_data/ukb_imp_chr{chrom}_v3.example.bgen'
    # in_sample = 'output/ukb_10k_downsampled.sample'
    # to_keep_list = 'output/ukb_10k_downsampled.sample_list.tsv'
    # out_plink = 'output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    # cores = 1 # Use "*" for all
    # maf_threshold = 0.001

    # Args (server)
    chrom = sys.argv[1]
    chain_file = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/grch37_to_grch38.over.chain.gz'
    in_bgen = '/nfs/users/nfs_e/em21/otcoregen/uk_biobank_data/data/genetics/imputation/ukb_imp_chr{chrom}_v3.bgen'
    in_sample = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample'
    to_keep_list = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample_list.tsv'
    out_plink = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    cores = sys.argv[2]  # Use "*" for all
    maf_threshold = 0.001

    # Set the maximum number of cores
    hl.init(master="local[{}]".format(cores))

    # Prepare liftover
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(chain_file, rg38)

    # Create my own rg38 with altered names
    rg38_custom_contigs = [
        contig.replace('chr', '') for contig in rg38.contigs
    ]
    rg38_custom_lens = {}
    for contig in rg38.lengths:
        rg38_custom_lens[contig.replace('chr', '')] = rg38.lengths[contig]
    rg38_custom = hl.ReferenceGenome('rg38_custom', rg38_custom_contigs,
                                     rg38_custom_lens)

    print('Processing chromosome {0}'.format(chrom))

    # Index bgen if not existing
    if not hl.hadoop_exists(in_bgen.format(chrom=chrom) + '.idx2'):
        hl.index_bgen(in_bgen.format(chrom=chrom),
                      contig_recoding={
                          "01": "1",
                          "02": "2",
                          "03": "3",
                          "04": "4",
                          "05": "5",
                          "06": "6",
                          "07": "7",
                          "08": "8",
                          "09": "9"
                      },
                      reference_genome='GRCh37')

    # Load bgen
    mt = hl.import_bgen(in_bgen.format(chrom=chrom),
                        entry_fields=['GT'],
                        sample_file=in_sample)

    # Load list samples to keep
    samples_to_keep = hl.import_table(to_keep_list,
                                      no_header=True,
                                      impute=False,
                                      types={
                                          'f0': hl.tstr
                                      }).key_by('f0')

    # Downsample to required subset of samples
    mt = mt.filter_cols(hl.is_defined(samples_to_keep[mt.s]))

    # Re-call to remove phasing (required for plink output)
    # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False))

    # Filter on MAF
    mt = hl.variant_qc(mt)
    mt = mt.annotate_rows(variant_qc=mt.variant_qc.annotate(
        MAF=hl.min(mt.variant_qc.AF)))
    mt = mt.filter_rows(mt.variant_qc.MAF >= maf_threshold)

    # Liftover
    mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38'))

    # Strip chr from contig name (causes problems with GCTA)
    mt = mt.annotate_rows(
        contig_GRCh38=mt.locus_GRCh38.contig.replace('chr', ''))

    # Swap GRCh37 locus for GRCh38 (but have to use rg38_custom)
    mt = mt.key_rows_by()
    mt = mt.annotate_rows(locus=hl.locus(mt.contig_GRCh38,
                                         mt.locus_GRCh38.position,
                                         reference_genome=rg38_custom))
    mt = mt.key_rows_by(mt.locus, mt.alleles)

    # Remove rows with missing locus (after liftover)
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    # Write plink format
    hl.export_plink(dataset=mt, output=out_plink.format(chrom=chrom))

    return 0
Example #23
def test_init_hail_context_twice(self):
    hl.init(hl.spark_context(), idempotent=True)  # Should be no error
Example #24
# coding: utf-8
import hail as hl
import hail.expr.aggregators as agg
import numpy as np
import matplotlib.pyplot as plt
from math import log, isnan
from pprint import pprint
import time
hl.init()  # Initialize Hail and Spark.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# ## key step
# ### 1. extract pca info, transform it to dataframe
# ### 2. build linear regression model, predict y and get y residuals
# ### 3. store y residuals in hail MatrixTable
# ### 4. run gwas and compare time

# Import a PLINK dataset (BED, BIM, FAM) as a MatrixTable
vds = hl.import_plink('gs://ukb_testdata/maf_0.01_10.bed',
                      'gs://ukb_testdata/maf_0.01_10.bim',
                      'gs://ukb_testdata/maf_0.01_10.fam')

# Import delimited text file (text table) as Table
# import phenotype
table = (hl.import_table('gs://ukb_testdata/sleep_duration.tsv',
                         delimiter='\t',
Example #25
def main(args):
    hl.init(log="/variant_qc_evaluation.log")

    if args.create_bin_ht:
        create_bin_ht(
            args.model_id,
            args.n_bins,
        ).write(
            get_score_bins(args.model_id, aggregated=False).path,
            overwrite=args.overwrite,
        )

    if args.run_sanity_checks:
        ht = get_score_bins(args.model_id, aggregated=False).ht()
        logger.info("Running sanity checks...")
        print(
            ht.aggregate(
                hl.struct(
                    was_biallelic=hl.agg.counter(~ht.was_split),
                    has_biallelic_rank=hl.agg.counter(
                        hl.is_defined(ht.biallelic_bin)),
                    was_singleton=hl.agg.counter(ht.singleton),
                    has_singleton_rank=hl.agg.counter(
                        hl.is_defined(ht.singleton_bin)),
                    was_biallelic_singleton=hl.agg.counter(ht.singleton
                                                           & ~ht.was_split),
                    has_biallelic_singleton_rank=hl.agg.counter(
                        hl.is_defined(ht.biallelic_singleton_bin)),
                )))

    if args.create_aggregated_bin_ht:
        logger.warning(
            "Use only workers, it typically crashes with preemptibles")
        create_aggregated_bin_ht(args.model_id).write(
            get_score_bins(args.model_id, aggregated=True).path,
            overwrite=args.overwrite,
        )

    if args.extract_truth_samples:
        logger.info(f"Extracting truth samples from MT...")
        mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True,
                              remove_hard_filtered_samples=False)

        mt = mt.filter_cols(
            hl.literal([v["s"]
                        for k, v in TRUTH_SAMPLES.items()]).contains(mt.s))
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

        # Checkpoint to prevent needing to go through the large table a second time
        mt = mt.checkpoint(
            get_checkpoint_path("truth_samples", mt=True),
            overwrite=args.overwrite,
        )

        for truth_sample in TRUTH_SAMPLES:
            truth_sample_mt = mt.filter_cols(
                mt.s == TRUTH_SAMPLES[truth_sample]["s"])
            # Filter to variants in truth data
            truth_sample_mt = truth_sample_mt.filter_rows(
                hl.agg.any(truth_sample_mt.GT.is_non_ref()))
            truth_sample_mt.naive_coalesce(args.n_partitions).write(
                get_callset_truth_data(truth_sample).path,
                overwrite=args.overwrite,
            )

    if args.merge_with_truth_data:
        for truth_sample in TRUTH_SAMPLES:
            logger.info(
                f"Creating a merged table with callset truth sample and truth data for {truth_sample}..."
            )

            # Load truth data
            mt = get_callset_truth_data(truth_sample).mt()
            truth_hc_intervals = TRUTH_SAMPLES[truth_sample][
                "hc_intervals"].ht()
            truth_mt = TRUTH_SAMPLES[truth_sample]["truth_mt"].mt()
            truth_mt = truth_mt.key_cols_by(
                s=hl.str(TRUTH_SAMPLES[truth_sample]["s"]))

            # Remove low quality sites
            info_ht = get_info(split=True).ht()
            mt = mt.filter_rows(~info_ht[mt.row_key].AS_lowqual)

            ht = create_truth_sample_ht(mt, truth_mt, truth_hc_intervals)
            ht.write(
                get_callset_truth_data(truth_sample, mt=False).path,
                overwrite=args.overwrite,
            )

    if args.bin_truth_sample_concordance:
        for truth_sample in TRUTH_SAMPLES:
            logger.info(
                f"Creating binned concordance table for {truth_sample} for model {args.model_id}"
            )
            ht = get_callset_truth_data(truth_sample, mt=False).ht()

            info_ht = get_info(split=True).ht()
            ht = ht.filter(
                ~info_ht[ht.key].AS_lowqual
                & ~hl.is_defined(telomeres_and_centromeres.ht()[ht.locus]))

            logger.info("Filtering out low confidence regions and segdups...")
            ht = filter_low_conf_regions(
                ht,
                filter_lcr=True,
                # TODO: Uncomment when we have decoy path
                filter_decoy=False,  # True,
                filter_segdup=True,
            )

            logger.info(
                "Loading HT containing RF or VQSR scores annotated with a bin based on the rank of score..."
            )
            metric_ht = get_score_bins(args.model_id, aggregated=False).ht()
            ht = ht.filter(hl.is_defined(metric_ht[ht.key]))

            ht = ht.annotate(score=metric_ht[ht.key].score)

            ht = compute_binned_truth_sample_concordance(
                ht, metric_ht, args.n_bins)
            ht.write(
                get_binned_concordance(args.model_id, truth_sample).path,
                overwrite=args.overwrite,
            )
Example #26
    get_expr_for_variant_loftee_flag_flag,
    get_expr_for_genes_with_loftee_flag_flag,
    get_expr_for_ref_allele,
    get_expr_for_start_pos,
    get_expr_for_variant_id,
    get_expr_for_vep_sorted_transcript_consequences_array,
    get_expr_for_xpos,
)

p = argparse.ArgumentParser()
p.add_argument("--input-url", help="URL of gnomAD 2.1 flattened Hail table to export", required=True)
p.add_argument("--output-url", help="URL to write shaped Hail table to", required=True)
p.add_argument("--subset", help="Filter variants to this chrom:start-end range")
args = p.parse_args()

hl.init(log="/tmp/hail.log")

ds = hl.read_table(args.input_url)

# The globals in the flattened Hail table cause a serialization error during export to ES.
ds = ds.select_globals()

if args.subset:
    subset_interval = hl.parse_locus_interval(args.subset)
    ds = ds.filter(subset_interval.contains(ds.locus))

####################
# Top level fields #
####################

# These fields remain at the top level
Example #27
def main(args):
    hl.init(log='/frequency_data_generation.log', default_reference='GRCh38')

    logger.info("Reading sparse MT and metadata table...")
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    meta_ht = meta.ht().select('pop', 'sex', 'project_id', 'release', 'sample_filters')

    if args.test:
        logger.info("Filtering to chr20:1-1000000")
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('chr20:1-1000000')])

    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    logger.info("Annotating sparse MT with metadata...")
    mt = mt.annotate_cols(meta=meta_ht[mt.s])
    mt = mt.filter_cols(mt.meta.release)
    samples = mt.count_cols()
    logger.info(f"Running frequency table prep and generation pipeline on {samples} samples")

    logger.info("Computing adj and sex adjusted genotypes.")
    mt = mt.annotate_entries(
        GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.meta.sex),
        adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Densify-ing...")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)

    logger.info("Setting het genotypes at sites with >1% AF (using v3.0 frequencies) and > 0.9 AB to homalt...")
    # hotfix for depletion of homozygous alternate genotypes
    # Using v3.0 AF to avoid an extra frequency calculation
    # TODO: Using previous callset AF works for small incremental changes to a callset, but we need to revisit for large increments
    freq_ht = freq.versions["3"].ht()
    freq_ht = freq_ht.select(AF=freq_ht.freq[0].AF)

    mt = mt.annotate_entries(
        GT=hl.cond(
            (freq_ht[mt.row_key].AF > 0.01)
            & mt.GT.is_het()
            & (mt.AD[1] / mt.DP > 0.9),
            hl.call(1, 1),
            mt.GT,
        )
    )

    logger.info("Calculating InbreedingCoefficient...")
    # NOTE: This is not the ideal location to calculate this, but added here to avoid another densify
    mt = mt.annotate_rows(InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    logger.info("Generating frequency data...")
    mt = annotate_freq(
        mt,
        sex_expr=mt.meta.sex,
        pop_expr=mt.meta.pop
    )

    # Select freq, FAF and popmax
    faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
    mt = mt.select_rows(
        'InbreedingCoeff',
        'freq',
        faf=faf,
        popmax=pop_max_expr(mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX)
    )
    mt = mt.annotate_globals(faf_meta=faf_meta)

    # Annotate quality metrics histograms, as these also require densifying
    mt = mt.annotate_rows(
        **qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Writing out frequency data...")
    if args.test:
        mt.rows().write("gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht", overwrite=True)
    else:
        mt.rows().write(freq.path, overwrite=args.overwrite)
Example #28
#conda activate hail
#cd /Users/mzekavat/opt/anaconda3/envs/hail
#hailctl dataproc start mz02 --master-machine-type n1-highmem-16 --worker-machine-type n1-highmem-16 --worker-boot-disk-size 200 --num-workers 3 --num-preemptible-workers 3 --master-boot-disk-size 100 --region us-east1 --zone us-east1-d --requester-pays-allow-all --properties "spark:spark.driver.memory=90G,spark:spark.driver.maxResultSize=50G,spark:spark.kryoserializer.buffer.max=1G,spark:spark.task.maxFailures=20,spark:spark.driver.extraJavaOptions=-Xss4M,spark:spark.executor.extraJavaOptions=-Xss4M,spark:spark.speculation=true"
#hailctl dataproc connect mz02 notebook --zone us-east1-d --region us-east1
#hailctl dataproc submit --zone us-east1-d --region us-east1 mz02 ~/Documents/Broad_2015_17/Python_Scripts_Hail/CHIP/Merge_SomaticVCFS_15000_30000.py
import hail as hl
import hail.expr.aggregators as agg
hl.init(default_reference="GRCh38")
import numpy as np
import pandas as pd
from collections import Counter
from math import log, isnan
from pprint import pprint
import time
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
output_notebook()

recoding_dict = {f"{i + 1}": f"chr{i + 1}" for i in range(22)}
recoding_dict['X'] = 'chrX'
recoding_dict['Y'] = 'chrY'

files = hl.import_table('gs://maryam_lipids/UKBB_CHIP/filenames.txt',
                        impute=True,
                        no_header=True)
files_list = [row['f0'] for row in files.select(files.f0).collect()]
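# Loop over entries 1-9999 of the file list, importing each single-sample somatic VCF.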

for num in range(1, 10000):
    print(num)
    filenamev2 = files_list[num].strip()
    mt = hl.import_vcf(filenamev2,
Example #29
def main(args):
    hl.init(log="/variant_qc_random_forest.log")

    if args.list_rf_runs:
        logger.info(f"RF runs:")
        pretty_print_runs(get_rf_runs(rf_run_path()))

    if args.annotate_for_rf:
        ht = create_rf_ht(
            impute_features=args.impute_features,
            adj=args.adj,
            n_partitions=args.n_partitions,
            checkpoint_path=get_checkpoint_path("rf_annotation"),
        )
        ht.write(
            get_rf_annotations(args.adj).path, overwrite=args.overwrite,
        )
        logger.info(f"Completed annotation wrangling for random forests model training")

    if args.train_rf:
        model_id = f"rf_{str(uuid.uuid4())[:8]}"
        rf_runs = get_rf_runs(rf_run_path())
        while model_id in rf_runs:
            model_id = f"rf_{str(uuid.uuid4())[:8]}"

        ht, rf_model = train_rf(
            get_rf_annotations(args.adj).ht(),
            fp_to_tp=args.fp_to_tp,
            num_trees=args.num_trees,
            max_depth=args.max_depth,
            no_transmitted_singletons=args.no_transmitted_singletons,
            no_inbreeding_coeff=args.no_inbreeding_coeff,
            vqsr_training=args.vqsr_training,
            vqsr_model_id=args.vqsr_model_id,
            filter_centromere_telomere=args.filter_centromere_telomere,
            test_intervals=args.test_intervals,
        )

        ht = ht.checkpoint(
            get_rf_training(model_id=model_id).path, overwrite=args.overwrite,
        )

        logger.info("Adding run to RF run list")
        rf_runs[model_id] = get_run_data(
            input_args={
                "transmitted_singletons": None
                if args.vqsr_training
                else not args.no_transmitted_singletons,
                "adj": args.adj,
                "vqsr_training": args.vqsr_training,
                "filter_centromere_telomere": args.filter_centromere_telomere,
            },
            test_intervals=args.test_intervals,
            features_importance=hl.eval(ht.features_importance),
            test_results=hl.eval(ht.test_results),
        )

        with hl.hadoop_open(rf_run_path(), "w") as f:
            json.dump(rf_runs, f)

        logger.info("Saving RF model")
        save_model(
            rf_model, get_rf_model_path(model_id=model_id), overwrite=args.overwrite,
        )

    else:
        model_id = args.model_id

    if args.apply_rf:
        logger.info(f"Applying RF model {model_id}...")
        rf_model = load_model(get_rf_model_path(model_id=model_id))
        ht = get_rf_training(model_id=model_id).ht()
        features = hl.eval(ht.features)
        ht = apply_rf_model(ht, rf_model, features, label=LABEL_COL)

        logger.info("Finished applying RF model")
        ht = ht.annotate_globals(rf_model_id=model_id)
        ht = ht.checkpoint(
            get_rf_result(model_id=model_id).path, overwrite=args.overwrite,
        )

        ht_summary = ht.group_by(
            "tp", "fp", TRAIN_COL, LABEL_COL, PREDICTION_COL
        ).aggregate(n=hl.agg.count())
        ht_summary.show(n=20)
Example #30
def main(args):
    data_type = "exomes" if args.exomes else "genomes"
    hl.init(log=f"/ccdg_sample_qc_{data_type}.log")
    # gcloud compute scp wlu-m:/hard_filter_genomes.log .
    if args.sample_qc:
        compute_sample_qc(data_type).write(
            get_ccdg_results_path(data_type=data_type, result="sample_qc_all"),
            overwrite=args.overwrite,
        )

    if args.impute_sex:
        compute_sex(data_type).write(
            get_ccdg_results_path(data_type=data_type, result="sex"),
            overwrite=args.overwrite,
        )
    # elif args.reannotate_sex:
    #     reannotate_sex(
    #         args.min_cov,
    #         (args.upper_x, (args.lower_xx, args.upper_xx), args.lower_xxx),
    #         ((args.lower_y, args.upper_y), args.lower_yy),
    #     ).write(
    #         get_ccdg_results_path(data_type=data_type, result="sex"),
    #         overwrite=args.overwrite,
    #     )
    ##### Wait for more information
    # if args.compute_hard_filters:
    #     compute_hard_filters(args.min_cov).write(
    #         hard_filtered_samples.path, overwrite=args.overwrite
    #     )

    if args.run_pc_relate or args.reannotate_relatedness:
        if args.run_pc_relate:
            logger.warning(
                "PC-relate requires SSDs and doesn't work with preemptible workers!"
            )
            relatedness_ht = compute_relatedness(
                data_type,
                overwrite=args.overwrite,
            )
        else:
            relatedness_ht = hl.read_table(
                get_ccdg_results_path(data_type=data_type, result="relatedness")
            ).checkpoint(
                "gs://ccdg/tmp/relatedness_ht_checkpoint.ht", overwrite=True
            )  # Copy HT to temp location to overwrite annotation
        relatedness_ht = annotate_relatedness(
            relatedness_ht,
            first_degree_kin_thresholds=tuple(args.first_degree_kin_thresholds),
            second_degree_min_kin=args.second_degree_kin_cutoff,
            ibd0_0_max=args.ibd0_0_max,
        )
        relatedness_ht.write(
            get_ccdg_results_path(data_type=data_type, result="relatedness"),
            overwrite=args.overwrite,
        )

    if args.compute_related_samples_to_drop:
        relatedness_ht = hl.read_table(
            get_ccdg_results_path(data_type=data_type, result="relatedness")
        )
        related_samples_to_remove = hl.maximal_independent_set(
            relatedness_ht.i, relatedness_ht.j, False
        ).checkpoint(
            get_ccdg_results_path(data_type=data_type, result="related_samples"),
            overwrite=args.overwrite,
        )

    if args.update_variant_filtered_pca_mt:
        pca_var_ht = hl.read_table(get_pca_variants_path())
        mt = hl.vds.to_dense_mt(get_qc_vds(data_type, split=True))
        mt = mt.filter_rows(hl.is_defined(pca_var_ht[mt.row_key])).checkpoint(
            get_pca_variants_path(ld_pruned=True, data=f"ccdg_{data_type}", mt=True),
            overwrite=args.overwrite,
            _read_if_exists=(not args.overwrite),
        )

    if args.run_pc_project:
        ## TODO: Rank samples and hard filter samples
        mt = hl.read_matrix_table(
            get_pca_variants_path(ld_pruned=True, data=f"ccdg_{data_type}", mt=True)
        )

        pca_loadings = hl.read_table(path_to_gnomad_loadings)

        pca_ht = hl.experimental.pc_project(
            mt.GT,
            pca_loadings.loadings,
            pca_loadings.pca_af,
        )

        pca_ht.checkpoint(
            get_ccdg_results_path(
                data_type=data_type, result="gnomad_pc_project_scores"
            ),
            overwrite=args.overwrite,
        )

        # related_ht = hl.read_table(
        #     get_ccdg_results_path(data_type=data_type, result="related_samples")
        # )
        #
        # related_mt = mt.filter_cols(hl.is_defined(related_mt[mt.col_key]), keep=True)
        # pca_mt = mt.filter_cols(hl.is_defined(related_mt[mt.col_key]), keep=False)

        # pca_ht = hl.experimental.pc_project(
        #     pca_mt.GT, pca_loadings.loadings, pca_loadings.pca_af
        # )
        # pca_mt = pca_mt.annotate_cols(scores=pca_ht[pca_mt.col_key].scores)
        #
        # related_ht = hl.experimental.pc_project(
        #     related_mt.GT, pca_loadings.loadings, pca_loadings.pca_af
        # )
        # related_mt = related_mt.annotate_cols(
        #     scores=related_ht[related_mt.col_key].scores
        # )

    if args.assign_pops:
        with hl.hadoop_open(
            path_to_gnomad_rf,
            "rb",
        ) as f:
            fit = pickle.load(f)

        # Reduce the scores to only those used in the RF model, this was 6 for v2 and 16 for v3.1
        n_pcs = fit.n_features_
        pca_ht = hl.read_table(
            get_ccdg_results_path(
                data_type=data_type, result="gnomad_pc_project_scores"
            )
        )
        pca_ht = pca_ht.annotate(scores=pca_ht.scores[:n_pcs])
        pop_ht, rf_model = assign_population_pcs(
            pca_ht,
            pc_cols=pca_ht.scores,
            fit=fit,
        )

        pop_ht = pop_ht.checkpoint(
            get_ccdg_results_path(data_type=data_type, result="pop_assignment"),
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite,
        )
        pop_ht.transmute(
            **{f"PC{i + 1}": pop_ht.pca_scores[i] for i in range(n_pcs)}
        ).export(
            get_ccdg_results_path(data_type=data_type, result="pop_assignment")[:-2]
            + "tsv"
        )

        with hl.hadoop_open(
            get_ccdg_results_path(data_type=data_type, result="pop_RF_fit")[:-2]
            + "pickle",
            "wb",
        ) as out:
            pickle.dump(rf_model, out)

    if args.calculate_inbreeding:
        qc_mt = hl.read_matrix_table(
            get_pca_variants_path(ld_pruned=True, data=f"ccdg_{data_type}", mt=True)
        )
        pop_ht = hl.read_table(
            get_ccdg_results_path(data_type=data_type, result="pop_assignment"),
        )
        qc_mt = qc_mt.annotate_cols(pop=pop_ht[qc_mt.col_key].pop)
        qc_mt = qc_mt.annotate_rows(
            call_stats_by_pop=hl.agg.group_by(
                qc_mt.pop, hl.agg.call_stats(qc_mt.GT, qc_mt.alleles)
            )
        )
        inbreeding_ht = (
            qc_mt.annotate_cols(
                inbreeding=hl.agg.inbreeding(
                    qc_mt.GT, qc_mt.call_stats_by_pop[qc_mt.pop].AF[1]
                )
            )
            .cols()
            .select("inbreeding")
        )
        inbreeding_ht.write(
            get_ccdg_results_path(data_type=data_type, result="inbreeding"),
            overwrite=args.overwrite,
        )

    if args.apply_stratified_filters or args.apply_regressed_filters:
        filtering_qc_metrics = args.filtering_qc_metrics.split(",")
        sample_qc_ht = hl.read_table(
            get_ccdg_results_path(data_type=data_type, result="sample_qc_bi_allelic")
        )
        pc_scores = hl.read_table(
            get_ccdg_results_path(data_type=data_type, result="pc_scores")
        )
        # Use annotate (not select) so the QC metrics referenced below are retained
        sample_qc_ht = sample_qc_ht.annotate(
            scores=pc_scores[sample_qc_ht.key]["scores"],
        )
        pop_ht = hl.read_table(
            get_ccdg_results_path(data_type=data_type, result="pop_assignment"),
        )

        if "inbreeding" in filtering_qc_metrics:
            inbreeding_ht = hl.read_table(
                get_ccdg_results_path(data_type=data_type, result="inbreeding")
            )[sample_qc_ht.key]
            sample_qc_ht = sample_qc_ht.annotate(
                inbreeding=inbreeding_ht.inbreeding.f_stat
            )

        if args.apply_regressed_filters:
            n_pcs = args.regress_n_pcs
            residuals_ht = compute_qc_metrics_residuals(
                ht=sample_qc_ht,
                pc_scores=sample_qc_ht.scores[:n_pcs],
                qc_metrics={
                    metric: sample_qc_ht[metric] for metric in filtering_qc_metrics
                },
            )
            residuals_ht = residuals_ht.filter(
                hl.is_missing(hard_filtered_samples.ht()[residuals_ht.key])
            )
            stratified_metrics_ht = compute_stratified_metrics_filter(
                ht=residuals_ht,
                qc_metrics=dict(residuals_ht.row_value),
                metric_threshold={
                    "n_singleton_residual": (math.inf, 8.0),
                    "r_het_hom_var_residual": (math.inf, 4.0),
                },
            )

            residuals_ht = residuals_ht.annotate(
                **stratified_metrics_ht[residuals_ht.key]
            )
            residuals_ht = residuals_ht.annotate_globals(
                **stratified_metrics_ht.index_globals(),
                n_pcs=n_pcs,
            )
        else:
            logger.info(
                "Computing stratified QC metrics filters using metrics: "
                + ", ".join(filtering_qc_metrics)
            )
            sample_qc_ht = sample_qc_ht.annotate(qc_pop=pop_ht[sample_qc_ht.key].pop)
            # TODO: compute hard-filtered samples
            sample_qc_ht = sample_qc_ht.filter(
                hl.is_missing(hard_filtered_samples.ht()[sample_qc_ht.key])
            )
            stratified_metrics_ht = compute_stratified_metrics_filter(
                sample_qc_ht,
                qc_metrics={
                    metric: sample_qc_ht[metric] for metric in filtering_qc_metrics
                },
                strata={"qc_pop": sample_qc_ht.qc_pop},
                metric_threshold={"n_singleton": (4.0, 8.0)},
            )
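            # Hedged sketch, not part of the original snippet (which is truncated
            # here): these lines only illustrate how the stratified filters could be
            # joined back onto the sample QC table, mirroring what the regressed
            # branch above does for the residuals table.
            sample_qc_ht = sample_qc_ht.annotate(
                **stratified_metrics_ht[sample_qc_ht.key]
            )
            sample_qc_ht = sample_qc_ht.annotate_globals(
                **stratified_metrics_ht.index_globals()
            )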
Example #31
0
def tabix(b, ss_path, out_dir):
    r'''
    Tabix-indexes a bgzipped file at GCS path `ss_path` using Batch `b`.
    '''
    fname = ss_path.split('/')[-1]
    f = b.read_input(ss_path)
    j = b.new_job(name=fname.split('.')[0])
    # '-c chr' treats the header line (which begins with "chr") as a comment
    j.command(f'tabix -s 1 -b 2 -e 2 -c chr {f}')
    j.command(f'mv {f}.tbi {j.ofile}')
    b.write_output(j.ofile, f'{out_dir}/{fname}.tbi')


if __name__ == "__main__":
    hl.init(log='/Users/nbaya/Downloads/tabix_sumstats.log')
    backend = hb.ServiceBackend(billing_project='ukb_diverse_pops',
                                bucket='ukbb-diverse-temp-30day/nb-batch-tmp')

    b = hb.Batch(
        name='tabix',
        backend=backend,
        default_image='gcr.io/ukbb-diversepops-neale/nbaya_tabix:latest',
        default_storage='100M',  # works with 2G
        default_cpu=1)

    #    sumstats_dir = f'{bucket}/sumstats_flat_files'
    #    sumstats_dir = f'{ldprune_dir}/export_results/update'
    #    sumstats_dir = f'{ldprune_dir}/loo/sumstats/batch1'
    sumstats_dir = f'{ldprune_dir}/variant_qc'
    print(f'\nUsing sumstats from {sumstats_dir}')
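    # Hedged sketch, not part of the original snippet (which is truncated here):
    # one way to finish the script would be to submit a tabix job per bgzipped
    # file under `sumstats_dir` and then run the batch. The '.bgz' suffix filter
    # is an assumption about how the sumstats files are named.
    paths = [entry['path'] for entry in hl.hadoop_ls(sumstats_dir)
             if entry['path'].endswith('.bgz')]
    for ss_path in paths:
        tabix(b, ss_path, out_dir=f'{sumstats_dir}/tabix')
    b.run()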
Example #32
0
from hail.expr.expressions import *
from hail.expr.expressions import Expression
from hail.typecheck import *
from hail import Table
import hail

from google.cloud import storage
client = storage.Client()
import gcsfs
fs = gcsfs.GCSFileSystem(project='your-project')
bucket = client.get_bucket('your-bucket')

import hail as hl
import hail.expr.aggregators as agg
hl.init()

#read mt file
mt = hl.read_matrix_table(
    "gs://1k_genome/1000-genomes/VDS-of-all/ALL.chr.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.mt"
)
#print(mt.count()) (39706715, 1092)

#filter MAF
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)
#print(mt.count()) (13404583, 1092)

#filter only SNPs
mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
#print(mt.count()) (12194564, 1092)
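# Hedged sketch, not in the original snippet: persist the common-SNP subset so
# the MAF and SNP filters above do not have to be recomputed; the output path
# below is a placeholder.
mt.write("gs://1k_genome/1000-genomes/VDS-of-all/common_snps.mt", overwrite=True)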
Example #33
0
def main():
    parser = argparse.ArgumentParser()
    # reference args
    parser.add_argument(
        '--ref-dirname',
        default=
        'gs://hgdp-1kg/hgdp_tgp/datasets_for_others/lindo/ds_without_outliers/'
    )
    parser.add_argument('--ref-basename', default='unrelated')
    parser.add_argument(
        '--ref-info',
        default=
        'gs://hgdp-1kg/hgdp_tgp/gwaspy_pca_ref/hgdp_1kg_sample_info.unrelateds.pca_outliers_removed.with_project.tsv'
    )
    parser.add_argument('--reference', type=str, default='GRCh38')
    parser.add_argument('--pca-type',
                        type=str,
                        default='normal',
                        choices=['normal', 'project', 'joint'])

    # data args
    parser.add_argument('--data-dirname', type=str, required=True)
    parser.add_argument('--data-basename', type=str, required=True)
    parser.add_argument('--input-type',
                        type=str,
                        required=True,
                        choices=['vcf', 'plink', 'hail'])

    # filter args
    parser.add_argument('--maf',
                        type=float,
                        default=0.05,
                        help='include only SNPs with MAF >= NUM in PCA')
    parser.add_argument('--hwe',
                        type=float,
                        default=1e-3,
                        help='include only SNPs with HWE >= NUM in PCA')
    parser.add_argument('--geno',
                        type=float,
                        default=0.98,
                        help='include only SNPs with call-rate > NUM')
    # note: argparse 'choices' cannot express a continuous float range, so the
    # [0.0, 1.0] bound is documented in the help text instead
    parser.add_argument(
        '--ld-cor',
        type=float,
        default=0.2,
        metavar="[0.0-1.0]",
        help=
        'Squared correlation threshold (exclusive upper bound). Must be in the range [0.0, 1.0]'
    )
    parser.add_argument(
        '--ld-window',
        type=int,
        default=250000,
        help='Window size in base pairs (inclusive upper bound)')
    parser.add_argument('--npcs',
                        type=int,
                        default=20,
                        help='Number of PCs to use')
    parser.add_argument('--relatedness-method',
                        type=str,
                        default='pc_relate',
                        choices=['pc_relate', 'ibd', 'king'],
                        help='Method to use for the inference of relatedness')
    parser.add_argument('--relatedness-thresh',
                        type=float,
                        default=0.98,
                        help='Threshold value to use in relatedness checks')
    parser.add_argument(
        '--prob',
        type=float,
        default=0.8,
        help=
        'Minimum probability of belonging to a given population for the population to be set'
    )
    parser.add_argument('--out-dir', type=str, required=True)

    args = parser.parse_args()

    if args.prob == parser.get_default('prob'):
        print(f'No prob value specified, the default of {args.prob} will be used')

    hl.init(default_reference=args.reference)

    pca(ref_dirname=args.ref_dirname,
        ref_basename=args.ref_basename,
        ref_info=args.ref_info,
        reference=args.reference,
        pca_type=args.pca_type,
        input_type=args.input_type,
        data_dirname=args.data_dirname,
        data_basename=args.data_basename,
        maf=args.maf,
        hwe=args.hwe,
        call_rate=args.geno,
        ld_cor=args.ld_cor,
        ld_window=args.ld_window,
        n_pcs=args.npcs,
        relatedness_method=args.relatedness_method,
        relatedness_thresh=args.relatedness_thresh,
        prob_threshold=args.prob,
        out_dir=args.out_dir)

    print('\nDone running PCA')
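# Hypothetical command-line usage (paths and the script name 'pca.py' are
# illustrative, not from the original source):
#
#   python pca.py \
#       --data-dirname gs://my-bucket/genotypes/ \
#       --data-basename my_cohort \
#       --input-type plink \
#       --pca-type project \
#       --out-dir gs://my-bucket/pca_output/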
Example #34
0
    )
    ht = ht.annotate(
        validated_denovo_inheritance=ht_val_filtered[ht.key].inheritance)

    ht.write(
        f'{lustre_dir}/variant_qc/models/{run_hash}_rf_result_FINAL_for_RANKING_100_trios.ht',
        overwrite=True)


if __name__ == "__main__":
    # need to create the Spark context before initialising Hail
    sc = pyspark.SparkContext()
    # Define the hail persistent storage directory

    hl.init(sc=sc,
            tmp_dir=lustre_dir,
            local_tmpdir=lustre_dir,
            default_reference="GRCh38")

    # S3 credentials are required to access the datasets in the farm's flexible compute S3 environment.
    # You may use your own here, taken from the .s3cfg file in your home directory.
    hadoop_config = sc._jsc.hadoopConfiguration()

    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    ################################

    #################################

    main()
Example #35
0
import hail as hl
import sys
import timeit

start = timeit.default_timer()

chrom = str(sys.argv[1])

hl.init(log='/hail.log', min_block_size=2048, default_reference='GRCh38')

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# define files
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# input
vds_ldpruned_common_file = 'gs://ccdg-qc-multi/vds/qced/' + chrom + '/ldpruned_common.vds'
vds_1kg_file = 'gs://ccdg-qc-multi/data/1000genomes/vds/hail2_ALL.GRCh38.genotypes.20170504.vds'
mhc_chr8inv_file = 'gs://ccdg-qc-multi/data/MHC_invchr8_longLDreg_liftover_to_GRCh38.txt'
rel_exclusion_file = 'gs://ccdg-qc-multi/out/king/' + chrom + '/ibd_greater_0884_' + chrom + '.txt'
samples_to_keep_file = 'gs://ccdg-qc-multi/qc_measures/' + chrom + '/01_sample_qc_keep.txt'

# output
pca_value_file = 'gs://ccdg-qc-multi/qc_measures/pca/' + chrom + '/pca_values.tsv'
pca_score_file = 'gs://ccdg-qc-multi/qc_measures/pca/' + chrom + '/pca_scores.tsv'

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# read data
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

## interval list
#mhc_chr8inv = hl.import_table(mhc_chr8inv_file, no_header=True).key_by('f0')
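# Hedged sketch, not part of the original (truncated) snippet: one plausible way
# to continue, assuming the .vds paths above are Hail 0.2 MatrixTables and the
# text files are single-column lists of sample IDs.
vds = hl.read_matrix_table(vds_ldpruned_common_file)
samples_to_keep = hl.import_table(samples_to_keep_file, no_header=True).key_by('f0')
rel_exclusion = hl.import_table(rel_exclusion_file, no_header=True).key_by('f0')
vds = vds.filter_cols(hl.is_defined(samples_to_keep[vds.s]))
vds = vds.filter_cols(hl.is_missing(rel_exclusion[vds.s]))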
Example #36
0
    sys.exit(1)
else:
    filter_constraint = True

if not os.path.isfile(args.vep_config):
    logger.error(f"Could not find vep config file {args.vep_config}")
    sys.exit(1)

# Prepare output path
if not os.path.exists(os.path.abspath(os.path.dirname(args.output))):
    os.makedirs(os.path.abspath(os.path.dirname(args.output)))

# Set hail temporary path
hl.init(
    idempotent=True,
    tmp_dir=args.tmp_dir,
    log=os.path.join(args.tmp_dir, 'hail.log'),
)

##
# Main script
#
logger.info(f"Reading pedigree file {args.fam}")
pedigree = hl.Pedigree.read(args.fam)

logger.info(f"Importing vcf file {args.vcf}")
data = hl.import_vcf(args.vcf,
                     call_fields=['GT'],
                     skip_invalid_loci=True,
                     force_bgz=True)
data = hl.split_multi_hts(data)
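# Hedged sketch, not part of the original (truncated) snippet: with the pedigree
# and the split dataset in hand, a typical next step is to build a trio matrix
# for downstream inheritance / de novo analyses.
trio_mt = hl.trio_matrix(data, pedigree, complete_trios=True)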
Example #37
0
"""

Annotate variant HailTable with allelic frequencies from different (external) sources
(e.g., gnomad exomes and genomes)

"""

import hail as hl

from utils.data_utils import (get_gnomad_genomes_v3_af_ht, get_bonn_af_ht,
                              get_germ_af_ht, get_rum_af_ht,
                              get_vep_annotation_ht)

from utils.generic import current_date

hl.init(default_reference='GRCh38')

nfs_dir = 'file:///home/ubuntu/data'
nfs_tmp = 'file:///home/ubuntu/data/tmp'
hdfs_dir = 'hdfs://spark-master:9820/dir/hail_data'

## import variant table
variant_ht = get_vep_annotation_ht()

## import af tables
# In-house German allele frequencies (Tuebingen)
ht_ger_af = get_germ_af_ht()

# In-house German allele frequencies (Bonn)
bonn_af = get_bonn_af_ht()
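# Hedged sketch, not part of the original (truncated) snippet: the external AF
# tables would typically be joined onto the variant table on its (locus, alleles)
# key; the annotation field names below are illustrative.
gnomad_af = get_gnomad_genomes_v3_af_ht()
variant_ht = variant_ht.annotate(
    gnomad_genomes_af=gnomad_af[variant_ht.key],
    german_af=ht_ger_af[variant_ht.key],
    bonn_af=bonn_af[variant_ht.key],
)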
Example #38
0
def main(args):
    hl.init()
    data_type = "genomes" if args.genomes else "exomes"

    if not args.skip_write_qc_mt:
        logger.info("Importing data...")
        # 1h40 for exomes, 3h20 for genomes
        mt = get_gnomad_data(
            data_type, raw=True, split=False
        )  # NOTE: using full calls since hardcalls doesn't exist at this stage
        logger.info(
            "Filtering to bi-allelic, high-callrate, common SNPs for sample QC..."
        )
        mt = mt.filter_rows((hl.len(mt.alleles) == 2)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1])
                            & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > 0.001)
                            & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99))
        mt.annotate_cols(callrate=hl.agg.fraction(hl.is_defined(
            mt.GT))).naive_coalesce(5000).write(qc_mt_path(data_type),
                                                overwrite=args.overwrite)
    qc_mt = hl.read_matrix_table(qc_mt_path(data_type))

    logger.info("Importing metadata...")
    meta_ht = hl.import_table(qc_meta_path(data_type),
                              impute=True,
                              types={
                                  'age': hl.tfloat64
                              }).key_by('s')
    qc_mt = qc_mt.annotate_cols(**meta_ht[qc_mt.s])

    logger.info("Inferring sex...")
    qc_ht = annotate_sex(qc_mt,
                         qc_temp_data_prefix(data_type),
                         male_threshold=0.8 if args.genomes else 0.6).cols()
    # Flag individuals with Klinefelter syndrome and samples with other sex aneuploidies
    if args.exomes:
        qc_ht = qc_ht.annotate(
            ambiguous_sex=((qc_ht.f_stat >= 0.5) &
                           (hl.is_defined(qc_ht.normalized_y_coverage) &
                            (qc_ht.normalized_y_coverage <= 0.1))) |
            (hl.is_missing(qc_ht.f_stat)) |
            ((qc_ht.f_stat >= 0.4) & (qc_ht.f_stat <= 0.6) &
             (hl.is_defined(qc_ht.normalized_y_coverage) &
              (qc_ht.normalized_y_coverage > 0.1))),
            sex_aneuploidy=(qc_ht.f_stat < 0.4)
            & hl.is_defined(qc_ht.normalized_y_coverage) &
            (qc_ht.normalized_y_coverage > 0.1))
    else:
        qc_ht = qc_ht.annotate(ambiguous_sex=hl.is_missing(qc_ht.is_female))

    logger.info("Annotating samples failing hard filters...")
    if args.exomes:
        sex_expr = (hl.case().when(qc_ht.ambiguous_sex, "ambiguous_sex").when(
            qc_ht.sex_aneuploidy,
            "sex_aneuploidy").when(qc_ht.is_female, "female").default("male"))
    else:
        sex_expr = (hl.case().when(qc_ht.ambiguous_sex, "ambiguous_sex").when(
            qc_ht.is_female, "female").default("male"))
    qc_ht = qc_ht.annotate(
        hard_filters=make_hard_filters_expr(qc_ht, data_type),
        perm_filters=make_perm_filters_expr(qc_ht, data_type),
        sex=sex_expr,
        data_type=data_type).key_by('data_type', 's')
    qc_ht.write(qc_ht_path(data_type, 'hard_filters'),
                overwrite=args.overwrite)

    # Export annotations to make rank list for relatedness (in final sample QC)
    if args.exomes:
        colnames = ['internal', 'project_id', 'pct_bases_20x', 'perm_filters']
    else:
        colnames = ['pcr_free', 'mean_dp', 'perm_filters']
    rank_ht = qc_ht.filter(hl.len(qc_ht.hard_filters) == 0,
                           keep=True).select(*colnames)
    (rank_ht.annotate(releasable=(
        hl.len(rank_ht.perm_filters) == 0)).drop('perm_filters').export(
            rank_annotations_path(data_type)))

    # Check numbers:
    qc_ht = hl.read_table(qc_ht_path(data_type, 'hard_filters'))
    sample_count = qc_ht.count()
    checkpoint1a = qc_ht.aggregate(
        hl.agg.count_where(hl.len(qc_ht['hard_filters']) == 0))
    checkpoint1b = qc_ht.aggregate(
        hl.agg.count_where((hl.len(qc_ht['hard_filters']) == 0)
                           & (hl.len(qc_ht.perm_filters) == 0)))
    logger.info('{} samples found before filtering'.format(sample_count))
    logger.info('{} samples found after checkpoint 1a (hard filters)'.format(
        checkpoint1a))
    logger.info(
        '{} samples found after checkpoint 1b (hard filters + permissions)'.
        format(checkpoint1b))