Example #1
def make_perm_filters_expr(ht: hl.Table,
                           data_type: str) -> hl.expr.SetExpression:
    """
    NOTE: syndip will remain dropped wrt to permissions, but all possible QC measures will still be calculated

    :param Table ht: input MT
    :param str data_type: 'exomes' or 'genomes'
    :return: output MT
    :rtype: SetExpression
    """
    if data_type == 'genomes':
        perm_filters = {'not_releasable': ~ht.releasable_2_1}
    else:
        perm_filters = {
            'tcga_tumor': ht.tcga_tumor,
            'tcga_barcode': ht.tcga_weird_barcode,
            'tcga_below_30': ht.tcga_below_30,
            'specific_exclusion': ht.specific_exclusion,
            'esp': ht.esp,
            'not_releasable': ht.non_releasable,
            'syndip': ht.syndip
        }
    return hl.set(
        hl.filter(lambda x: hl.is_defined(x), [
            hl.or_missing(filter_expr, name)
            for name, filter_expr in perm_filters.items()
        ]))
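
A minimal usage sketch, assuming a sample metadata Table (here called meta_ht) that carries the boolean permission fields referenced above:

# Hypothetical usage; `meta_ht` and its annotations are assumptions.
meta_ht = meta_ht.annotate(
    perm_filters=make_perm_filters_expr(meta_ht, data_type='exomes'))
# Samples with no triggered permission filters have an empty set:
releasable_ht = meta_ht.filter(hl.len(meta_ht.perm_filters) == 0)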
Example #2
def set_female_y_metrics_to_na_expr(
        t: Union[hl.Table, hl.MatrixTable]) -> hl.expr.ArrayExpression:
    """
    Set Y-variant frequency callstats for female-specific metrics to missing structs.

    .. note:: Requires freq, freq_meta, and freq_index_dict annotations to be present in Table or MatrixTable

    :param t: Table or MatrixTable for which to adjust female metrics
    :return: Hail array expression to set female Y-variant metrics to missing values
    """
    female_idx = hl.map(
        lambda x: t.freq_index_dict[x],
        hl.filter(lambda x: x.contains("XX"), t.freq_index_dict.keys()),
    )
    freq_idx_range = hl.range(hl.len(t.freq_meta))

    new_freq_expr = hl.if_else(
        (t.locus.in_y_nonpar() | t.locus.in_y_par()),
        hl.map(
            lambda x: hl.if_else(female_idx.contains(x),
                                 missing_callstats_expr(), t.freq[x]),
            freq_idx_range,
        ),
        t.freq,
    )

    return new_freq_expr
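
A hedged usage sketch: given a Table ht that already carries the required freq, freq_meta, and freq_index_dict annotations, the returned array expression simply replaces the existing freq field:

# Hypothetical usage; `ht` and its annotations are assumed to exist.
ht = ht.annotate(freq=set_female_y_metrics_to_na_expr(ht))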
Example #3
def make_hard_filters_expr(ht: hl.Table,
                           data_type: str) -> hl.expr.SetExpression:
    """
    NOTE: additional metadata in Kristen's import file is hard-coded

    :param: Table ht: input MT
    :param: str data_type: 'exomes' or 'genomes'
    :return: output MT
    :rtype: SetExpression
    """
    hard_filters = {
        'contamination': ht.freemix > 0.05,
        'callrate': ht.callrate < 0.85,
        'chimera': ht.pct_chimeras > 0.05,
        'ambiguous_sex': ht.ambiguous_sex
    }

    if data_type == 'exomes':
        hard_filters.update({
            'coverage': ht.mean_chr20_coverage == 0,
            'sex_aneuploidy': ht.sex_aneuploidy
        })
    else:
        hard_filters.update({
            'coverage': ht.mean_dp < 15,
            'insert_size': ht.median_insert_size < 250
        })
    return hl.set(
        hl.filter(lambda x: hl.is_defined(x), [
            hl.or_missing(filter_expr, name)
            for name, filter_expr in hard_filters.items()
        ]))
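
As with the permission filters above, the resulting set can be used to drop flagged samples; a sketch assuming a metadata Table meta_ht with the referenced QC fields:

# Hypothetical usage; `meta_ht` and its QC annotations are assumptions.
meta_ht = meta_ht.annotate(
    hard_filters=make_hard_filters_expr(meta_ht, data_type='genomes'))
meta_ht = meta_ht.filter(hl.len(meta_ht.hard_filters) == 0)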
Example #4
def make_pop_filters_expr(ht: hl.Table,
                          qc_metrics: List[str]) -> hl.expr.SetExpression:
    return hl.set(
        hl.filter(lambda x: hl.is_defined(x), [
            hl.or_missing(ht[f'fail_{metric}'], metric)
            for metric in qc_metrics
        ]))
def add_popmax_expr(freq: hl.expr.ArrayExpression,
                    freq_meta: hl.expr.ArrayExpression,
                    populations: Set[str]) -> hl.expr.StructExpression:
    """
    Calculates popmax (frequency data for the population with the highest AF)

    :param ArrayExpression freq: ArrayExpression of Structs with ['AC', 'AF', 'AN', 'homozygote_count']
    :param ArrayExpression freq_meta: ArrayExpression of meta dictionaries corresponding to freq
    :param set of str populations: Set of populations over which to calculate popmax
    :return: Popmax frequency struct, or missing if no population qualifies
    :rtype: StructExpression
    """
    pops_to_use = hl.literal(populations)
    freq = hl.map(lambda x: x[0].annotate(meta=x[1]), hl.zip(freq, freq_meta))
    freq_filtered = hl.filter(
        lambda f: (f.meta.size() == 2) & (f.meta.get('group') == 'adj') &
        pops_to_use.contains(f.meta.get('pop')) & (f.AC > 0), freq)
    sorted_freqs = hl.sorted(freq_filtered, key=lambda x: x.AF, reverse=True)
    return hl.or_missing(
        hl.len(sorted_freqs) > 0,
        hl.struct(AC=sorted_freqs[0].AC,
                  AF=sorted_freqs[0].AF,
                  AN=sorted_freqs[0].AN,
                  homozygote_count=sorted_freqs[0].homozygote_count,
                  pop=sorted_freqs[0].meta['pop']))
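
For context, a sketch of how the popmax struct might be attached to a sites Table; the table and annotation names here are illustrative, not from the original:

# Hypothetical usage; `ht.freq` and `ht.freq_meta` are assumed row annotations.
ht = ht.annotate(
    popmax=add_popmax_expr(ht.freq, ht.freq_meta,
                           populations={'afr', 'amr', 'eas', 'nfe', 'sas'}))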
def make_filters_expr(ht: hl.Table,
                      qc_metrics: Iterable[str]) -> hl.expr.SetExpression:
    return hl.set(
        hl.filter(
            lambda x: hl.is_defined(x),
            [
                hl.or_missing(ht[f"fail_{metric}"], metric)
                for metric in qc_metrics
            ],
        ))
Example #7
def get_expression_proportion(tx_table, tissues_to_filter, gene_maximum_ht):
    """Compute per-tissue expression proportion (tissue expression / gene maximum) and the mean proportion per row."""

    if tissues_to_filter:
        print("Filtering tissues:", tissues_to_filter)
        tx_table = tx_table.drop(*tissues_to_filter)

    remaining_tissue_columns = list(
        set(tx_table.row) -
        {'locus', 'alleles', 'csq', 'ensg', 'symbol', 'lof', 'lof_flag'})

    tx_table = tx_table.annotate(tx_expression={
        tissue_id: tx_table[tissue_id]
        for tissue_id in remaining_tissue_columns
    })

    tx_table = tx_table.key_by('ensg').join(gene_maximum_ht.key_by("ensg"))

    expression_proportion_table = tx_table.annotate(
        expression_proportion_dict={
            tissue_id: tx_table.tx_expression[tissue_id] /
            tx_table.gene_maximum_dict[tissue_id]
            for tissue_id in remaining_tissue_columns
        })

    columns_to_drop = list(
        set(expression_proportion_table.row) - {
            'locus', 'alleles', 'csq', 'ensg', 'symbol', 'lof', 'lof_flag',
            'expression_proportion_dict'
        })

    expression_proportion_table = expression_proportion_table.drop(
        *columns_to_drop)

    expression_proportion_table = expression_proportion_table.annotate(
        **{
            tissue_id:
            expression_proportion_table.expression_proportion_dict[tissue_id]
            for tissue_id in remaining_tissue_columns
        })

    expression_proportion_table = expression_proportion_table.annotate(
        mean_proportion=hl.mean(hl.filter(lambda e: ~hl.is_nan(e), [
            expression_proportion_table[tissue_id]
            for tissue_id in remaining_tissue_columns
        ]),
                                filter_missing=True))

    expression_proportion_table = expression_proportion_table.drop(
        expression_proportion_table.expression_proportion_dict).key_by(
            'locus', 'alleles', 'ensg')

    return expression_proportion_table
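
A hedged call sketch; the input paths, tissue name, and table contents are illustrative only:

# Hypothetical inputs; paths and the dropped tissue are assumptions.
tx_ht = hl.read_table('data/tx_annotation.ht')
gene_max_ht = hl.read_table('data/gene_maximums.ht')
ep_ht = get_expression_proportion(
    tx_ht, tissues_to_filter=['Testis'], gene_maximum_ht=gene_max_ht)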
Example #8
def add_variant_type(alt_alleles: hl.expr.ArrayExpression) -> hl.expr.StructExpression:
    """
    Get Struct of variant_type and n_alt_alleles from ArrayExpression of Strings (all alleles)
    """
    ref = alt_alleles[0]  # despite the parameter name, this array holds all alleles; the first is the reference
    alts = alt_alleles[1:]
    non_star_alleles = hl.filter(lambda a: a != '*', alts)
    return hl.struct(variant_type=hl.cond(
        hl.all(lambda a: hl.is_snp(ref, a), non_star_alleles),
        hl.cond(hl.len(non_star_alleles) > 1, "multi-snv", "snv"),
        hl.cond(
            hl.all(lambda a: hl.is_indel(ref, a), non_star_alleles),
            hl.cond(hl.len(non_star_alleles) > 1, "multi-indel", "indel"),
            "mixed")
    ), n_alt_alleles=hl.len(non_star_alleles))
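
Because the struct is built purely from the allele strings, it can be evaluated locally; a small worked example (star alleles are excluded from the classification):

# First element is the reference allele; '*' is ignored.
result = hl.eval(add_variant_type(hl.literal(['A', 'T', '*'])))
# result == Struct(variant_type='snv', n_alt_alleles=1)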
Example #9
def annotate_fields(mt, gencode_release, gencode_path):
    """Annotate SV rows with collected genotypes, gene-level consequences, and derived fields."""
    genotypes = hl.agg.collect(
        hl.struct(sample_id=mt.s,
                  gq=mt.GQ,
                  cn=mt.RD_CN,
                  num_alt=hl.if_else(hl.is_defined(mt.GT),
                                     mt.GT.n_alt_alleles(), -1)))
    rows = mt.annotate_rows(genotypes=genotypes).rows()

    rows = rows.annotate(**{k: v(rows) for k, v in CORE_FIELDS.items()})

    gene_id_mapping = hl.literal(
        load_gencode(gencode_release, download_path=gencode_path))

    rows = rows.annotate(
        sortedTranscriptConsequences=hl.flatmap(
            lambda x: x,
            hl.filter(lambda x: hl.is_defined(x), [
                rows.info[col].map(lambda gene: hl.struct(
                    gene_symbol=gene,
                    gene_id=gene_id_mapping[gene],
                    predicted_consequence=col.split('__')[-1])) for col in [
                        gene_col for gene_col in rows.info
                        if gene_col.startswith('PROTEIN_CODING__')
                        and rows.info[gene_col].dtype == hl.dtype('array<str>')
                    ]
            ])),
        sv_type=rows.alleles[1].replace('[<>]', '').split(':', 2),
    )

    DERIVED_FIELDS.update({
        'filters':
        lambda rows: hl.if_else(
            hl.len(rows.filters) > 0, rows.filters,
            hl.missing(hl.dtype('array<str>')))
    })
    rows = rows.annotate(**{k: v(rows) for k, v in DERIVED_FIELDS.items()})

    rows = rows.rename({'rsid': 'variantId'})

    return rows.key_by().select(*FIELDS)
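
CORE_FIELDS, DERIVED_FIELDS, and FIELDS are module-level globals defined elsewhere; a minimal sketch of the shape this function assumes, with illustrative field names only:

# Illustrative shape only; the real mappings live in the surrounding module.
CORE_FIELDS = {
    'contig': lambda rows: rows.locus.contig,
    'start': lambda rows: rows.locus.position,
}
DERIVED_FIELDS = {
    'end': lambda rows: rows.info.END,  # hypothetical derived field
}
FIELDS = ['variantId', 'contig', 'start', 'end', 'genotypes']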
Example #10
def add_popmax_expr(freq: hl.expr.ArrayExpression) -> hl.expr.ArrayExpression:
    """
    Calculates popmax (add an additional entry into freq with popmax: pop)

    :param ArrayExpression freq: ArrayExpression of Structs with ['ac', 'an', 'hom', 'meta']
    :return: Frequency data with annotated popmax
    :rtype: ArrayExpression
    """
    freq_filtered = hl.filter(
        lambda x:
        (x.meta.keys() == ['population']) & (x.meta['population'] != 'oth'),
        freq)
    sorted_freqs = hl.sorted(freq_filtered,
                             key=lambda x: x.ac / x.an,
                             reverse=True)
    return hl.cond(
        hl.len(sorted_freqs) > 0,
        freq.append(
            hl.struct(ac=sorted_freqs[0].ac,
                      an=sorted_freqs[0].an,
                      hom=sorted_freqs[0].hom,
                      meta={'popmax': sorted_freqs[0].meta['population']})),
        freq)
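
Unlike the struct-returning variant earlier, this version appends the popmax entry to freq itself; a hedged usage sketch on an assumed sites Table:

# Hypothetical usage; `ht.freq` with fields ac/an/hom/meta is assumed.
ht = ht.annotate(freq=add_popmax_expr(ht.freq))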
Example #11
def apply_filter_flags_expr(
    mt: hl.MatrixTable, data_type: str, metric_thresholds: dict
) -> hl.expr.SetExpression:
    """
    Annotates table with flags for elevated contamination and chimera as well as low coverage and call rate
    :param Table mt: input MatrixTable
    :param str data_type: 'WES' or 'WGS' for selecting coverage threshold
    :param dict metric_thresholds: dictionary where key is metric and value is threshold value
    :return: Set of sequencing metric flags
    :rtype: SetExpression
    """
    flags = {
        "callrate": mt.filtered_callrate < metric_thresholds["callrate_thres"],
        "contamination": mt.PCT_CONTAMINATION
        > metric_thresholds[
            "contam_thres"
        ],  # TODO: revisit current thresholds and rename once switched to Kristen's script output
        "chimera": mt.AL_PCT_CHIMERAS > metric_thresholds["chimera_thres"],
    }
    if data_type == "WES":
        flags.update(
            {
                "coverage": mt.HS_PCT_TARGET_BASES_20X
                < metric_thresholds["wes_cov_thres"]
            }
        )
    else:
        flags.update(
            {"coverage": mt.WGS_MEAN_COVERAGE < metric_thresholds["wgs_cov_thres"]}
        )

    return hl.set(
        hl.filter(
            lambda x: hl.is_defined(x),
            [hl.or_missing(filter_expr, name) for name, filter_expr in flags.items()],
        )
    )
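
A hedged column-annotation sketch; the threshold values below are illustrative, not the project's actual configuration:

# Hypothetical thresholds; real values come from project configuration.
thresholds = {'callrate_thres': 0.85, 'contam_thres': 0.05,
              'chimera_thres': 0.05, 'wes_cov_thres': 0.85,
              'wgs_cov_thres': 15}
mt = mt.annotate_cols(
    filter_flags=apply_filter_flags_expr(mt, 'WES', thresholds))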
Example #12
def main(args):
    # Initializing Hail on cluster mode
    init_hail_on_cluster(tmp_dir=HAIL_TMP_DIR,
                         log_file=HAIL_LOG_PATH,
                         local_mode=True)

    # 1- Aggregate MatrixTable per gene/consequences creating gene/csq X sample matrix

    # Read MatrixTable
    mt = hl.read_matrix_table(args.mt_input_path)

    # Annotate csq group info per variants
    # Define consequences variant rules with hail expressions
    # TODO: check if field exist in dataset
    csq_group_rules = {}
    if args.ptv:
        csq_group_rules.update({'PTV': mt.csq_type == 'PTV'})
    if args.pav:
        csq_group_rules.update({'PAV': mt.csq_type == 'PAV'})
    if args.syn:
        csq_group_rules.update({'SYN': mt.csq_type == 'SYN'})
    if args.cadd:
        csq_group_rules.update({
            'CADD':
            (mt.csq_type == 'PAV') & (mt.cadd_phred >= args.cadd_threshold)
        })
    if args.mpc:
        csq_group_rules.update(
            {'MPC': (mt.csq_type == 'PAV') & (mt.mpc >= args.mpc_threshold)})

    # Annotate groups per variants
    mt = (mt.annotate_rows(csq_group=csq_group_rules))

    # Transmute csq_group and convert to set (easier to explode and grouping later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # Group mt by gene/csq_group.
    mt_grouped = (mt.group_rows_by(
        mt.csq_group, mt.symbol).partition_hint(100).aggregate(
            n_het=hl.agg.count_where(mt.GT.is_het())))

    # 2- Annotate gene set information

    # Import/parsing gene cluster table
    clusters = hl.import_table(args.gene_set_path, no_header=True)

    # parsing gene set column
    clusters = (clusters.transmute(genes=hl.set(clusters['f1'].split(
        delim='[|]'))))

    clusters = (clusters.explode(clusters.genes))

    clusters = (clusters.group_by('genes').partition_hint(100).aggregate(
        cluster_name=hl.agg.collect_as_set(clusters['f0'])).key_by('genes'))

    # annotate gene set info
    mt_grouped = (mt_grouped.annotate_rows(
        cluster_name=clusters[mt_grouped.symbol].cluster_name))

    # 3- Aggregate per gene set and consequences

    # Group mt by gene set/csq_group.
    mt_grouped = (mt_grouped.explode_rows(mt_grouped.cluster_name))
    mt_grouped = (mt_grouped.group_rows_by(
        mt_grouped.cluster_name,
        mt_grouped.csq_group).partition_hint(100).aggregate(
            n_het=hl.agg.sum(mt_grouped.n_het)))

    # force to eval all aggregation operation by writing mt to disk
    mt_grouped = mt_grouped.persist(storage_level='DISK_ONLY')

    if args.logistic_regression:
        # covariates list
        covs = list(args.covs_list)

        # Define x expression (entries/genotype)
        x_expr = 'n_het'

        extra_annotations = {'analysis': 'all_cases', 'covariates': covs}

        tb_stats = logistic_regression(mt=mt_grouped,
                                       x_expr=x_expr,
                                       response=args.phenotype_field,
                                       covs=covs,
                                       pass_through=[],
                                       extra_fields=extra_annotations)
        # export table
        tb_stats.export(args.output_path)

    if args.fet:
        pass  # TODO: implement Fisher Exact-based burden gene set test

    hl.stop()
def main(args):

    # Initializing Hail on cluster mode
    hl.init()

    # 1- Aggregate MatrixTable per gene/consequences creating gene/csq X sample matrix
    # Read MatrixTable
    mt = hl.read_matrix_table(args.mt_input_path)

    # Annotate csq group info per variants
    # Define consequences variant rules with hail expressions
    # TODO: check if fields exist in dataset
    csq_group_rules = {}
    if args.ptv:
        csq_group_rules.update({'PTV': mt.csq_type == 'PTV'})
    if args.pav:
        csq_group_rules.update({'PAV': mt.csq_type == 'PAV'})
    if args.syn:
        csq_group_rules.update({'SYN': mt.csq_type == 'SYN'})
    if args.cadd:
        csq_group_rules.update({
            'CADD':
            (mt.csq_type == 'PAV') & (mt.cadd_phred >= args.cadd_threshold)
        })

    # Annotate groups per variants
    mt = (mt.annotate_rows(csq_group=csq_group_rules))

    # Transmute csq_group and convert to set (easier to explode and grouping later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # Group mt by gene/csq_group.
    mt_grouped = (mt.group_rows_by(
        mt.csq_group, mt.symbol).partition_hint(100).aggregate(
            n_het=hl.agg.count_where(mt.GT.is_het())))

    # force to eval all aggregation operation by writing mt to disk
    # mt_grouped = mt_grouped.persist(storage_level='DISK_ONLY')

    if args.logistic_regression:
        # covariates list
        covs = list(args.covs_list)

        # Define x expression (entries/genotype)
        x_expr = 'n_het'

        extra_annotations = {'analysis': 'all_cases', 'covariates': covs}

        tb_stats = logistic_regression(mt=mt_grouped,
                                       x_expr=x_expr,
                                       response=args.phenotype_field,
                                       covs=covs,
                                       pass_through=[],
                                       extra_fields=extra_annotations)
        # export table
        tb_stats.export(args.output_path)

    if args.fet:
        pass  # TODO: implement gene-based Fisher Exact burden test

    hl.stop()
Example #14
def main(args):
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable with adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds,
                           part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info(
            'Writing sample qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                 overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:

        logger.info('Applying per variant QC filtering...')

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')
        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info(
            'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                 overwrite=True)

    # 3. Annotate AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()

        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]

        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))

        logger.info(
            'Writing qc-filtered MT filtered to external maf to disk...')
        mt.write(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                 overwrite=True)

    # 4. ##### Burden Test ######

    logger.info('Running burden test...')

    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()

    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Add cases/controls sample annotations
    tb_sample = get_sample_meta_data()
    mt = (mt.annotate_cols(**tb_sample[mt.s]))

    mt = (mt.filter_cols(mt['phe.is_case'] | mt['phe.is_control']))

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variant into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update dict expr annotations with combinations of variant consequences categories
    score_expr_ann.update({
        'missC': (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                          (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                          (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })

    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })

    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group and convert dict to set where the group is defined
    # (easier to explode and grouping later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # print('Number of samples/variants: ')
    # print(mt.count())

    # Group mt by gene/csq_group.
    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(mt.GT.is_het()) >= 2,
        homs_chets=(hl.agg.count_where(mt.GT.is_het()) >= 2) |
        (hl.agg.any(mt.GT.is_hom_var()))).repartition(100).persist())
    mts = []

    if args.homs:
        # select homs genotypes.

        mt_homs = (mt_grouped.select_entries(
            mac=mt_grouped.homs).annotate_rows(agg_genotype='homs'))

        mts.append(mt_homs)

    if args.chets:
        # select compound hets (chets) genotypes.
        mt_chets = (mt_grouped.select_entries(
            mac=mt_grouped.chets).annotate_rows(agg_genotype='chets'))

        mts.append(mt_chets)

    if args.homs_chets:
        # select chets and/or homs genotypes.
        mt_homs_chets = (mt_grouped.select_entries(
            mac=mt_grouped.homs_chets).annotate_rows(
                agg_genotype='homs_chets'))

        mts.append(mt_homs_chets)

    if args.hets:
        # select hets genotypes
        mt_hets = (mt_grouped.select_entries(
            mac=mt_grouped.hets).annotate_rows(agg_genotype='hets'))

        mts.append(mt_hets)

    ## Joint MatrixTables
    mt_grouped = hl.MatrixTable.union_rows(*mts)

    # Generate table of counts
    tb_gene = (mt_grouped.annotate_rows(
        n_cases=hl.agg.filter(mt_grouped['phe.is_case'],
                              hl.agg.sum(mt_grouped.mac)),
        n_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                  hl.agg.sum(mt_grouped.mac)),
        n_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                     hl.agg.sum(mt_grouped.mac)),
        n_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                 hl.agg.sum(mt_grouped.mac)),
        n_total_cases=hl.agg.filter(mt_grouped['phe.is_case'], hl.agg.count()),
        n_total_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                        hl.agg.count()),
        n_total_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                           hl.agg.count()),
        n_total_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                       hl.agg.count())).rows())

    # run fet stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']

    tbs = []
    for proband in analysis:
        logger.info(f'Running test for {proband}...')
        colCases = None
        colTotalCases = None
        colControls = 'n_controls'
        colTotalControls = 'n_total_controls'
        if proband == 'all_cases':
            colCases = 'n_cases'
            colTotalCases = 'n_total_cases'
        if proband == 'syndromic':
            colCases = 'n_syndromic'
            colTotalCases = 'n_total_syndromic'
        if proband == 'nonsyndromic':
            colCases = 'n_nonsyndromic'
            colTotalCases = 'n_total_nonsyndromic'

        tb_fet = compute_fisher_exact(tb=tb_gene,
                                      n_cases_col=colCases,
                                      n_control_col=colControls,
                                      total_cases_col=colTotalCases,
                                      total_controls_col=colTotalControls,
                                      correct_total_counts=True,
                                      root_col_name='fet',
                                      extra_fields={
                                          'analysis': proband,
                                          'maf': maf_cutoff
                                      })

        # filter out zero-count genes
        tb_fet = (tb_fet.filter(
            hl.sum([tb_fet[colCases], tb_fet[colControls]]) > 0, keep=True))

        tbs.append(tb_fet)

    tb_final = hl.Table.union(*tbs)

    tb_final.describe()

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.fet_burden.{run_hash}.ht'

    tb_final = (tb_final.checkpoint(output=output_path))

    if args.write_to_file:
        # write table to disk as TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()
Example #15
def main(args):
    hl.init()

    # Read in all sumstats
    mt = load_final_sumstats_mt(filter_phenos=True,
                                filter_variants=False,
                                filter_sumstats=True,
                                separate_columns_by_pop=False,
                                annotate_with_nearest_gene=False)

    # Annotate per-entry sample size
    def get_n(pheno_data, i):
        return pheno_data[i].n_cases + hl.or_else(pheno_data[i].n_controls, 0)

    mt = mt.annotate_entries(summary_stats=hl.map(
        lambda x: x[1].annotate(N=hl.or_missing(hl.is_defined(x[1]),
                                                get_n(mt.pheno_data, x[0]))),
        hl.zip_with_index(mt.summary_stats)))

    # Exclude entries with low confidence flag.
    if not args.keep_low_confidence_variants:
        mt = mt.annotate_entries(summary_stats=hl.map(
            lambda x: hl.or_missing(~x.low_confidence, x), mt.summary_stats))

    # Run fixed-effect meta-analysis (all + leave-one-out)
    mt = mt.annotate_entries(unnorm_beta=mt.summary_stats.BETA /
                             (mt.summary_stats.SE**2),
                             inv_se2=1 / (mt.summary_stats.SE**2))
    mt = mt.annotate_entries(
        sum_unnorm_beta=all_and_leave_one_out(mt.unnorm_beta,
                                              mt.pheno_data.pop),
        sum_inv_se2=all_and_leave_one_out(mt.inv_se2, mt.pheno_data.pop))
    mt = mt.transmute_entries(META_BETA=mt.sum_unnorm_beta / mt.sum_inv_se2,
                              META_SE=hl.map(lambda x: hl.sqrt(1 / x),
                                             mt.sum_inv_se2))
    mt = mt.annotate_entries(
        META_Pvalue=hl.map(lambda x: 2 * hl.pnorm(x), -hl.abs(mt.META_BETA /
                                                              mt.META_SE)))

    # Run heterogeneity test (Cochran's Q)
    mt = mt.annotate_entries(META_Q=hl.map(
        lambda x: hl.sum((mt.summary_stats.BETA - x)**2 * mt.inv_se2),
        mt.META_BETA),
                             variant_exists=hl.map(lambda x: ~hl.is_missing(x),
                                                   mt.summary_stats.BETA))
    mt = mt.annotate_entries(META_N_pops=all_and_leave_one_out(
        mt.variant_exists, mt.pheno_data.pop))
    mt = mt.annotate_entries(META_Pvalue_het=hl.map(
        lambda i: hl.pchisqtail(mt.META_Q[i], mt.META_N_pops[i] - 1),
        hl.range(hl.len(mt.META_Q))))

    # Add other annotations
    mt = mt.annotate_entries(
        ac_cases=hl.map(lambda x: x["AF.Cases"] * x.N, mt.summary_stats),
        ac_controls=hl.map(lambda x: x["AF.Controls"] * x.N, mt.summary_stats),
        META_AC_Allele2=all_and_leave_one_out(
            mt.summary_stats.AF_Allele2 * mt.summary_stats.N,
            mt.pheno_data.pop),
        META_N=all_and_leave_one_out(mt.summary_stats.N, mt.pheno_data.pop))
    mt = mt.annotate_entries(
        META_AF_Allele2=mt.META_AC_Allele2 / mt.META_N,
        META_AF_Cases=all_and_leave_one_out(mt.ac_cases, mt.pheno_data.pop) /
        mt.META_N,
        META_AF_Controls=all_and_leave_one_out(mt.ac_controls,
                                               mt.pheno_data.pop) / mt.META_N)
    mt = mt.drop('unnorm_beta', 'inv_se2', 'variant_exists', 'ac_cases',
                 'ac_controls', 'summary_stats', 'META_AC_Allele2')

    # Format everything into array<struct>
    def is_finite_or_missing(x):
        return (hl.or_missing(hl.is_finite(x), x))

    meta_fields = [
        'BETA', 'SE', 'Pvalue', 'Q', 'Pvalue_het', 'N', 'N_pops', 'AF_Allele2',
        'AF_Cases', 'AF_Controls'
    ]
    mt = mt.transmute_entries(meta_analysis=hl.map(
        lambda i: hl.struct(
            **{
                field: is_finite_or_missing(mt[f'META_{field}'][i])
                for field in meta_fields
            }), hl.range(hl.len(mt.META_BETA))))

    col_fields = ['n_cases', 'n_controls']
    mt = mt.annotate_cols(
        **{
            field: all_and_leave_one_out(mt.pheno_data[field],
                                         mt.pheno_data.pop)
            for field in col_fields
        })
    col_fields += ['pop']
    mt = mt.annotate_cols(pop=all_and_leave_one_out(
        mt.pheno_data.pop,
        mt.pheno_data.pop,
        all_f=lambda x: x,
        loo_f=lambda i, x: hl.filter(lambda y: y != x[i], x),
    ))
    mt = mt.transmute_cols(meta_analysis_data=hl.map(
        lambda i: hl.struct(**{field: mt[field][i]
                               for field in col_fields}),
        hl.range(hl.len(mt.pop))))

    mt.describe()
    mt.write(get_meta_analysis_results_path(), overwrite=args.overwrite)

    hl.copy_log('gs://ukb-diverse-pops/combined_results/meta_analysis.log')
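
The meta-analysis script above leans on an external helper, all_and_leave_one_out, defined elsewhere in its codebase. A minimal sketch of the behavior assumed here (one aggregate over all populations followed by one leave-one-out value per population); this is an approximation, not the original definition:

def all_and_leave_one_out(x, pop_array, all_f=hl.sum,
                          loo_f=lambda i, x: hl.sum(x) - hl.or_else(x[i], 0)):
    # Element 0 aggregates over all populations; element i+1 leaves out
    # the population at index i of `pop_array`.
    return (hl.array([all_f(x)])
            .extend(hl.map(lambda i: loo_f(i, x),
                           hl.range(hl.len(pop_array)))))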
Example #16
def merge_alleles(alleles) -> ArrayExpression:
    # alleles is tarray(tarray(tstruct(ref=tstr, alt=tstr)))
    return hl.rbind(hl.array(hl.set(hl.flatten(alleles))),
                    lambda arr:
                    hl.filter(lambda a: a.alt != '<NON_REF>', arr)
                      .extend(hl.filter(lambda a: a.alt == '<NON_REF>', arr)))
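
A small local evaluation, with illustrative allele values, showing that concrete alternate alleles are kept in front and <NON_REF> entries are moved to the end:

# Hypothetical input; the set conversion dedupes, so ordering is not
# guaranteed beyond <NON_REF> structs coming last.
alleles = hl.literal([
    [hl.Struct(ref='A', alt='T'), hl.Struct(ref='A', alt='<NON_REF>')],
    [hl.Struct(ref='A', alt='C'), hl.Struct(ref='A', alt='<NON_REF>')],
])
result = hl.eval(merge_alleles(alleles))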
Example #17
def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given a set or multiple sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:

    .. math::

        \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j

    *  :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic
       for variant :math:`j` resulting from a test of association between
       variant :math:`j` and a trait.
    *  :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant
       :math:`j`, calculated as the sum of squared correlation coefficients
       between variant :math:`j` and nearby variants. See :func:`ld_score`
       for further details.
    *  :math:`a` captures the contribution of confounding biases, such as
       cryptic relatedness and uncontrolled population structure, to the
       association test statistic.
    *  :math:`h_g^2` is the SNP-heritability, or the proportion of variation
       in the trait explained by the effects of variants included in the
       regression model above.
    *  :math:`M` is the number of variants used to estimate :math:`h_g^2`.
    *  :math:`N` is the number of samples in the underlying association study.

    For more details on the method implemented in this function, see:

    * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__

    Examples
    --------

    Run the method on a matrix table of summary statistics, where the rows
    are variants and the columns are different phenotypes:

    >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=mt_gwas['ld_score'],
    ...     ld_score_expr=mt_gwas['ld_score'],
    ...     chi_sq_exprs=mt_gwas['chi_squared'],
    ...     n_samples_exprs=mt_gwas['n'])


    Run the method on a table with summary statistics for a single
    phenotype:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=ht_gwas['chi_squared_50_irnt'],
    ...     n_samples_exprs=ht_gwas['n_50_irnt'])

    Run the method on a table with summary statistics for multiple
    phenotypes:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'],
    ...                        ht_gwas['chi_squared_20160']],
    ...     n_samples_exprs=[ht_gwas['n_50_irnt'],
    ...                      ht_gwas['n_20160']])

    Notes
    -----
    The ``exprs`` provided as arguments to :func:`.ld_score_regression`
    must all be from the same object, either a :class:`Table` or a
    :class:`MatrixTable`.

    **If the arguments originate from a table:**

    *  The table must be keyed by fields ``locus`` of type
       :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and
       ``n_samples_exprs`` must be row-indexed fields.
    *  The number of expressions passed to ``n_samples_exprs`` must be
       equal to one or the number of expressions passed to
       ``chi_sq_exprs``. If just one expression is passed to
       ``n_samples_exprs``, that sample size expression is assumed to
       apply to all sets of statistics passed to ``chi_sq_exprs``.
       Otherwise, the expressions passed to ``chi_sq_exprs`` and
       ``n_samples_exprs`` are matched by index.
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have generic :obj:`int` values
       ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc.
       expressions passed to the ``chi_sq_exprs`` argument.

    **If the arguments originate from a matrix table:**

    *  The dimensions of the matrix table must be variants
       (rows) by phenotypes (columns).
    *  The rows of the matrix table must be keyed by fields
       ``locus`` of type :class:`.tlocus` and ``alleles``,
       a :py:data:`.tarray` of :py:data:`.tstr` elements.
    *  The columns of the matrix table must be keyed by a field
       of type :py:data:`.tstr` that uniquely identifies phenotypes
       represented in the matrix table. The column key must be a single
       expression; compound keys are not accepted.
    *  ``weight_expr`` and ``ld_score_expr`` must be row-indexed
       fields.
    *  ``chi_sq_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  ``n_samples_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have values corresponding to the
       column keys of the input matrix table.

    This function returns a :class:`Table` with one row per set of summary
    statistics passed to the ``chi_sq_exprs`` argument. The following
    row-indexed fields are included in the table:

    *  **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The
       returned table is keyed by this field. See the notes below for
       details on the possible values of this field.
    *  **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared
       test statistic for the given phenotype.
    *  **intercept** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          intercept :math:`1 + Na`.
       -  **standard_error**  (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    *  **snp_heritability** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          SNP-heritability :math:`h_g^2`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    Warning
    -------
    :func:`.ld_score_regression` considers only the rows for which both row
    fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing
    values in either field are removed prior to fitting the LD score
    regression model.

    Parameters
    ----------
    weight_expr : :class:`.Float64Expression`
                  Row-indexed expression for the LD scores used to derive
                  variant weights in the model.
    ld_score_expr : :class:`.Float64Expression`
                    Row-indexed expression for the LD scores used as covariates
                    in the model.
    chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of
                        :class:`.Float64Expression`
                        One or more row-indexed (if table) or entry-indexed
                        (if matrix table) expressions for chi-squared
                        statistics resulting from genome-wide association
                        studies.
    n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of
                     :class:`.NumericExpression`
                     One or more row-indexed (if table) or entry-indexed
                     (if matrix table) expressions indicating the number of
                     samples used in the studies that generated the test
                     statistics supplied to ``chi_sq_exprs``.
    n_blocks : :obj:`int`
               The number of blocks used in the jackknife approach to
               estimating standard errors.
    two_step_threshold : :obj:`int`
                         Variants with chi-squared statistics greater than this
                         value are excluded in the first step of the two-step
                         procedure used to fit the model.
    n_reference_panel_variants : :obj:`int`, optional
                                 Number of variants used to estimate the
                                 SNP-heritability :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability estimates
        for each phenotype passed to the function."""

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or
            (len(n_samples_exprs) == 1))
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr',
            weight_expr,
            ds._row_indices)
    analyze('ld_score_regression/ld_score_expr',
            ld_score_expr,
            ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating
                from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if
                originating from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single
                phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr',
                chi_sq_exprs[0],
                ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr',
                n_samples_exprs[0],
                ds._entry_indices)

        ds = ds._select_all(row_exprs={'__locus': ds['locus'],
                                       '__alleles': ds['alleles'],
                                       '__w_initial': weight_expr,
                                       '__w_initial_floor': hl.max(weight_expr,
                                                                   1.0),
                                       '__x': ld_score_expr,
                                       '__x_floor': hl.max(ld_score_expr,
                                                           1.0)},
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={'__y': chi_sq_exprs[0],
                                         '__n': n_samples_exprs[0]})
        ds = ds.annotate_entries(**{'__w': ds['__w_initial']})

        ds = ds.filter_rows(hl.is_defined(ds['__locus']) &
                            hl.is_defined(ds['__alleles']) &
                            hl.is_defined(ds['__w_initial']) &
                            hl.is_defined(ds['__x']))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n, ds._row_indices)

        ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(**{'__locus': ds['locus'],
                                 '__alleles': ds['alleles'],
                                 '__w_initial': weight_expr,
                                 '__x': ld_score_expr},
                              **{y: chi_sq_exprs[i]
                                 for i, y in enumerate(ys)},
                              **{w: weight_expr for w in ws},
                              **{n: n_samples_exprs[i]
                                 for i, n in enumerate(ns)}))
        ds = ds.key_by(ds['__locus'], ds['__alleles'])

        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        hts = [ds.select(**{'__w_initial': ds['__w_initial'],
                            '__w_initial_floor': hl.max(ds['__w_initial'],
                                                        1.0),
                            '__x': ds['__x'],
                            '__x_floor': hl.max(ds['__x'], 1.0),
                            '__y_name': i,
                            '__y': ds[ys[i]],
                            '__w': ds[ws[i]],
                            '__n': hl.int(ds[ns[i]])})
               for i, y in enumerate(ys)]

        mts = [ht.to_matrix_table(row_key=['__locus',
                                           '__alleles'],
                                  col_key=['__y_name'],
                                  row_fields=['__w_initial',
                                              '__w_initial_floor',
                                              '__x',
                                              '__x_floor'])
               for ht in hts]

        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])

        ds = ds.filter_rows(hl.is_defined(ds['__locus']) &
                            hl.is_defined(ds['__alleles']) &
                            hl.is_defined(ds['__w_initial']) &
                            hl.is_defined(ds['__x']))

    ds_tmp_file = new_temp_file()
    ds.write(ds_tmp_file)
    mt = hl.read_matrix_table(ds_tmp_file)

    if not n_reference_panel_variants:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    # block variants for each phenotype
    n_phenotypes = mt.count_cols()

    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt['__y']) &
                                         (mt['__y'] < two_step_threshold)),
                             __in_step2=hl.is_defined(mt['__y']))

    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt['__in_step1']),
                          __m_step2=hl.agg.count_where(mt['__in_step2']))

    mt = mt.annotate_rows(__step1_array=hl.agg.collect(
                                            hl.struct(
                                                __col_idx=mt['__col_idx'],
                                                __in_step1=mt['__in_step1'])),
                          __step2_array=hl.agg.collect(
                                            hl.struct(
                                                __col_idx=mt['__col_idx'],
                                                __in_step2=mt['__in_step2'])))

    mt = mt.annotate_rows(
        __step1_idx_array=[
            hl.struct(
                __col_idx=mt['__step1_array'][i]['__col_idx'],
                __step1_idx=hl.scan.count_where(
                    mt['__step1_array'][i]['__in_step1']))
            for i in range(n_phenotypes)],
        __step2_idx_array=[
            hl.struct(
                __col_idx=mt['__step2_array'][i]['__col_idx'],
                __step2_idx=hl.scan.count_where(
                    mt['__step2_array'][i]['__in_step2']))
            for i in range(n_phenotypes)])

    mt = mt.annotate_entries(
        __step1_idx=hl.filter(lambda x: x['__col_idx'] == mt['__col_idx'],
                              mt['__step1_idx_array'])[0]['__step1_idx'],
        __step2_idx=hl.filter(lambda x: x['__col_idx'] == mt['__col_idx'],
                              mt['__step2_idx_array'])[0]['__step2_idx'])

    mt = mt.annotate_cols(__step2_maplist=hl.sorted(
        hl.agg.filter(mt['__in_step1'], hl.agg.collect(mt['__step2_idx']))))

    mt_tmp_file = new_temp_file()
    mt.write(mt_tmp_file)
    mt = hl.read_matrix_table(mt_tmp_file)

    step1_dict = {x['__col_idx']: x['__m_step1'] for x in mt.cols().collect()}
    step2_dict = {x['__col_idx']: (x['__m_step2'],
                                   x['__step2_maplist'])
                  for x in mt.cols().collect()}

    step1_separators = {}
    for k, v in step1_dict.items():
        s = np.floor(np.linspace(0, v, n_blocks + 1))
        step1_separators[k] = [int(x) for x in s]

    step2_separators = {}
    for k, v in step2_dict.items():
        s = [0]
        s.extend([v[1][x] for x in step1_separators[k][1:-1]])
        s.append(v[0])
        step2_separators[k] = [int(x) for x in s]

    mt = mt.annotate_cols(
        __step1_separators=hl.literal(step1_separators)[mt['__col_idx']],
        __step2_separators=hl.literal(step2_separators)[mt['__col_idx']])

    # assign each variant to a jackknife block for steps 1 and 2
    mt = mt.annotate_entries(
        __step1_block=hl.sum(hl.map(lambda x: mt['__step1_idx'] >= x,
                                    mt['__step1_separators'])) - 1,
        __step2_block=hl.sum(hl.map(lambda x: mt['__step2_idx'] >= x,
                                    mt['__step2_separators'])) - 1)

    # initial coefficient estimates
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt['__y']) - 1.0) / hl.agg.mean(mt['__x'])])
    mt = mt.annotate_cols(__step1_betas=mt['__initial_betas'],
                          __step2_betas=mt['__initial_betas'])

    # step 1 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt['__in_step1'],
            1.0/(mt['__w_initial_floor'] * 2.0 * (mt['__step1_betas'][0] +
                                                  mt['__step1_betas'][1] *
                                                  mt['__x_floor'])**2),
            0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt['__in_step1'],
            hl.agg.linreg(y=mt['__y'],
                          x=[1.0, mt['__x']],
                          weight=mt['__w']).beta))
        mt = mt.annotate_cols(__step1_h2=hl.max(hl.min(
            mt['__step1_betas'][1] * M / hl.agg.mean(mt['__n']), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt['__step1_betas'][0],
            mt['__step1_h2'] * hl.agg.mean(mt['__n']) / M])

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=[
        hl.agg.filter((mt['__step1_block'] != i) & mt['__in_step1'],
                      hl.agg.linreg(y=mt['__y'],
                                    x=[1.0, mt['__x']],
                                    weight=mt['__w']).beta)
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt['__step1_betas'] - (n_blocks - 1) * x,
        mt['__step1_block_betas']))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i],
                       mt['__step1_block_betas_bias_corrected'])),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2,
                       mt['__step1_block_betas_bias_corrected'])) -
                       hl.sum(
                hl.map(lambda x: x[i],
                       mt['__step1_block_betas_bias_corrected']))**2 /
                       n_blocks) /
            (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt['__in_step2'],
            1.0/(mt['__w_initial_floor'] *
                 2.0 * (mt['__step2_betas'][0] +
                        mt['__step2_betas'][1] *
                        mt['__x_floor'])**2),
            0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt['__step1_betas'][0],
            hl.agg.filter(mt['__in_step2'],
                          hl.agg.linreg(y=mt['__y'] - mt['__step1_betas'][0],
                                        x=[mt['__x']],
                                        weight=mt['__w']).beta[0])])
        mt = mt.annotate_cols(__step2_h2=hl.max(hl.min(
            mt['__step2_betas'][1] * M/hl.agg.mean(mt['__n']), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt['__step1_betas'][0],
            mt['__step2_h2'] * hl.agg.mean(mt['__n'])/M])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=[
        hl.agg.filter((mt['__step2_block'] != i) & mt['__in_step2'],
                      hl.agg.linreg(y=mt['__y'] - mt['__step1_betas'][0],
                                    x=[mt['__x']],
                                    weight=mt['__w']).beta[0])
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt['__step2_betas'][1] - (n_blocks - 1) * x,
        mt['__step2_block_betas']))

    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(
            mt['__step2_block_betas_bias_corrected']),
        __step2_jackknife_variance=(
            hl.sum(mt['__step2_block_betas_bias_corrected']**2) -
            hl.sum(mt['__step2_block_betas_bias_corrected'])**2 /
            n_blocks) / (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0/(mt['__w_initial_floor'] *
                               2.0 * (mt['__initial_betas'][0] +
                                      mt['__initial_betas'][1] *
                                      mt['__x_floor'])**2))

    mt = mt.annotate_cols(
        __final_betas=[
            mt['__step1_betas'][0],
            mt['__step2_betas'][1]],
        __c=(hl.agg.sum(mt['__step2_initial_w'] * mt['__x']) /
             hl.agg.sum(mt['__step2_initial_w'] * mt['__x']**2)))

    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt['__step2_block_betas'][i] - mt['__c'] *
                   (mt['__step1_block_betas'][i][0] - mt['__final_betas'][0])),
        hl.range(0, n_blocks)))

    mt = mt.annotate_cols(
        __final_block_betas_bias_corrected=(n_blocks * mt['__final_betas'][1] -
                                            (n_blocks - 1) *
                                            mt['__final_block_betas']))

    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt['__step1_jackknife_mean'][0],
            hl.mean(mt['__final_block_betas_bias_corrected'])],
        __final_jackknife_variance=[
            mt['__step1_jackknife_variance'][0],
            (hl.sum(mt['__final_block_betas_bias_corrected']**2) -
             hl.sum(mt['__final_block_betas_bias_corrected'])**2 /
             n_blocks) / (n_blocks - 1) / n_blocks])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt['__y_name'],
        mean_chi_sq=hl.agg.mean(mt['__y']),
        intercept=hl.struct(
            estimate=mt['__final_betas'][0],
            standard_error=hl.sqrt(mt['__final_jackknife_variance'][0])),
        snp_heritability=hl.struct(
            estimate=(M/hl.agg.mean(mt['__n'])) * mt['__final_betas'][1],
            standard_error=hl.sqrt((M/hl.agg.mean(mt['__n']))**2 *
                                   mt['__final_jackknife_variance'][1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht['phenotype'])
    ht = ht.select(ht['mean_chi_sq'],
                   ht['intercept'],
                   ht['snp_heritability'])

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)

    return ht
Example #18
def main(args):
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable with adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds,
                           part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info(
            'Writing sample QC-filtered MT with rare variants (internal MAF 0.01) to disk...'
        )
        # MatrixTable.write returns None, so don't rebind mt here
        mt.write(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                 overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:

        logger.info('Applying per variant QC filtering...')

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')
        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info(
            'Writing variant QC-filtered MT with rare variants (internal MAF 0.01) to disk...'
        )
        # MatrixTable.write returns None, so don't rebind mt here
        mt.write(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                 overwrite=True)

    # 3. Annotate AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()

        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]

        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))
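
        # NOTE: af_filter_expr is a project helper defined elsewhere; a
        # plausible sketch (an assumption, not the original) would keep
        # variants whose AF annotation is missing or at most the cutoff,
        # e.g. hl.or_else(mt[af_field] <= af_cutoff, True).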

        logger.info(
            f'Writing sample/variant QC-filtered MT with rare variants (MAF <= {maf_cutoff}) to disk...'
        )
        # MatrixTable.write returns None, so don't rebind mt here
        mt.write(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                 overwrite=True)

    # 4. ##### Run gene-set burden logistic regression ######

    logger.info('Running gene-set burden logistic regression test...')

    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()

    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variants into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update the annotation dict with combinations of variant consequence categories
    score_expr_ann.update({
        'missC': (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                          (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                          (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })
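
    # MVP_THRESHOLD, REVEL_THRESHOLD and CADD_THRESHOLD are module-level
    # constants not shown in this example; 'missC' flags missense variants
    # called damaging by at least two of the three scores.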

    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })

    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group, converting the dict to a set of the groups that
    # apply to each variant (easier to explode and group on later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group and gene clusters before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # First-step aggregation:
    # Generate a sample-by-gene/variant-type (binary) matrix, aggregating genotypes as follows:
    #
    #   a) entry: hets
    #   b) entry: homs
    #   c) entry: chets (compound hets)

    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(
            mt.GT.is_het()) >= 2).repartition(100).persist())
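
    # NOTE: 'chets' is a per-sample proxy for compound heterozygosity (two or
    # more het calls in the same gene); genotypes are not phased here, so
    # cis/trans configuration cannot be distinguished.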

    # Import/generate gene clusters
    clusters = hl.import_table(args.set_file,
                               no_header=True,
                               delimiter="\t",
                               min_partitions=50,
                               impute=False)
    clusters = generate_clusters_map(clusters)
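
    # generate_clusters_map is a project helper not shown here; presumably it
    # keys the table by gene symbol and collects gene-set names per gene into
    # the 'cluster_id' field that is exploded below.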

    # Annotate gene-set info
    mt_grouped = (mt_grouped.annotate_rows(**clusters[mt_grouped.SYMBOL]))

    # Explode nested csq_group before grouping
    mt_grouped = (mt_grouped.explode_rows(mt_grouped.cluster_id))

    # filter rows with defined consequence and gene-set name
    mt_grouped = (mt_grouped.filter_rows(
        hl.is_defined(mt_grouped.csq_group)
        & hl.is_defined(mt_grouped.cluster_id)))

    # Second-step aggregation:
    # Generate a sample-by-gene-set/variant-type matrix, aggregating genotypes as follows:
    # if dominant -> sum hets (default)
    # if recessive -> sum (homs)
    # if recessive (a) -> sum (chets)
    # if recessive (b) -> sum (chets and/or homs)

    mts = []

    if args.homs:
        # Group mt by gene-sets/csq_group aggregating homs genotypes.
        mt_homs = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.homs))).repartition(
                    100).persist().annotate_rows(agg_genotype='homs'))

        mts.append(mt_homs)

    if args.chets:
        # Group mt by gene-sets/csq_group aggregating compound hets (chets) genotypes.
        mt_chets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.chets))).repartition(
                    100).persist().annotate_rows(agg_genotype='chets'))

        mts.append(mt_chets)

    if args.homs_chets:
        # Group mt by gene-sets/csq_group aggregating chets and/or homs genotypes.
        mt_homs_chets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(mac=hl.int(
                hl.agg.count_where(mt_grouped.chets
                                   | mt_grouped.homs))).repartition(100).
                         persist().annotate_rows(agg_genotype='homs_chets'))

        mts.append(mt_homs_chets)

    if args.hets:
        # Group mt by gene-sets/csq_group aggregating hets genotypes (default)
        mt_hets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.hets))).repartition(
                    100).persist().annotate_rows(agg_genotype='hets'))

        mts.append(mt_hets)

    ## Join the per-model MatrixTables
    mt_joint = hl.MatrixTable.union_rows(*mts)

    ## Add sample annotations
    # annotate sample covs
    covariates = hl.read_table(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.sample_covariates.ht')
    mt_joint = (mt_joint.annotate_cols(**covariates[mt_joint.s]))

    # annotate case/control phenotype info
    tb_sample = get_sample_meta_data()
    mt_joint = (mt_joint.annotate_cols(**tb_sample[mt_joint.s]))

    mt_joint = (mt_joint.filter_cols(mt_joint['phe.is_case']
                                     | mt_joint['phe.is_control']))

    ## Run logistic regression stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']

    tbs = []

    covs = ['sex', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']

    for proband in analysis:
        logger.info(f'Running burden test for {proband}...')

        if proband == 'all_cases':
            mt_tmp = mt_joint
        elif proband == 'syndromic':
            mt_tmp = mt_joint.filter_cols(~mt_joint['phe.is_nonsyndromic'])
        else:  # proband == 'nonsyndromic'
            mt_tmp = mt_joint.filter_cols(~mt_joint['phe.is_syndromic'])

        tb_logreg = logistic_regression(mt=mt_tmp,
                                        x_expr='mac',
                                        response='phe.is_case',
                                        covs=covs,
                                        pass_through=['agg_genotype'],
                                        extra_fields={
                                            'analysis': proband,
                                            'maf': maf_cutoff,
                                            'covs': '|'.join(covs)
                                        })
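
        # logistic_regression is a project helper not shown in this example;
        # presumably it wraps hl.logistic_regression_rows with the entry field
        # 'mac' as x, 'phe.is_case' as the response and the listed covariates,
        # emitting one result row per gene-set/consequence group.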

        tbs.append(tb_logreg)

    tb_final = hl.Table.union(*tbs)

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.logreg_burden.{run_hash}.ht'

    tb_final = (tb_final.checkpoint(output=output_path))

    if args.write_to_file:
        # write table to disk as TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()
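
A minimal sketch of the CLI wiring this main() expects (argument names are inferred from their uses above; types and defaults are assumptions):

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--exome_cohort', required=True)
    parser.add_argument('--set_file', required=True,
                        help='TSV mapping genes to gene sets')
    parser.add_argument('--output_dir', required=True)
    parser.add_argument('--default_ref_genome', default='GRCh38')
    parser.add_argument('--af_max_threshold', type=float, default=0.01)
    # pipeline stage toggles
    parser.add_argument('--run_test_mode', action='store_true')
    parser.add_argument('--skip_sample_qc_filtering', action='store_true')
    parser.add_argument('--skip_variant_qc_filtering', action='store_true')
    parser.add_argument('--skip_af_filtering', action='store_true')
    parser.add_argument('--filter_biallelic', action='store_true')
    parser.add_argument('--filter_protein_domain', action='store_true')
    # genotype aggregation models
    parser.add_argument('--hets', action='store_true')
    parser.add_argument('--homs', action='store_true')
    parser.add_argument('--chets', action='store_true')
    parser.add_argument('--homs_chets', action='store_true')
    parser.add_argument('--write_to_file', action='store_true')

    main(parser.parse_args())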