Example #1
def main(args):
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.annotate_entries(
        gvcf_info=mt.gvcf_info.drop('ClippingRankSum', 'ReadPosRankSum'))
    mt = mt.annotate_rows(
        n_unsplit_alleles=hl.len(mt.alleles),
        mixed_site=(hl.len(mt.alleles) > 2)
        & hl.any(lambda a: hl.is_indel(mt.alleles[0], a), mt.alleles[1:])
        & hl.any(lambda a: hl.is_snp(mt.alleles[0], a), mt.alleles[1:]))
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    mt.write(args.split_mt_location, overwrite=args.overwrite)
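A minimal command-line driver for this script could look like the following sketch; the flag names are hypothetical, chosen only to match the attributes main() reads (split_mt_location, overwrite).

import argparse

if __name__ == '__main__':
    # Hypothetical argparse wiring; only the attributes main() reads are defined.
    parser = argparse.ArgumentParser()
    parser.add_argument('--split-mt-location', dest='split_mt_location', required=True)
    parser.add_argument('--overwrite', action='store_true')
    main(parser.parse_args())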
Example #2
def get_gene_intervals(gene_symbols=None, gene_ids=None, transcript_ids=None,
                       verbose=True, reference_genome=None, gtf_file=None):
    """Get intervals of genes or transcripts.

    Get the boundaries of genes or transcripts from a GTF file, for quick filtering of a Table or MatrixTable.

    On Google Cloud platform:
    Gencode v19 (GRCh37) GTF available at: gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz
    Gencode v29 (GRCh38) GTF available at: gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz

    Example
    -------
    >>> hl.filter_intervals(ht, get_gene_intervals(gene_symbols=['PCSK9'], reference_genome='GRCh37'))  # doctest: +SKIP

    Parameters
    ----------

    gene_symbols : :obj:`list` of :obj:`str`, optional
       Gene symbols (e.g. PCSK9).
    gene_ids : :obj:`list` of :obj:`str`, optional
       Gene IDs (e.g. ENSG00000223972).
    transcript_ids : :obj:`list` of :obj:`str`, optional
       Transcript IDs (e.g. ENST00000456328).
    verbose : :obj:`bool`
       If ``True``, print which genes and transcripts were matched in the GTF file.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
       Reference genome to use (passed along to import_gtf).
    gtf_file : :obj:`str`
       GTF file to load. If none is provided, but `reference_genome` is one of
       `GRCh37` or `GRCh38`, a default will be used (on Google Cloud Platform).

    Returns
    -------
    :obj:`list` of :class:`.Interval`
    """
    if gene_symbols is None and gene_ids is None and transcript_ids is None:
        raise ValueError('get_gene_intervals requires at least one of gene_symbols, gene_ids, or transcript_ids')
    ht = _load_gencode_gtf(gtf_file, reference_genome)
    criteria = []
    if gene_symbols:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_name == y), gene_symbols))
    if gene_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_id == y.split('\\.')[0]), gene_ids))
    if transcript_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'transcript') & (ht.transcript_id == y.split('\\.')[0]), transcript_ids))

    ht = ht.filter(functools.reduce(operator.ior, criteria))
    gene_info = ht.aggregate(hl.agg.collect((ht.feature, ht.gene_name, ht.gene_id, ht.transcript_id, ht.interval)))
    if verbose:
        info(f'get_gene_intervals found {len(gene_info)} entries:\n'
             + "\n".join(map(lambda x: f'{x[0]}: {x[1]} ({x[2] if x[0] == "gene" else x[3]})', gene_info)))
    intervals = list(map(lambda x: x[-1], gene_info))
    return intervals
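Beyond the docstring example, a typical end-to-end use is to compute the intervals once and filter a whole dataset down to a gene; the dataset path below is a hypothetical placeholder.

import hail as hl

mt = hl.read_matrix_table('gs://my-bucket/dataset.mt')  # hypothetical path
intervals = get_gene_intervals(gene_symbols=['PCSK9'], reference_genome='GRCh37')
mt = hl.filter_intervals(mt, intervals)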
Example #3
def get_gnomad_v3_mt(
    split=False,
    key_by_locus_and_alleles: bool = False,
    remove_hard_filtered_samples: bool = True,
    release_only: bool = False,
    samples_meta: bool = False,
) -> hl.MatrixTable:
    """
    Wrapper function to get gnomAD data with desired filtering and metadata annotations

    :param split: Perform split on MT - Note: this will perform a split on the MT rather than grab an already split MT
    :param key_by_locus_and_alleles: Whether to key the MatrixTable by locus and alleles (only needed for v3)
    :param remove_hard_filtered_samples: Whether to remove samples that failed hard filters (only relevant after sample QC)
    :param release_only: Whether to filter the MT to only samples available for release (can only be used if metadata is present)
    :param samples_meta: Whether to add metadata to MT in 'meta' column
    :return: gnomAD v3 dataset with chosen annotations and filters
    """
    mt = gnomad_v3_genotypes.mt()
    if key_by_locus_and_alleles:
        mt = hl.MatrixTable(
            hl.ir.MatrixKeyRowsBy(
                mt._mir, ["locus", "alleles"], is_sorted=True
            )  # Prevents hail from running sort on genotype MT which is already sorted by a unique locus
        )

    if remove_hard_filtered_samples:
        mt = mt.filter_cols(
            hl.is_missing(hard_filtered_samples.ht()[mt.col_key]))

    if samples_meta:
        mt = mt.annotate_cols(meta=meta.ht()[mt.col_key])

        if release_only:
            mt = mt.filter_cols(mt.meta.release)

    elif release_only:
        mt = mt.filter_cols(meta.ht()[mt.col_key].release)

    if split:
        mt = mt.annotate_rows(
            n_unsplit_alleles=hl.len(mt.alleles),
            mixed_site=(hl.len(mt.alleles) > 2)
            & hl.any(lambda a: hl.is_indel(mt.alleles[0], a), mt.alleles[1:])
            & hl.any(lambda a: hl.is_snp(mt.alleles[0], a), mt.alleles[1:]),
        )
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    return mt
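As a sketch of how the flags compose, the call below loads release samples only with the 'meta' column attached, re-keyed by locus and alleles so sparse_split_multi can be applied downstream (compare Example #1).

mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True, release_only=True, samples_meta=True)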
Example #4
def generate_allele_data(mt: hl.MatrixTable) -> hl.Table:
    """
    Returns bi-allelic sites Table with the following annotations:
     - allele_data (nonsplit_alleles, has_star, variant_type, and n_alt_alleles)

    :param MatrixTable mt: Full unsplit MT
    :return: Table with allele data annotations
    :rtype: Table
    """
    ht = mt.rows().select()
    allele_data = hl.struct(nonsplit_alleles=ht.alleles,
                            has_star=hl.any(lambda a: a == '*', ht.alleles))
    ht = ht.annotate(allele_data=allele_data.annotate(
        **add_variant_type(ht.alleles)))

    ht = hl.split_multi_hts(ht)
    allele_type = (hl.case().when(
        hl.is_snp(ht.alleles[0], ht.alleles[1]),
        'snv').when(hl.is_insertion(ht.alleles[0], ht.alleles[1]),
                    'ins').when(hl.is_deletion(ht.alleles[0], ht.alleles[1]),
                                'del').default('complex'))
    ht = ht.annotate(allele_data=ht.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=ht.allele_data.variant_type == 'mixed'))
    return ht
Example #5
def get_filtered_mt(chrom: str = 'all',
                    pop: str = 'all',
                    imputed: bool = True,
                    min_mac: int = 20,
                    entry_fields=('GP', )):
    if imputed:
        ht = hl.read_table(get_ukb_af_ht_path())
        if pop == 'all':
            ht = ht.filter(
                hl.any(lambda x: ht.af[x] * ht.an[x] >= min_mac,
                       hl.literal(POPS)))
        else:
            ht = ht.filter(ht.af[pop] * ht.an[pop] >= min_mac)
        mt = get_ukb_imputed_data(chrom,
                                  variant_list=ht,
                                  entry_fields=entry_fields)
    else:
        mt = hl.read_matrix_table('gs://ukb31063/ukb31063.genotype.mt')

    covariates_ht = get_covariates()
    hq_samples_ht = get_hq_samples()
    # TODO: confirm that this is correct set
    mt = mt.annotate_cols(**covariates_ht[mt.s])
    mt = mt.filter_cols(
        hl.is_defined(mt.pop) & hl.is_defined(hq_samples_ht[mt.s]))

    if pop != 'all': mt = mt.filter_cols(mt.pop == pop)
    return mt
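For illustration, a call restricted to one ancestry group might look like this; 'EUR' is a hypothetical entry of POPS.

mt = get_filtered_mt(chrom='20', pop='EUR', imputed=True, min_mac=20, entry_fields=('GP',))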
Example #6
def generate_allele_data(ht: hl.Table) -> hl.Table:
    """
    Returns bi-allelic sites HT with the following annotations:
     - allele_data (nonsplit_alleles, has_star, variant_type, and n_alt_alleles)

    :param Table ht: Full unsplit HT
    :return: Table with allele data annotations
    :rtype: Table
    """
    ht = ht.select()
    allele_data = hl.struct(nonsplit_alleles=ht.alleles,
                            has_star=hl.any(lambda a: a == "*", ht.alleles))
    ht = ht.annotate(allele_data=allele_data.annotate(
        **add_variant_type(ht.alleles)))

    ht = hl.split_multi_hts(ht)
    ht = ht.filter(hl.len(ht.alleles) > 1)
    allele_type = (hl.case().when(
        hl.is_snp(ht.alleles[0], ht.alleles[1]),
        "snv").when(hl.is_insertion(ht.alleles[0], ht.alleles[1]),
                    "ins").when(hl.is_deletion(ht.alleles[0], ht.alleles[1]),
                                "del").default("complex"))
    ht = ht.annotate(allele_data=ht.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=ht.allele_data.variant_type == "mixed"))
    return ht
Example #7
def conditional_phenotypes(mt: hl.MatrixTable,
                           column_field,
                           entry_field,
                           lists_of_columns,
                           new_col_name='grouping',
                           new_entry_name='new_entry'):
    """
    Create a conditional phenotype by setting phenotype1 to missing for any individual without phenotype2.

    Pheno1 Pheno2 new_pheno
    T      T      T
    T      F      NA
    F      F      NA
    F      T      F

    `lists_of_columns` should be a list of lists (of length 2 for the inner list).
    The first element corresponds to the phenotype to maintain, except for setting to missing when the
    phenotype coded by the second element is False.

    new_entry = Pheno1 conditioned on having Pheno2

    Example:

    mt = hl.balding_nichols_model(1, 3, 10).drop('GT')
    mt = mt.annotate_entries(pheno=hl.rand_bool(0.5))
    lists_of_columns = [[0, 1], [2, 1]]
    entry_field = mt.pheno
    column_field = mt.sample_idx

    :param MatrixTable mt: Input MatrixTable
    :param Expression column_field: Column-indexed Expression to group by
    :param Expression entry_field: Entry-indexed Expression to which to apply `grouping_function`
    :param list of list lists_of_columns: Entry in this list should be the same type as `column_field`
    :param str new_col_name: Name for new column key (default 'grouping')
    :param str new_entry_name: Name for new entry expression (default 'new_entry')
    :return: Re-grouped MatrixTable
    :rtype: MatrixTable
    """
    assert all([len(x) == 2 for x in lists_of_columns])
    lists_of_columns = hl.literal(lists_of_columns)
    mt = mt._annotate_all(col_exprs={'_col_expr': column_field},
                          entry_exprs={'_entry_expr': entry_field})
    mt = mt.annotate_cols(
        _col_expr=lists_of_columns.filter(lambda x: x.contains(
            mt._col_expr)).map(lambda y: (y, y[0] == mt._col_expr)))
    mt = mt.explode_cols('_col_expr')
    # if this column is the conditioning phenotype (~mt._col_expr[1]) and its entry is False
    # (~mt._entry_expr), return missing; otherwise keep the actual entry value
    bool_array = hl.agg.collect(
        hl.if_else(~mt._col_expr[1] & ~mt._entry_expr, hl.null(hl.tbool),
                   mt._entry_expr))
    # if any element is missing, return missing; otherwise return the conjunction,
    # which equals Pheno1 since Pheno2 must be True in the non-missing case
    return mt.group_cols_by(**{
        new_col_name: mt._col_expr[0]
    }).aggregate(
        **{
            new_entry_name:
            hl.if_else(hl.any(lambda x: hl.is_missing(x), bool_array),
                       hl.null(hl.tbool), bool_array[0] & bool_array[1])
        })
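The docstring example can be run end to end as below; sample_idx columns 0 and 2 are each conditioned on column 1.

import hail as hl

mt = hl.balding_nichols_model(1, 3, 10).drop('GT')
mt = mt.annotate_entries(pheno=hl.rand_bool(0.5))
result = conditional_phenotypes(mt, mt.sample_idx, mt.pheno, [[0, 1], [2, 1]])
result.entries().show()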
Example #8
def generate_split_alleles(mt: hl.MatrixTable) -> hl.MatrixTable:
    """Annotate rows with allele data, then split multi-allelic sites."""

    allele_data = hl.struct(nonsplit_alleles=mt.alleles,
                            has_star=hl.any(lambda a: a == '*', mt.alleles))

    mt = mt.annotate_rows(allele_data=allele_data.annotate(
        **add_variant_type(mt.alleles)))
    mt = hl.split_multi_hts(mt, left_aligned=True)

    allele_type = (hl.case().when(
        hl.is_snp(mt.alleles[0], mt.alleles[1]),
        'snv').when(hl.is_insertion(mt.alleles[0], mt.alleles[1]),
                    'ins').when(hl.is_deletion(mt.alleles[0], mt.alleles[1]),
                                'del').default('complex'))
    mt = mt.annotate_rows(allele_data=mt.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=mt.allele_data.variant_type == 'mixed'))
    return mt
Example #9
def get_filtered_mt(pop: str = 'all',
                    imputed: bool = True,
                    chrom: str = 'all',
                    min_mac: int = 20):
    if imputed:
        ht = hl.read_table(ukb_af_ht_path)
        if pop == 'all':
            ht = ht.filter(
                hl.any(lambda x: ht.af[x] * ht.an[x] >= min_mac,
                       hl.literal(POPS)))
        else:
            ht = ht.filter(ht.af[pop] * ht.an[pop] >= min_mac)
        mt = get_ukb_imputed_data(chrom, variant_list=ht)
    else:
        mt = hl.read_matrix_table('gs://ukb31063/ukb31063.genotype.mt')
    meta_ht = get_ukb_meta()
    mt = mt.annotate_cols(**meta_ht.key_by(s=hl.str(meta_ht.s))[mt.s])

    if pop != 'all': mt = mt.filter_cols(mt.pop == pop)
    return mt
Example #10
def all_and_leave_one_out(x,
                          pop_array,
                          all_f=hl.sum,
                          loo_f=lambda i, x: hl.sum(x) - hl.or_else(x[i], 0)):
    """
    Applies a function to an input array for all populations, and for each of leave-one-out populations.

    :param x: Input array
    :param pop_array: Population array
    :param all_f: Function for all populations. It takes the input array and returns a new value
    :param loo_f: Function for each of leave-one-out populations. It takes an index of leave-one-out
                  population and the input array, and returns an array of new values.
    ...
    :return: Array of new values for all populations and for each of leave-one-out populations.
    :rtype: ArrayExpression
    """
    arr = hl.array([all_f(x)])
    arr = arr.extend(hl.map(lambda i: loo_f(i, x),
                            hl.range(hl.len(pop_array))))
    return hl.or_missing(hl.any(hl.is_defined, x), arr)
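A small worked example, assuming a Hail context: for per-population counts [10, 20, 30], the result is the overall sum followed by each leave-one-out sum.

import hail as hl

x = hl.literal([10, 20, 30])
pops = hl.literal(['afr', 'eas', 'nfe'])  # hypothetical population labels
hl.eval(all_and_leave_one_out(x, pops))   # [60, 50, 40, 30]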
Example #11
def get_r_within_gene(
    bm: BlockMatrix,
    ld_index: hl.Table,
    gene: str,
    vep_ht: hl.Table = None,
    reference_genome: str = None,
):
    """
    Get LD information (`r`) for all pairs of variants within `gene`.

    Warning: this returns a table quadratic in number of variants. Exercise caution with large genes.

    :param bm: Input Block Matrix
    :param ld_index: Corresponding index table
    :param gene: Gene symbol as string
    :param vep_ht: Table with VEP annotations (if None, gets from get_gnomad_public_data())
    :param reference_genome: Reference genome to pass to get_gene_intervals for fast filtering to gene
    :return: Table with pairs of variants
    """
    if vep_ht is None:
        vep_ht = public_release("exomes").ht()
    if reference_genome is None:
        reference_genome = hl.default_reference().name
    intervals = hl.experimental.get_gene_intervals(
        gene_symbols=[gene], reference_genome=reference_genome)
    ld_index = hl.filter_intervals(ld_index, intervals)
    ld_index = ld_index.annotate(vep=vep_ht[ld_index.key].vep)
    ld_index = ld_index.filter(
        hl.any(lambda tc: tc.gene_symbol == gene,
               ld_index.vep.transcript_consequences))

    indices_to_keep = ld_index.idx.collect()
    filt_bm = bm.filter(indices_to_keep, indices_to_keep)
    ht = filt_bm.entries()
    ld_index = ld_index.add_index("new_idx").key_by("new_idx")
    return ht.transmute(r=ht.entry,
                        i_variant=ld_index[ht.i],
                        j_variant=ld_index[ht.j])
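Usage is a sketch along these lines; the BlockMatrix and index-table paths are hypothetical placeholders.

import hail as hl
from hail.linalg import BlockMatrix

bm = BlockMatrix.read('gs://my-bucket/ld.bm')           # hypothetical path
ld_index = hl.read_table('gs://my-bucket/ld.index.ht')  # hypothetical path
r_ht = get_r_within_gene(bm, ld_index, 'PCSK9', reference_genome='GRCh37')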
Example #12
def get_filtered_mt(chrom: str = 'all',
                    pop: str = 'all',
                    imputed: bool = True,
                    min_mac: int = 20,
                    entry_fields=('GP', ),
                    filter_mac_instead_of_ac: bool = False):

    # get ac or mac based on filter_mac_instead_of_ac
    def get_ac(af, an):
        if filter_mac_instead_of_ac:
            # Note that the underlying file behind get_ukb_af_ht_path() accidentally doubles af and halves an
            return (1.0 - hl.abs(1.0 - af)) * an
        else:
            return af * an

    if imputed:
        ht = hl.read_table(get_ukb_af_ht_path())
        if pop == 'all':
            ht = ht.filter(
                hl.any(lambda x: get_ac(ht.af[x], ht.an[x]) >= min_mac,
                       hl.literal(POPS)))
        else:
            ht = ht.filter(get_ac(ht.af[pop], ht.an[pop]) >= min_mac)
        mt = get_ukb_imputed_data(chrom,
                                  variant_list=ht,
                                  entry_fields=entry_fields)
    else:
        mt = hl.read_matrix_table('gs://ukb31063/ukb31063.genotype.mt')

    covariates_ht = get_covariates()
    hq_samples_ht = get_hq_samples()
    mt = mt.annotate_cols(**covariates_ht[mt.s])
    mt = mt.filter_cols(
        hl.is_defined(mt.pop) & hl.is_defined(hq_samples_ht[mt.s]))

    if pop != 'all': mt = mt.filter_cols(mt.pop == pop)
    return mt
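To see why the conversion in get_ac works, take a true AF of 0.9 and AN of 1000: the file stores af = 1.8 and an = 500, and 1 - |1 - af| recovers the minor allele frequency of 0.2, so the product is the minor allele count.

# Worked check of the MAC conversion under the doubled-af/halved-an quirk:
af, an = 1.8, 500                 # stored values for true AF 0.9, AN 1000
mac = (1.0 - abs(1.0 - af)) * an  # (1 - 0.8) * 500 = 100.0 minor alleles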
Example #13
def process_consequences(mt: hl.MatrixTable,
                         vep_root: str = 'vep',
                         penalize_flags: bool = True) -> hl.MatrixTable:
    """
    Adds most_severe_consequence (worst consequence for a transcript) into [vep_root].transcript_consequences,
    and adds worst_csq_by_gene and any_lof into [vep_root].

    :param MatrixTable mt: Input MT
    :param str vep_root: Root for vep annotation (probably vep)
    :param bool penalize_flags: Whether to penalize LOFTEE flagged variants, or treat them as equal to HC
    :return: MT with better formatted consequences
    :rtype: MatrixTable
    """
    csqs = hl.literal(CSQ_ORDER)
    csq_dict = hl.literal(dict(zip(CSQ_ORDER, range(len(CSQ_ORDER)))))

    def add_most_severe_consequence(
            tc: hl.expr.StructExpression) -> hl.expr.StructExpression:
        """
        Add most_severe_consequence annotation to transcript consequences
        This is for a given transcript, as there are often multiple annotations for a single transcript:
        e.g. splice_region_variant&intron_variant -> splice_region_variant
        """
        return tc.annotate(most_severe_consequence=csqs.find(
            lambda c: tc.consequence_terms.contains(c)))

    def find_worst_transcript_consequence(
            tcl: hl.expr.ArrayExpression) -> hl.expr.StructExpression:
        """
        Gets the worst transcript consequence from an array of them
        """
        flag_score = 500
        no_flag_score = flag_score * (1 + penalize_flags)

        def csq_score(tc):
            return csq_dict[csqs.find(
                lambda x: x == tc.most_severe_consequence)]

        tcl = tcl.map(lambda tc: tc.annotate(
            csq_score=hl.case(missing_false=True).
            when((tc.lof == 'HC') & (tc.lof_flags == ''),
                 csq_score(tc) - no_flag_score).when(
                     (tc.lof == 'HC') & (tc.lof_flags != ''),
                     csq_score(tc) - flag_score).when(tc.lof == 'LC',
                                                      csq_score(tc) - 10).
            when(tc.polyphen_prediction == 'probably_damaging',
                 csq_score(tc) - 0.5).when(
                     tc.polyphen_prediction == 'possibly_damaging',
                     csq_score(tc) - 0.25).when(
                         tc.polyphen_prediction == 'benign',
                         csq_score(tc) - 0.1).default(csq_score(tc))))
        return hl.or_missing(
            hl.len(tcl) > 0,
            hl.sorted(tcl, lambda x: x.csq_score)[0])

    transcript_csqs = mt[vep_root].transcript_consequences.map(
        add_most_severe_consequence)

    gene_dict = transcript_csqs.group_by(lambda tc: tc.gene_symbol)
    worst_csq_gene = gene_dict.map_values(find_worst_transcript_consequence)
    sorted_scores = hl.sorted(worst_csq_gene.values(),
                              key=lambda tc: tc.csq_score)
    lowest_score = hl.or_missing(
        hl.len(sorted_scores) > 0, sorted_scores[0].csq_score)
    gene_with_worst_csq = sorted_scores.filter(
        lambda tc: tc.csq_score == lowest_score).map(lambda tc: tc.gene_symbol)
    ensg_with_worst_csq = sorted_scores.filter(
        lambda tc: tc.csq_score == lowest_score).map(lambda tc: tc.gene_id)

    vep_data = mt[vep_root].annotate(
        transcript_consequences=transcript_csqs,
        worst_csq_by_gene=worst_csq_gene,
        any_lof=hl.any(lambda x: x.lof == 'HC', worst_csq_gene.values()),
        gene_with_most_severe_csq=gene_with_worst_csq,
        ensg_with_most_severe_csq=ensg_with_worst_csq)

    return mt.annotate_rows(**{vep_root: vep_data})
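A typical follow-on is to filter on the annotations this adds, e.g. keeping variants that are high-confidence LoF in some gene:

mt = process_consequences(mt, vep_root='vep')
# any_lof is True when any gene's worst transcript consequence is LOFTEE high-confidence LoF.
lof_mt = mt.filter_rows(mt.vep.any_lof)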
Example #14
def main(args):
    subsets = args.subsets
    hl.init(
        log=f"/generate_frequency_data{'.' + '_'.join(subsets) if subsets else ''}.log",
        default_reference="GRCh38",
    )

    invalid_subsets = []
    n_subsets_use_subpops = 0
    for s in subsets:
        if s not in SUBSETS:
            invalid_subsets.append(s)
        if s in COHORTS_WITH_POP_STORED_AS_SUBPOP:
            n_subsets_use_subpops += 1

    if invalid_subsets:
        raise ValueError(
            f"{', '.join(invalid_subsets)} subset(s) are not one of the following official subsets: {SUBSETS}"
        )
    if n_subsets_use_subpops and (n_subsets_use_subpops != len(subsets)):
        raise ValueError(
            f"All or none of the supplied subset(s) should be in the list of cohorts that need to use subpops instead "
            f"of pops in frequency calculations: {COHORTS_WITH_POP_STORED_AS_SUBPOP}"
        )

    try:
        logger.info("Reading full sparse MT and metadata table...")
        mt = get_gnomad_v3_mt(
            key_by_locus_and_alleles=True,
            release_only=not args.include_non_release,
            samples_meta=True,
        )

        if args.test:
            logger.info("Filtering to two partitions on chr20")
            mt = hl.filter_intervals(
                mt, [hl.parse_locus_interval("chr20:1-1000000")])
            mt = mt._filter_partitions(range(2))

        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

        if args.include_non_release:
            logger.info("Filtering MT columns to high quality samples")
            total_sample_count = mt.count_cols()
            mt = mt.filter_cols(mt.meta.high_quality)
            high_quality_sample_count = mt.count_cols()
            logger.info(
                f"Filtered {total_sample_count - high_quality_sample_count} from the full set of {total_sample_count} "
                f"samples...")

        if subsets:
            mt = mt.filter_cols(hl.any([mt.meta.subsets[s] for s in subsets]))
            logger.info(
                f"Running frequency generation pipeline on {mt.count_cols()} samples in {', '.join(subsets)} subset(s)..."
            )
        else:
            logger.info(
                f"Running frequency generation pipeline on {mt.count_cols()} samples..."
            )

        logger.info("Computing adj and sex adjusted genotypes...")
        mt = mt.annotate_entries(
            GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT,
                                        mt.meta.sex_imputation.sex_karyotype),
            adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD),
        )

        logger.info("Densify-ing...")
        mt = hl.experimental.densify(mt)
        mt = mt.filter_rows(hl.len(mt.alleles) > 1)

        # Temporary hotfix for depletion of homozygous alternate genotypes
        logger.info(
            "Setting het genotypes at sites with >1% AF (using v3.0 frequencies) and > 0.9 AB to homalt..."
        )
        # Load v3.0 allele frequencies to avoid an extra frequency calculation
        # NOTE: Using previous callset AF works for small incremental changes to a callset, but we will need to revisit for large increments
        freq_ht = get_freq(version="3").ht()
        freq_ht = freq_ht.select(AF=freq_ht.freq[0].AF)

        mt = mt.annotate_entries(GT=hl.if_else(
            (freq_ht[mt.row_key].AF > 0.01)
            & mt.GT.is_het()
            & (mt.AD[1] / mt.DP > 0.9),
            hl.call(1, 1),
            mt.GT,
        ))

        logger.info("Generating frequency data...")
        if subsets:
            mt = annotate_freq(
                mt,
                sex_expr=mt.meta.sex_imputation.sex_karyotype,
                pop_expr=mt.meta.population_inference.pop
                if not n_subsets_use_subpops else
                mt.meta.project_meta.project_subpop,
                # NOTE: TGP and HGDP labeled populations are highly specific and are stored in the project_subpop meta field
            )

            # NOTE: no FAFs or popmax needed for subsets
            mt = mt.select_rows("freq")

            logger.info(
                f"Writing out frequency data for {', '.join(subsets)} subset(s)..."
            )
            if args.test:
                mt.rows().write(
                    get_checkpoint_path(
                        f"chr20_test_freq.{'_'.join(subsets)}"),
                    overwrite=True,
                )
            else:
                mt.rows().write(get_freq(subset="_".join(subsets)).path,
                                overwrite=args.overwrite)

        else:
            logger.info("Computing age histograms for each variant...")
            mt = mt.annotate_cols(age=hl.if_else(
                hl.is_defined(mt.meta.project_meta.age),
                mt.meta.project_meta.age,
                mt.meta.project_meta.age_alt,
                # NOTE: most age data is stored as integers in 'age' annotation, but for a select number of samples, age is stored as a bin range and 'age_alt' corresponds to an integer in the middle of the bin
            ))
            mt = mt.annotate_rows(**age_hists_expr(mt.adj, mt.GT, mt.age))

            # Compute callset-wide age histogram global
            mt = mt.annotate_globals(age_distribution=mt.aggregate_cols(
                hl.agg.hist(mt.age, 30, 80, 10)))

            mt = annotate_freq(
                mt,
                sex_expr=mt.meta.sex_imputation.sex_karyotype,
                pop_expr=mt.meta.population_inference.pop,
                downsamplings=DOWNSAMPLINGS,
            )
            # Remove all loci with raw AC=0
            mt = mt.filter_rows(mt.freq[1].AC > 0)

            logger.info("Calculating InbreedingCoeff...")
            # NOTE: This is not the ideal location to calculate this, but added here to avoid another densify
            mt = mt.annotate_rows(
                InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

            logger.info("Computing filtering allele frequencies and popmax...")
            faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus,
                                     POPS_TO_REMOVE_FOR_POPMAX)
            mt = mt.select_rows(
                "InbreedingCoeff",
                "freq",
                faf=faf,
                popmax=pop_max_expr(mt.freq, mt.freq_meta,
                                    POPS_TO_REMOVE_FOR_POPMAX),
            )
            mt = mt.annotate_globals(
                faf_meta=faf_meta,
                faf_index_dict=make_faf_index_dict(faf_meta))
            mt = mt.annotate_rows(popmax=mt.popmax.annotate(
                faf95=mt.faf[mt.faf_meta.index(
                    lambda x: x.values() == ["adj", mt.popmax.pop])].faf95))

            logger.info("Annotating quality metrics histograms...")
            # NOTE: these are performed here as the quality metrics histograms also require densifying
            mt = mt.annotate_rows(
                qual_hists=qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD, mt.adj))
            ht = mt.rows()
            ht = ht.annotate(
                qual_hists=hl.Struct(
                    **{
                        i.replace("_adj", ""): ht.qual_hists[i]
                        for i in ht.qual_hists if "_adj" in i
                    }),
                raw_qual_hists=hl.Struct(**{
                    i: ht.qual_hists[i]
                    for i in ht.qual_hists if "_adj" not in i
                }),
            )

            logger.info("Writing out frequency data...")
            if args.test:
                ht.write(get_checkpoint_path("chr20_test_freq"),
                         overwrite=True)
            else:
                ht.write(get_freq().path, overwrite=args.overwrite)

    finally:
        logger.info("Copying hail log to logging bucket...")
        hl.copy_log(f"{qc_temp_prefix()}logs/")
Example #15
# Mixing non-empty and empty PL fields causes problems in sample QC for some reason; set the field to all-missing
mt = mt.annotate_entries(PL=hl.missing(mt.PL.dtype))

# Add variant-level annotations necessary for variant QC later
## Annotate variants in one of the categories: SNV, multi-SNV, indel, multi-indel, mixed
mt = mt.annotate_rows(**add_variant_type(mt.alleles))

## Number of alleles at the site
mt = mt.annotate_rows(n_alleles=hl.len(mt.alleles))

## Mixed sites (SNVs and indels present at the site)
mt = mt.annotate_rows(mixed_site=(mt.variant_type == "mixed"))

## Spanning deletions
mt = mt.annotate_rows(spanning_deletion=hl.any(lambda a: a == "*", mt.alleles))

# Number of Rows, Columns
mt.count()

# Number of Columns
mt.count_cols()

# Variants breakdown
hl.summarize_variants(mt)

# Split multi-allelic variants into biallelic form. Note that mt.count() and hl.summarize_variants() will report different numbers after splitting than before
mt = hl.split_multi_hts(mt)

# Remove monomorphic sites
mt = mt.filter_rows(mt.n_alleles > 1)
Example #16
def load_icd_data(pre_phesant_data_path,
                  icd_codings_path,
                  temp_directory,
                  force_overwrite_intermediate: bool = False,
                  include_dates: bool = False,
                  icd9: bool = False):
    """
    Load raw (pre-PHESANT) phenotype data and extract ICD codes into hail MatrixTable with booleans as entries

    :param str pre_phesant_data_path: Input phenotype file
    :param str icd_codings_path: Input coding metadata
    :param str temp_directory: Temp bucket/directory to write intermediate file
    :param bool force_overwrite_intermediate: Whether to overwrite intermediate loaded file
    :param bool include_dates: Whether to also load date data (not implemented yet)
    :param bool icd9: Whether to load ICD9 data
    :return: MatrixTable with ICD codes
    :rtype: MatrixTable
    """
    if icd9:
        code_locations = {'primary_codes': '41203', 'secondary_codes': '41205'}
    else:
        code_locations = {
            'primary_codes': '41202',
            'secondary_codes': '41204',
            'external_codes': '41201',
            'cause_of_death_codes': '40001'
        }
    date_locations = {'primary_codes': '41262'}
    ht = hl.import_table(pre_phesant_data_path,
                         impute=not icd9,
                         min_partitions=100,
                         missing='',
                         key='userId',
                         types={'userId': hl.tint32})
    ht = ht.checkpoint(f'{temp_directory}/pre_phesant.ht',
                       _read_if_exists=not force_overwrite_intermediate)
    all_phenos = list(ht.row_value)
    fields_to_select = {
        code: [ht[x] for x in all_phenos if x.startswith(f'x{loc}')]
        for code, loc in code_locations.items()
    }
    if include_dates:
        fields_to_select.update({
            f'date_{code}':
            [ht[x] for x in all_phenos if x.startswith(f'x{loc}')]
            for code, loc in date_locations.items()
        })
    ht = ht.select(**fields_to_select)
    ht = ht.annotate(
        **{
            code: ht[code].filter(lambda x: hl.is_defined(x))
            for code in code_locations
        },
        # **{f'date_{code}': ht[code].filter(lambda x: hl.is_defined(x)) for code in date_locations}
    )
    # ht = ht.annotate(primary_codes_with_date=hl.dict(hl.zip(ht.primary_codes, ht.date_primary_codes)))
    all_codes = hl.sorted(
        hl.array(
            hl.set(
                hl.flatmap(
                    lambda x: hl.array(x),
                    ht.aggregate([
                        hl.agg.explode(lambda c: hl.agg.collect_as_set(c),
                                       ht[code]) for code in code_locations
                    ],
                                 _localize=True)))))
    ht = ht.select(bool_codes=all_codes.map(lambda x: hl.struct(
        **{code: ht[code].contains(x)
           for code in code_locations})))
    ht = ht.annotate_globals(
        all_codes=all_codes.map(lambda x: hl.struct(icd_code=x)))
    mt = ht._unlocalize_entries('bool_codes', 'all_codes', ['icd_code'])
    mt = mt.annotate_entries(
        any_codes=hl.any(lambda x: x, list(mt.entry.values())))
    # mt = mt.annotate_entries(date=hl.cond(mt.primary_codes, mt.primary_codes_with_date[mt.icd_code], hl.null(hl.tstr)))
    mt = mt.annotate_cols(truncated=False).annotate_globals(
        code_locations=code_locations)
    mt = mt.checkpoint(f'{temp_directory}/raw_icd.mt',
                       _read_if_exists=not force_overwrite_intermediate)
    trunc_mt = mt.filter_cols((hl.len(mt.icd_code) == 3)
                              | (hl.len(mt.icd_code) == 4))
    trunc_mt = trunc_mt.key_cols_by(icd_code=trunc_mt.icd_code[:3])
    trunc_mt = trunc_mt.group_cols_by('icd_code').aggregate_entries(
        **{
            code: hl.agg.any(trunc_mt[code])
            for code in list(code_locations.keys()) + ['any_codes']
        }).aggregate_cols(n_phenos_truncated=hl.agg.count()).result()
    trunc_mt = trunc_mt.filter_cols(trunc_mt.n_phenos_truncated > 1)
    trunc_mt = trunc_mt.annotate_cols(
        **mt.cols().drop('truncated', 'code_locations')[trunc_mt.icd_code],
        truncated=True).drop('n_phenos_truncated')
    mt = mt.union_cols(trunc_mt)
    coding_ht = hl.read_table(icd_codings_path)
    return mt.annotate_cols(**coding_ht[mt.col_key])
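A usage sketch, with every path a hypothetical placeholder:

import hail as hl

icd_mt = load_icd_data(
    'gs://my-bucket/pre_phesant.tsv.bgz',  # hypothetical phenotype file
    'gs://my-bucket/icd10_codings.ht',     # hypothetical coding metadata
    'gs://my-bucket/tmp')
# Fraction of (sample, code) entries where any ICD field carries the code.
icd_mt.aggregate_entries(hl.agg.fraction(icd_mt.any_codes))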
Example #17
def annotate_variants(mt):
    '''
    Takes matrix table and annotates variants with gene, LOF and missense annotations by parsing VEP annotations.

    :param mt: matrix table to annotate
    :return: returns matrix table with new row annotations gene, LOF, and missense.
    '''
    try:
        # accessing was_split raises an AttributeError if split_multi was never run
        hl.is_defined(mt.row.was_split)
    except Exception as e:
        print('Split multi-allelics before running!')
        print(e)
        return

    # If there is no canonical and protein-coding transcript consequence for that variant,
    # give the gene corresponding to the most severe consequence.
    # If there is a canonical and protein-coding transcript consequence for that variant,
    # give the gene symbol associated with that transcript consequence.
    canon_pc = mt.row.vep.transcript_consequences.filter(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'))
    most_severe = mt.row.vep.transcript_consequences.filter(
        lambda x: x.consequence_terms.contains(mt.row.vep.
                                               most_severe_consequence))

    mt = mt.annotate_rows(gene=hl.if_else(
        hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
               mt.row.vep.transcript_consequences),
        canon_pc.map(lambda x: x.gene_symbol),
        most_severe.map(lambda x: x.gene_symbol)))

    # The above returns gene symbols for all canonical and protein coding transcripts, not just the one related to the
    # most severe consequence. So we will keep the above, but annotate also the gene corresponding to the most severe
    # consequence as well (useful for synonymous, missense, and LOF annotations)

    canon_pc = mt.row.vep.transcript_consequences.filter(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding')
        & x.consequence_terms.contains(mt.vep.most_severe_consequence))
    most_severe = mt.vep.transcript_consequences.filter(
        lambda x: x.consequence_terms.contains(mt.row.vep.
                                               most_severe_consequence))

    mt = mt.annotate_rows(gene_most_severe_conseq=hl.if_else(
        hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
               mt.vep.transcript_consequences),
        canon_pc.map(lambda x: x.gene_symbol),
        most_severe.map(lambda x: x.gene_symbol)))

    # either if there is a canonical and protein coding transcript consequence for that variant,
    # and the lof annotation is not missing and equal to HC, and the lof flag is missing or is blank,
    # or if there isn't a canonical and protein coding transcript consequence for that variant and the
    # transcript consequence with consequence terms containing the most severe consequence term has lof not missing,
    # is equal to HC, and lof flags missing or blank,
    # true, else false

    canon_pc = mt.row.vep.transcript_consequences\
                         .filter(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'))
    most_severe = mt.row.vep.transcript_consequences\
                            .filter(lambda x: x.consequence_terms.contains(
                                              mt.row.vep.most_severe_consequence))

    canon_bool = (
        hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
               mt.row.vep.transcript_consequences)
        & hl.any(lambda x: hl.is_defined(x.lof), canon_pc) &
        (canon_pc.map(lambda x: x.lof) == ["HC"]) &
        (hl.all(lambda x: hl.is_missing(x.lof_flags) |
                (x.lof_flags == ""), canon_pc)))

    non_canon_bool = (~(hl.any(
        lambda x: (x.canonical == 1) &
        (x.biotype == 'protein_coding'), mt.row.vep.transcript_consequences))
                      & hl.any(lambda x: hl.is_defined(x.lof), most_severe) &
                      (most_severe.map(lambda x: x.lof) == ["HC"]) & (hl.all(
                          lambda x: hl.is_missing(x.lof_flags) |
                          (x.lof_flags == ""), most_severe)))

    mt = mt.annotate_rows(LOF=hl.if_else(canon_bool
                                         | non_canon_bool, True, False))

    # Either if there is a canonical and protein coding transcript consequence for that variant
    # whose consequence terms contain "missense variant"
    # or if there is not a canonical and protein coding transcript consequence for that variant,
    # but the most severe consequence is "missense variant"
    # or if there is a canonical and protein coding transcript consequence for that variant
    # whose consequence terms contain "inframe deletion"
    # or if there is not a canonical and protein coding transcript consequence for that variant,
    # but the variant's most severe consequence is "inframe deletion"
    # true else false

    canon_pc = mt.row.vep.transcript_consequences\
                         .filter(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'))

    canon_missense_bool = hl.any(
        lambda x: x.consequence_terms.contains("missense_variant"), canon_pc)
    noncanon_missense_bool = (~(hl.any(
        lambda x: (x.canonical == 1) &
        (x.biotype == 'protein_coding'), mt.row.vep.transcript_consequences)) &
                              (mt.row.vep.most_severe_consequence
                               == "missense_variant"))

    canon_inframe_bool = hl.any(
        lambda x: x.consequence_terms.contains("inframe_deletion"), canon_pc)
    noncanon_inframe_bool = (~(hl.any(
        lambda x: (x.canonical == 1) &
        (x.biotype == 'protein_coding'), mt.row.vep.transcript_consequences)) &
                             (mt.row.vep.most_severe_consequence
                              == "inframe_deletion"))

    canon_inframe_ins_bool = hl.any(
        lambda x: x.consequence_terms.contains("inframe_insertion"), canon_pc)
    noncanon_inframe_ins_bool = (~(hl.any(
        lambda x: (x.canonical == 1) &
        (x.biotype == 'protein_coding'), mt.row.vep.transcript_consequences)) &
                                 (mt.row.vep.most_severe_consequence
                                  == "inframe_insertion"))

    mt = mt.annotate_rows(
        missense=hl.if_else((canon_missense_bool | noncanon_missense_bool
                             | canon_inframe_bool | noncanon_inframe_bool
                             | canon_inframe_ins_bool
                             | noncanon_inframe_ins_bool), True, False))

    # If the most severe consequence is "synonymous_variant", true else false
    mt = mt.annotate_rows(synonymous=hl.if_else(
        mt.row.vep.most_severe_consequence == "synonymous_variant", True,
        False))

    # When there is a transcript consequence for that variant that is canonical,
    # protein coding, and lof = "HC", its lof flags
    # When there is not a transcript consequence for that variant that is canonical and protein coding,
    # but there is a transcript consequence whose consequence terms contains the most severe consequence
    # and its lof == HC, its lof flags
    # else blank

    canon_bool = hl.any(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
        mt.row.vep.transcript_consequences)
    canon_hc_bool = hl.any(
        lambda x:
        (x.canonical == 1) & (x.biotype == 'protein_coding') & (x.lof == 'HC'),
        mt.row.vep.transcript_consequences)
    canon_pc_hc = mt.row.vep.transcript_consequences.filter(lambda x: (
        x.canonical == 1) & (x.biotype == 'protein_coding') & (x.lof == "HC"))
    most_severe_bool = hl.any(
        lambda x:
        (x.consequence_terms.contains(mt.row.vep.most_severe_consequence)) &
        (x.lof == 'HC'), mt.row.vep.transcript_consequences)
    most_severe_hc = mt.row.vep.transcript_consequences.filter(lambda x: (
        x.consequence_terms.contains(mt.row.vep.most_severe_consequence)) &
                                                               (x.lof == "HC"))

    mt = mt.annotate_rows(LOF_flag=hl.case().when(
        canon_hc_bool, canon_pc_hc.map(lambda x: x.lof_flags)).when(
            ~canon_bool & most_severe_bool,
            most_severe_hc.map(lambda x: x.lof_flags)).default([""]))

    return mt
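Since the function refuses unsplit input, a usage sketch looks like:

mt = hl.split_multi_hts(mt)   # annotate_variants checks for `was_split`
mt = annotate_variants(mt)
lof_mt = mt.filter_rows(mt.LOF)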
Example #18
File: import_gtf.py, Project: jigold/hail
def get_gene_intervals(gene_symbols=None, gene_ids=None, transcript_ids=None,
                       verbose=True, reference_genome=None, gtf_file=None):
    """Get intervals of genes or transcripts.

    Get the boundaries of genes or transcripts from a GTF file, for quick filtering of a Table or MatrixTable.

    On Google Cloud platform:
    Gencode v19 (GRCh37) GTF available at: gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz
    Gencode v29 (GRCh38) GTF available at: gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz

    Example
    -------
    >>> hl.filter_intervals(ht, get_gene_intervals(gene_symbols=['PCSK9'], reference_genome='GRCh37'))  # doctest: +SKIP

    Parameters
    ----------

    gene_symbols : :obj:`list` of :obj:`str`, optional
       Gene symbols (e.g. PCSK9).
    gene_ids : :obj:`list` of :obj:`str`, optional
       Gene IDs (e.g. ENSG00000223972).
    transcript_ids : :obj:`list` of :obj:`str`, optional
       Transcript IDs (e.g. ENST00000456328).
    verbose : :obj:`bool`
       If ``True``, print which genes and transcripts were matched in the GTF file.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
       Reference genome to use (passed along to import_gtf).
    gtf_file : :obj:`str`
       GTF file to load. If none is provided, but `reference_genome` is one of
       `GRCh37` or `GRCh38`, a default will be used (on Google Cloud Platform).

    Returns
    -------
    :obj:`list` of :class:`.Interval`
    """
    GTFS = {
        'GRCh37': 'gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz',
        'GRCh38': 'gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz',
    }
    if reference_genome is None:
        reference_genome = hl.default_reference().name
    if gtf_file is None:
        gtf_file = GTFS.get(reference_genome)
        if gtf_file is None:
            raise ValueError('get_gene_intervals requires a GTF file, or the reference genome be one of GRCh37 or GRCh38 (when on Google Cloud Platform)')
    if gene_symbols is None and gene_ids is None and transcript_ids is None:
        raise ValueError('get_gene_intervals requires at least one of gene_symbols, gene_ids, or transcript_ids')
    ht = hl.experimental.import_gtf(gtf_file, reference_genome=reference_genome,
                                    skip_invalid_contigs=True, min_partitions=12)
    ht = ht.annotate(gene_id=ht.gene_id.split('\\.')[0],
                     transcript_id=ht.transcript_id.split('\\.')[0])
    criteria = []
    if gene_symbols:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_name == y), gene_symbols))
    if gene_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_id == y.split('\\.')[0]), gene_ids))
    if transcript_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'transcript') & (ht.transcript_id == y.split('\\.')[0]), transcript_ids))

    ht = ht.filter(functools.reduce(operator.ior, criteria))
    gene_info = ht.aggregate(hl.agg.collect((ht.feature, ht.gene_name, ht.gene_id, ht.transcript_id, ht.interval)))
    if verbose:
        info(f'get_gene_intervals found {len(gene_info)} entries:\n' +
             "\n".join(map(lambda x: f'{x[0]}: {x[1]} ({x[2] if x[0] == "gene" else x[3]})', gene_info)))
    intervals = list(map(lambda x: x[-1], gene_info))
    return intervals
Example #19
def filter_by_frequency(
    t: Union[hl.MatrixTable, hl.Table],
    direction: str,
    frequency: float = None,
    allele_count: int = None,
    population: str = None,
    subpop: str = None,
    downsampling: int = None,
    keep: bool = True,
    adj: bool = True,
) -> Union[hl.MatrixTable, hl.Table]:
    """
    Filter MatrixTable or Table with gnomAD-format frequency data (assumed bi-allelic/split).

    gnomAD frequency data format expectation is: Array[Struct(Array[AC], Array[AF], AN, homozygote_count, meta)].

    At least one of frequency or allele_count is required.

    Subpop can be specified without a population if desired.

    :param t: Input MatrixTable or Table
    :param direction: One of "above", "below", and "equal" (how to apply the filter)
    :param frequency: Frequency to filter by (one of frequency or allele_count is required)
    :param allele_count: Allele count to filter by (one of frequency or allele_count is required)
    :param population: Population in which to filter frequency
    :param subpop: Sub-population in which to filter frequency
    :param downsampling: Downsampling in which to filter frequency
    :param keep: Whether to keep rows passing this frequency (passed to filter_rows)
    :param adj: Whether to use adj frequency
    :return: Filtered MatrixTable or Table
    """
    if frequency is None and allele_count is None:
        raise ValueError(
            "At least one of frequency or allele_count must be specified")
    if direction not in ("above", "below", "equal"):
        raise ValueError(
            'direction needs to be one of "above", "below", or "equal"')
    group = "adj" if adj else "raw"
    criteria = [lambda f: f.meta.get("group", "") == group]
    if frequency is not None:
        if direction == "above":
            criteria.append(lambda f: f.AF[1] > frequency)
        elif direction == "below":
            criteria.append(lambda f: f.AF[1] < frequency)
        else:
            criteria.append(lambda f: f.AF[1] == frequency)
    if allele_count is not None:
        if direction == "above":
            criteria.append(lambda f: f.AC[1] > allele_count)
        elif direction == "below":
            criteria.append(lambda f: f.AC[1] < allele_count)
        else:
            criteria.append(lambda f: f.AC[1] == allele_count)
    size = 1
    if population:
        criteria.append(lambda f: f.meta.get("pop", "") == population)
        size += 1
    if subpop:
        criteria.append(lambda f: f.meta.get("subpop", "") == subpop)
        size += 1
        # If one supplies a subpop but not a population, this will ensure this gets it right
        if not population:
            size += 1
    if downsampling:
        criteria.append(
            lambda f: f.meta.get("downsampling", "") == str(downsampling))
        size += 1
        if not population:
            size += 1
            criteria.append(lambda f: f.meta.get("pop", "") == "global")
        if subpop:
            raise Exception(
                "No downsampling data for subpopulations implemented")
    criteria.append(lambda f: f.meta.size() == size)

    def combine_functions(func_list, x):
        cond = func_list[0](x)
        for c in func_list[1:]:
            cond &= c(x)
        return cond

    freq_filter = hl.any(lambda x: combine_functions(criteria, x), t.freq)
    return (t.filter_rows(freq_filter, keep=keep) if isinstance(
        t, hl.MatrixTable) else t.filter(freq_filter, keep=keep))
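For instance, to keep rows whose adj non-Finnish European frequency exceeds 0.1% (assuming gnomAD's 'nfe' population label):

ht = filter_by_frequency(ht, direction='above', frequency=0.001, population='nfe')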
Example #20
def summarize_variant_filters(
    t: Union[hl.MatrixTable, hl.Table],
    variant_filter_field: str = "RF",
    problematic_regions: List[str] = ["lcr", "segdup", "nonpar"],
    single_filter_count: bool = False,
    monoallelic_expr: Optional[hl.expr.BooleanExpression] = None,
    extra_filter_checks: Optional[Dict[str, hl.expr.Expression]] = None,
    n_rows: int = 50,
    n_cols: int = 140,
) -> None:
    """
    Summarize variants filtered under various conditions in input MatrixTable or Table.

    Summarize counts for:
        - Total number of variants
        - Fraction of variants removed due to:
            - Any filter
            - Inbreeding coefficient filter in combination with any other filter
            - AC0 filter in combination with any other filter
            - `variant_filter_field` filtering in combination with any other filter
            - Only inbreeding coefficient filter
            - Only AC0 filter
            - Only `variant_filter_field` filtering

    :param t: Input MatrixTable or Table to be checked.
    :param variant_filter_field: String of variant filtration used in the filters annotation on `ht` (e.g. RF, VQSR, AS_VQSR). Default is "RF".
    :param problematic_regions: List of regions considered problematic to run filter check in. Default is ["lcr", "segdup", "nonpar"].
    :param single_filter_count: If True, explode the Table's filter column and give a supplement total count of each filter. Default is False.
    :param monoallelic_expr: Optional boolean expression of monoallelic status that logs how many monoallelic sites are in the Table.
    :param extra_filter_checks: Optional dictionary containing filter condition name (key) and extra filter expressions (value) to be examined.
    :param n_rows: Number of rows to display only when showing percentages of filtered variants grouped by multiple conditions. Default is 50.
    :param n_cols: Number of columns to display only when showing percentages of filtered variants grouped by multiple conditions. Default is 140.
    :return: None
    """
    t = t.rows() if isinstance(t, hl.MatrixTable) else t

    filters = t.aggregate(hl.agg.counter(t.filters))
    logger.info("Variant filter counts: %s", filters)

    if single_filter_count:
        exp_t = t.explode(t.filters)
        filters = exp_t.aggregate(hl.agg.counter(exp_t.filters))
        logger.info("Exploded variant filter counts: %s", filters)

    if monoallelic_expr is not None:
        if isinstance(t, hl.MatrixTable):
            mono_sites = t.filter_rows(monoallelic_expr).count_rows()
        else:
            mono_sites = t.filter(monoallelic_expr).count()
        logger.info("There are %d monoallelic sites in the dataset.",
                    mono_sites)

    filtered_expr = hl.len(t.filters) > 0
    problematic_region_expr = hl.any(
        lambda x: x, [t.info[region] for region in problematic_regions])

    t = t.annotate(is_filtered=filtered_expr,
                   in_problematic_region=problematic_region_expr)

    def _filter_agg_order(
        t: Union[hl.MatrixTable, hl.Table],
        group_exprs: Dict[str, hl.expr.Expression],
        n_rows: Optional[int] = None,
        n_cols: Optional[int] = None,
    ) -> None:
        """
        Perform validity checks to measure percentages of variants filtered under different conditions.

        :param t: Input MatrixTable or Table.
        :param group_exprs: Dictionary of expressions to group the Table by.
        :param n_rows: Number of rows to show. Default is None (to display 10 rows).
        :param n_cols: Number of columns to show. Default is None (to display 10 cols).
        :return: None
        """
        t = t.rows() if isinstance(t, hl.MatrixTable) else t
        # NOTE: make_filters_expr_dict returns a dict with %ages of variants filtered
        t.group_by(**group_exprs).aggregate(**make_filters_expr_dict(
            t, extra_filter_checks, variant_filter_field)).order_by(
                hl.desc("n")).show(n_rows, n_cols)

    logger.info(
        "Checking distributions of filtered variants amongst variant filters..."
    )
    _filter_agg_order(t, {"is_filtered": t.is_filtered})

    logger.info(
        "Checking distributions of variant type amongst variant filters...")
    _filter_agg_order(t, {"allele_type": t.info.allele_type})

    logger.info(
        "Checking distributions of variant type and region type amongst variant filters..."
    )
    _filter_agg_order(
        t,
        {
            "allele_type": t.info.allele_type,
            "in_problematic_region": t.in_problematic_region,
        },
        n_rows,
        n_cols,
    )

    logger.info(
        "Checking distributions of variant type, region type, and number of alt alleles amongst variant filters..."
    )
    _filter_agg_order(
        t,
        {
            "allele_type": t.info.allele_type,
            "in_problematic_region": t.in_problematic_region,
            "n_alt_alleles": t.info.n_alt_alleles,
        },
        n_rows,
        n_cols,
    )
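A usage sketch; the monoallelic info field name is a hypothetical example.

summarize_variant_filters(
    ht,
    variant_filter_field='RF',
    single_filter_count=True,
    monoallelic_expr=ht.info.monoallelic,  # hypothetical info field
)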