Exemplo n.º 1
0
def generate_split_alleles(mt: hl.MatrixTable) -> hl.Table:

    allele_data = hl.struct(nonsplit_alleles=mt.alleles,
                            has_star=hl.any(lambda a: a == '*', mt.alleles))

    mt = mt.annotate_rows(allele_data=allele_data.annotate(
        **add_variant_type(mt.alleles)))

    #mt = mt.annotate_entries(PL=hl.empty_array(hl.tint32))
    mt = mt.annotate_entries(PL=hl.null(hl.tarray(hl.tint32)))
    #mt = hl.split_multi_hts(mt,left_aligned=True)
    mt = hl.split_multi_hts(mt)

    allele_type = (hl.case().when(
        hl.is_snp(mt.alleles[0], mt.alleles[1]),
        'snv').when(hl.is_insertion(mt.alleles[0], mt.alleles[1]),
                    'ins').when(hl.is_deletion(mt.alleles[0], mt.alleles[1]),
                                'del').default('complex'))
    mt = mt.annotate_rows(allele_data=mt.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=mt.allele_data.variant_type == 'mixed'))
    return mt
Exemplo n.º 2
0
def annotate_adj(
    mt: hl.MatrixTable,
    adj_gq: int = 20,
    adj_dp: int = 10,
    adj_ab: float = 0.2,
    haploid_adj_dp: int = 10,
) -> hl.MatrixTable:
    """
    Annotate genotypes with adj criteria (assumes diploid)
    Defaults correspond to gnomAD values.
    """
    return mt.annotate_entries(adj=get_adj_expr(
        mt.GT, mt.GQ, mt.DP, mt.AD, adj_gq, adj_dp, adj_ab, haploid_adj_dp))
Exemplo n.º 3
0
def filter_sample_cr(mt: hl.MatrixTable,
                     mind: float) -> Tuple[hl.MatrixTable, Dict[str, int]]:
    # step 2
    mt = compute_qc_metrics(mt)

    sample_miss = mt.filter_cols(mt.sample_qc.call_rate < (1 - mind))
    sample_miss_cases = sample_miss.filter_cols(
        sample_miss.is_case == True).s.collect()
    sample_miss_controls = sample_miss.filter_cols(
        sample_miss.is_case == False).s.collect()
    n_sample_miss = len(sample_miss_cases) + len(sample_miss_controls)
    samples_miss = sample_miss_cases + sample_miss_controls
    if n_sample_miss > 0:
        mt = mt.filter_cols(hl.literal(samples_miss).contains(mt['s']),
                            keep=False)

    results = {
        'sample_miss_cases': len(sample_miss_cases),
        'sample_miss_controls': len(sample_miss_controls)
    }

    return mt, results
Exemplo n.º 4
0
def annotate_adj(mt: hl.MatrixTable) -> hl.MatrixTable:
    """
    Annotate genotypes with adj criteria (assumes diploid)
    """
    adj_gq = 20
    adj_dp = 10
    adj_ab = 0.2

    return mt.annotate_entries(
        adj=(mt.GQ >= adj_gq) & (mt.DP >= adj_dp) & (~mt.GT.is_het() | (
            (mt.GT[0] == 0) & (mt.AD[mt.GT[1]] / mt.DP >= adj_ab)) | (
                (mt.GT[0] > 0) & (mt.AD[mt.GT[0]] / mt.DP >= adj_ab) &
                (mt.AD[mt.GT[1]] / mt.DP >= adj_ab))))
def annotate_samples_with_cohort_info(mt: hl.MatrixTable,
                                      cohort_file) -> hl.MatrixTable:
    '''

    :param mt: matrixtable with cohort samples and variants
    :param cohort_file: a txt file with no header line and 2 columns, 1st: for sampleID;  2nd: cohortname; example:/lustre/scratch115/projects/autozyg/new_autozyg_DDD_callset.April2019/sample_list.after_QC.ELGH_BiB_Birm_controls_only.with_cohort_labels.txt
    :return: matrixtable with new column annotation
    '''
    #import the tab delimited file. Note that it is important for joins of tables to have defined keys in the hail tables
    table_cohort = hl.import_table(cohort_file, no_header=True, key='f0')
    #annotate the samples with a new attribute called cohort:
    mt_result = mt.annotate_cols(cohort=table_cohort[mt.s].f1)
    return mt_result
def generate_trio_stats(mt: hl.MatrixTable,
                        autosomes_only: bool = True,
                        bi_allelic_only: bool = True) -> hl.Table:
    """
    Default function to run `generate_trio_stats_expr` to get trio stats stratified by raw and adj
    .. note::
        Expects that `mt` is it a trio matrix table that was annotated with adj and if dealing with
        a sparse MT `hl.experimental.densify` must be run first.
        By default this pipeline function will filter `mt` to only autosomes and bi-allelic sites.
    :param mt: A Trio Matrix Table returned from `hl.trio_matrix`. Must be dense
    :param autosomes_only: If set, only autosomal intervals are used.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation
    :return: Table with trio stats
    """
    if autosomes_only:
        mt = filter_to_autosomes(mt)
    if bi_allelic_only:
        mt = mt.filter_rows(bi_allelic_expr(mt))

    logger.info(f"Generating trio stats using {mt.count_cols()} trios.")
    trio_adj = mt.proband_entry.adj & mt.father_entry.adj & mt.mother_entry.adj

    ht = mt.select_rows(**generate_trio_stats_expr(
        mt,
        transmitted_strata={
            "raw": True,
            "adj": trio_adj
        },
        de_novo_strata={
            "raw": True,
            "adj": trio_adj
        },
        ac_strata={
            "raw": True,
            "adj": trio_adj
        },
    )).rows()

    return ht
Exemplo n.º 7
0
def combine_phenotypes(mt: hl.MatrixTable,
                       column_field,
                       entry_field,
                       lists_of_columns,
                       new_col_name='grouping',
                       new_entry_name='new_entry',
                       grouping_function=hl.agg.any):
    """
    Group by non-unique fields and apply grouping_function in order to combine entries in MatrixTable.

    Example:

    mt = hl.balding_nichols_model(1, 4, 10)
    mt = mt.annotate_entries(pheno=hl.rand_bool(0.5))
    lists_of_columns = [[0, 1], [0, 3]]
    entry_field = mt.pheno
    column_field = mt.sample_idx

    :param MatrixTable mt: Input MatrixTable
    :param Expression column_field: Column-indexed Expression to group by
    :param Expression entry_field: Entry-indexed Expression to which to apply `grouping_function`
    :param list of list lists_of_columns: Entry in this list should be the same type as `column_field`
    :param str new_col_name: Name for new column key (default 'grouping')
    :param str new_entry_name: Name for new entry expression (default 'new_entry')
    :param function grouping_function: Aggregator function to apply to `entry_field` (default hl.agg.any)
    :return: Re-grouped MatrixTable
    :rtype: MatrixTable
    """
    lists_of_columns = hl.literal(lists_of_columns)
    mt = mt._annotate_all(col_exprs={'_col_expr': column_field},
                          entry_exprs={'_entry_expr': entry_field})
    mt = mt.annotate_cols(**{
        new_col_name:
        lists_of_columns.filter(lambda x: x.contains(mt._col_expr))
    })
    mt = mt.explode_cols(new_col_name)
    return mt.group_cols_by(new_col_name).aggregate(
        **{new_entry_name: grouping_function(mt._entry_expr)})
Exemplo n.º 8
0
def generate_ac(mt: hl.MatrixTable) -> hl.Table:
    """
    Creates Table containing allele counts per variant.

    Returns table containing the following annotations:
        - `ac_qc_samples_raw`: Allele count of high quality samples
        - `ac_qc_samples_unrelated_raw`: Allele count of high quality unrelated samples
        - `ac_release_samples_raw`: Allele count of release samples
        - `ac_qc_samples_adj`: Allele count of high quality samples after adj filtering
        - `ac_qc_samples_unrelated_adj`: Allele count of high quality unrelated samples after adj filtering
        - `ac_release_samples_adj`: Allele count of release samples after adj filtering

    :param mt: Input MatrixTable
    :return: Table containing allele counts
    """
    mt = mt.filter_cols(mt.meta.high_quality)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)
    mt = annotate_adj(mt)
    mt = mt.annotate_rows(
        ac_qc_samples_raw=hl.agg.sum(mt.GT.n_alt_alleles()),
        ac_qc_samples_unrelated_raw=hl.agg.filter(
            ~mt.meta.sample_filters.all_samples_related,
            hl.agg.sum(mt.GT.n_alt_alleles()),
        ),
        ac_release_samples_raw=hl.agg.filter(mt.meta.release,
                                             hl.agg.sum(
                                                 mt.GT.n_alt_alleles())),
        ac_qc_samples_adj=hl.agg.filter(mt.adj,
                                        hl.agg.sum(mt.GT.n_alt_alleles())),
        ac_qc_samples_unrelated_adj=hl.agg.filter(
            ~mt.meta.sample_filters.all_samples_related & mt.adj,
            hl.agg.sum(mt.GT.n_alt_alleles()),
        ),
        ac_release_samples_adj=hl.agg.filter(mt.meta.release & mt.adj,
                                             hl.agg.sum(
                                                 mt.GT.n_alt_alleles())),
    )
    return mt.rows()
Exemplo n.º 9
0
def _require_row_variant_w_struct_locus(mt: MatrixTable) -> NoReturn:
    """
    Similar to hail.methods.misc.require_row_key_variant_w_struct_locus, but not necessarily as keys
    """
    assert check_argument_types()

    if (not set(['locus', 'alleles']).issubset(set(mt.rows().row))
            or not mt['alleles'].dtype == tarray(tstr)
            or (not isinstance(mt['locus'].dtype, tlocus) and mt['locus'].dtype
                != hl.dtype('struct{contig: str, position: int32}'))):
        raise ValueError(
            "'hail.from_matrix_table' requires row to contain two fields 'locus'"
            " (type 'locus<any>' or 'struct{{contig: str, position: int32}}') and "
            "'alleles' (type 'array<str>')")
Exemplo n.º 10
0
def annotate_with_dbsnp(ht: hl.MatrixTable, dbsnpt: hl.Table):
    newht = ht.annotate_rows(
        #dbnsfp_DANN_score=".",
        dbnsfp_GERP_RS=dbsnpt.index(ht.locus, ht.alleles)['GERP++_RS'],
        dbnsfp_MutationTaster_pred=dbsnpt.index(
            ht.locus, ht.alleles).MutationTaster_pred,
        dbnsfp_phastCons100way_vertebrate=dbsnpt.index(
            ht.locus, ht.alleles).phastCons100way_vertebrate,
        dbnsfp_Polyphen2_HVAR_pred=dbsnpt.index(
            ht.locus, ht.alleles).Polyphen2_HVAR_pred,
        dbnsfp_MetaSVM_pred=dbsnpt.index(ht.locus, ht.alleles).MetaSVM_pred,
        dbnsfp_REVEL_score=dbsnpt.index(ht.locus, ht.alleles).REVEL_score,
        dbnsfp_SIFT_pred=dbsnpt.index(ht.locus, ht.alleles).SIFT_pred)
    return newht
Exemplo n.º 11
0
def add_coding_information(mt: hl.MatrixTable, coding_ht: hl.Table, phesant_phenotype_info_path: str,
                           download_missing_codings: bool = False) -> hl.MatrixTable:
    """
    Add coding information from coding_ht as column annotations into mt

    :param MatrixTable mt: Input MT
    :param Table coding_ht: HT with coding information
    :param str phesant_phenotype_info_path: PHESANT phenotype metadata path
    :param bool download_missing_codings: Whether to download missing coding data
    :return: MT with coding information in column data
    :rtype: MatrixTable
    """
    mt = mt.annotate_cols(**coding_ht[(mt.coding_id, hl.str(mt.coding))])
    if download_missing_codings: get_missing_codings(mt.cols())
    phesant_summary = hl.import_table(phesant_phenotype_info_path, impute=True, missing='', key='FieldID')
    phesant_reassign = get_phesant_reassignments(phesant_summary)
    mt = mt.annotate_cols(recoding=hl.or_missing(
        hl.is_missing(mt.meaning), phesant_reassign[mt.col_key.select('phenocode', 'coding')].reassign_from
    ))
    return mt.annotate_cols(**hl.cond(hl.is_defined(mt.meaning),
                                      hl.struct(**{x: mt[x] for x in list(coding_ht.row_value)}),
                                      coding_ht[(mt.coding_id, hl.str(mt.recoding))]),
                            )
Exemplo n.º 12
0
def filter_interval(mt: hl.MatrixTable,
                    tb_bed: hl.Table) -> hl.MatrixTable:
    """
    Keep variants defined in target regions (BED file)

    :param mt: Hail MatrixTable
    :param tb_bed: HailTable with interval field generated with hl.import_bed function
    :return: Row-filtered MatrixTable
    """
    n_total = mt.count_rows()

    mt = (mt
          .filter_rows(hl.is_defined((tb_bed[mt.locus])),
                       keep=True)
          )

    n_filtered = n_total - mt.count_rows()

    pct = round((n_filtered / n_total) * 100, 2)

    print(f"Filtered {n_filtered} ({pct}%) non-covered variants out of {n_total}")

    return mt
Exemplo n.º 13
0
def apply_variant_qc_filtering(mt: hl.MatrixTable) -> hl.MatrixTable:
    """
    Apply variant QC filtering

    :param mt: hl.MatrixTable
    :return: hl.MatrixTable
    """
    # import variant qc final table
    variant_qc_ht = hl.read_table(get_variant_qc_ht_path(part='final_qc'))
    mt = (mt.annotate_rows(**variant_qc_ht[mt.row_key]))
    mt = (mt.filter_rows(~mt.fail_inbreeding_coeff & ~mt.AC0 & ~mt.fail_vqsr
                         & ~mt.fail_rf & mt.is_coveraged_gnomad_genomes
                         & mt.is_defined_capture_intervals))
    # filter low conf regions
    mt = filter_low_conf_regions(
        mt,
        filter_lcr=
        True,  # TODO: include also decoy and low coverage exome regions
        filter_segdup=True)
    # filter telomeres/centromes
    mt = remove_telomeres_centromes(mt)

    return mt
Exemplo n.º 14
0
def pc_project(mt: hl.MatrixTable,
               pc_loadings: hl.Table,
               loading_location: str = "loadings",
               af_location: str = "pca_af") -> hl.MatrixTable:
    """
    Projects samples in `mt` on PCs computed in `pc_mt`
    :param MatrixTable mt: MT containing the samples to project
    :param Table pc_loadings: MT containing the PC loadings for the variants
    :param str loading_location: Location of expression for loadings in `pc_loadings`
    :param str af_location: Location of expression for allele frequency in `pc_loadings`
    :return: MT with scores calculated from loadings
    """
    n_variants = mt.count_rows()

    mt = mt.annotate_rows(**pc_loadings[mt.locus, mt.alleles])
    mt = mt.filter_rows(
        hl.is_defined(mt[loading_location]) & hl.is_defined(mt[af_location])
        & (mt[af_location] > 0) & (mt[af_location] < 1))

    gt_norm = (mt.GT.n_alt_alleles() - 2 * mt[af_location]) / hl.sqrt(
        n_variants * 2 * mt[af_location] * (1 - mt[af_location]))
    return mt.annotate_cols(pca_scores=hl.agg.array_sum(mt[loading_location] *
                                                        gt_norm))
Exemplo n.º 15
0
def get_projectmax(mt: hl.MatrixTable,
                   loc: hl.expr.StringExpression) -> hl.MatrixTable:
    """
    First pass of projectmax (returns aggregated MT with project_max field)

    :param MatrixTable mt: Input MT
    :param StringExpression loc: Column expression location of project ID (e.g. mt.meta.pid)
    :return: Frequency data with annotated project_max
    :rtype: MatrixTable
    """
    # TODO: add hom count
    mt = mt.annotate_cols(project=loc)
    agg_mt = mt.group_cols_by(mt.project).aggregate(
        ac=hl.agg.sum(mt.GT.n_alt_alleles()),
        an=2 * hl.agg.count_where(hl.is_defined(mt.GT)),
        hom=hl.agg.count_where(mt.GT.n_alt_alleles() == 2))
    agg_mt = agg_mt.annotate_entries(af=agg_mt.ac / agg_mt.an)
    return agg_mt.annotate_rows(project_max=hl.agg.take(
        hl.struct(project=agg_mt.project,
                  ac=agg_mt.ac,
                  af=agg_mt.af,
                  an=agg_mt.an,
                  hom=agg_mt.hom), 5, -agg_mt.af))
def generate_family_stats(
        mt: hl.MatrixTable,
        fam_file: str,
        calculate_adj: bool = False) -> Tuple[hl.Table, hl.Table]:
    """
    Writes bi-allelic sites MT with the following annotations:
     - family_stats (TDT, Mendel Errors, AC_unrelated_qc)
     - truth_data (presence in Omni, HapMap, 1KG high conf SNVs, Mills)

    :param MatrixTable mt: Full MT
    :param str fam_file: Fam pedigree file location
    :param bool calculate_adj: Whether to also calculate family metrics for adj genotypes
    :return: Table with qc annotations
    :rtype: Table
    """
    #mt = mt.select_cols(high_quality=mt.meta.high_quality)
    mt = mt.select_rows()
    mt = annotate_unrelated_sample(mt, fam_file)

    # Unphased for now, since mendel_errors does not support phased alleles
    mt = mt.annotate_entries(GT=unphase_call_expr(mt.GT))
    ped = hl.Pedigree.read(fam_file, delimiter='\\t')
    family_stats_struct, family_stats_sample_ht = family_stats(mt, ped, 'raw')
    mt = mt.annotate_rows(family_stats=[family_stats_struct])

    if calculate_adj:
        mt = filter_to_adj(mt)
        adj_family_stats_struct, adj_family_stats_sample_ht = family_stats(
            mt, ped, 'adj')

        family_stats_sample_ht = family_stats_sample_ht.annotate(
            adj=adj_family_stats_sample_ht[family_stats_sample_ht.s])

        mt = mt.annotate_rows(
            family_stats=mt.family_stats.append(adj_family_stats_struct))

    return mt.rows(), family_stats_sample_ht
Exemplo n.º 17
0
def filter_low_conf_regions(
        mt: hl.MatrixTable,
        filter_lcr: bool = True,
        filter_decoy: bool = True,
        filter_segdup: bool = True,
        high_conf_regions: Optional[List[str]] = None) -> hl.MatrixTable:
    """
    Filters low-confidence regions

    :param MatrixTable mt: MT to filter
    :param bool filter_lcr: Whether to filter LCR regions
    :param bool filter_decoy: Whether to filter decoy regions
    :param bool filter_segdup: Whether to filter Segdup regions
    :param list of str high_conf_regions: Paths to set of high confidence regions to restrict to (union of regions)
    :return: MT with low confidence regions removed
    :rtype: MatrixTable
    """
    from gnomad_hail.resources import lcr_intervals_path, decoy_intervals_path, segdup_intervals_path

    if filter_lcr:
        lcr = hl.import_locus_intervals(lcr_intervals_path)
        mt = mt.filter_rows(hl.is_defined(lcr[mt.locus]), keep=False)

    if filter_decoy:
        decoy = hl.import_bed(decoy_intervals_path)
        mt = mt.filter_rows(hl.is_defined(decoy[mt.locus]), keep=False)

    if filter_segdup:
        segdup = hl.import_bed(segdup_intervals_path)
        mt = mt.filter_rows(hl.is_defined(segdup[mt.locus]), keep=False)

    if high_conf_regions is not None:
        for region in high_conf_regions:
            region = hl.import_locus_intervals(region)
            mt = mt.filter_rows(hl.is_defined(region[mt.locus]), keep=True)

    return mt
Exemplo n.º 18
0
def apply_mito_artifact_filter(
    mt: hl.MatrixTable,
    artifact_prone_sites_path: str,
) -> hl.MatrixTable:
    """Add back in artifact_prone_site filter

    :param hl.MatrixTable mt: MatrixTable to use an input
    :param str artifact_prone_sites_path: path to BED file of artifact_prone_sites to flag in the filters column

    :return: MatrixTable with artifact_prone_sites filter
    :rtype: hl.MatrixTable

    """

    # apply "artifact_prone_site" filter to any SNP or deletion that spans a known problematic site
    mt = mt.annotate_rows(
        position_range=hl.range(mt.locus.position, mt.locus.position +
                                hl.len(mt.alleles[0])))

    artifact_sites = []

    with open(artifact_prone_sites_path, "r") as f:
        for line in f:
            pos = line.split()[2]
            artifact_sites.append(int(pos))
    sites = hl.literal(set(artifact_sites))

    mt = mt.annotate_rows(filters=hl.if_else(
        hl.len(hl.set(mt.position_range).intersection(sites)) > 0,
        {"artifact_prone_site"},
        {"PASS"},
    ))

    mt = mt.drop("position_range")

    return mt
Exemplo n.º 19
0
def adjust_sex_ploidy(mt: hl.MatrixTable,
                      sex_expr: hl.expr.StringExpression,
                      male_str: str = 'male',
                      female_str: str = 'female') -> hl.MatrixTable:
    """
    Converts males to haploid on non-PAR X/Y, sets females to missing on Y

    :param mt: Input MatrixTable
    :param sex_expr: Expression pointing to sex in MT (if not male_str or female_str, no change)
    :param male_str: String for males (default 'male')
    :param female_str: String for females (default 'female')
    :return: MatrixTable with fixed ploidy for sex chromosomes
    """
    return mt.annotate_entries(GT=adjusted_sex_ploidy_expr(
        mt.locus, mt.GT, sex_expr, male_str, female_str))
Exemplo n.º 20
0
def pc_project(mt: hl.MatrixTable,
               loadings_ht: hl.Table,
               loading_location: str = "loadings",
               af_location: str = "pca_af"):
    """
    Projects samples in `mt` on pre-computed PCs.
    :param MatrixTable mt: MT containing the samples to project
    :param Table loadings_ht: HT containing the PCA loadings and allele frequencies used for the PCA
    :param str loading_location: Location of expression for loadings in `loadings_ht`
    :param str af_location: Location of expression for allele frequency in `loadings_ht`
    :return: Table with scores calculated from loadings in column `scores`
    :rtype: Table
    """
    n_variants = loadings_ht.count()
    mt = mt.annotate_rows(
        pca_loadings=loadings_ht[mt.row_key][loading_location],
        pca_af=loadings_ht[mt.row_key][af_location])
    mt = mt.filter_rows(
        hl.is_defined(mt.pca_loadings) & hl.is_defined(mt.pca_af)
        & (mt.pca_af > 0) & (mt.pca_af < 1))
    gt_norm = (mt.GT.n_alt_alleles() - 2 * mt.pca_af) / hl.sqrt(
        n_variants * 2 * mt.pca_af * (1 - mt.pca_af))
    mt = mt.annotate_cols(scores=hl.agg.array_sum(mt.pca_loadings * gt_norm))
    return mt.cols().select('scores')
Exemplo n.º 21
0
def determine_hom_refs(mt: hl.MatrixTable,
                       coverage_mt_path: str,
                       minimum_homref_coverage: int = 100) -> hl.MatrixTable:
    """Uses coverage to distinguish between homref and missing sites, outputs the resulting mt

    :param hl.MatrixTable mt: MatrixTable to use an input
    :param str coverage_mt_path: MatrixTable of sample level coverage at each position
    :param int minimum_homref_coverage: minimum depth of coverage required to call a genotype homoplasmic reference rather than missing

    :return: MatrixTable with missing genotypes converted to homref depending on coverage
    :rtype: hl.MatrixTable
    """

    # convert coverage to build GRCh37
    # note: the mitochondrial reference genome is the same for GRCh38 and GRCh37
    coverages = hl.read_matrix_table(coverage_mt_path)
    coverages = coverages.key_rows_by(locus=hl.locus(
        "MT", coverages.locus.position, reference_genome="GRCh37"))

    mt = mt.annotate_entries(
        DP=hl.if_else(hl.is_missing(mt.HL), coverages[mt.locus,
                                                      mt.s].coverage, mt.DP))

    hom_ref_expr = hl.is_missing(mt.HL) & (mt.DP > minimum_homref_coverage)

    mt = mt.annotate_entries(
        HL=hl.if_else(hom_ref_expr, 0.0, mt.HL),
        FT=hl.if_else(hom_ref_expr, ["PASS"], mt.FT),
        DP=hl.if_else(
            hl.is_missing(mt.HL) & (mt.DP <= minimum_homref_coverage),
            hl.null(hl.tint32),
            mt.DP,
        ),
    )

    return mt
Exemplo n.º 22
0
def pca_filter_mt(in_mt: hl.MatrixTable,
                  maf: float = 0.05,
                  hwe: float = 1e-3,
                  call_rate: float = 0.98,
                  ld_cor: float = 0.2,
                  ld_window: int = 250000):

    print("\nInitial number of SNPs before filtering: {}".format(
        in_mt.count_rows()))
    mt = hl.variant_qc(in_mt)
    print(f'\nFiltering out variants with MAF < {maf}')
    mt_filt = mt.annotate_rows(maf=hl.min(mt.variant_qc.AF))
    mt_filt = mt_filt.filter_rows(mt_filt.maf > maf)

    print(f'\nFiltering out variants with HWE < {hwe:1e}')
    mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.p_value_hwe > hwe)

    print(f'\nFiltering out variants with Call Rate < {call_rate}')
    mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.call_rate >= call_rate)

    # no strand ambiguity
    print('\nFiltering out strand ambigous variants')
    mt_filt = mt_filt.filter_rows(
        ~hl.is_strand_ambiguous(mt_filt.alleles[0], mt_filt.alleles[1]))

    # MHC chr6:25-35Mb
    # chr8.inversion chr8:7-13Mb
    print(
        '\nFiltering out variants in MHC [chr6:25M-35M] and chromosome 8 inversions [chr8:7M-13M]'
    )
    intervals = ['chr6:25M-35M', 'chr8:7M-13M']
    mt_filt = hl.filter_intervals(mt_filt, [
        hl.parse_locus_interval(x, reference_genome='GRCh38')
        for x in intervals
    ],
                                  keep=False)

    # This step is expensive (on local machine)
    print(
        f'\nLD pruning using correlation threshold of {ld_cor} and window size of {ld_window}'
    )
    mt_ld_prune = hl.ld_prune(mt_filt.GT, r2=ld_cor, bp_window_size=ld_window)
    mt_ld_pruned = mt_filt.filter_rows(
        hl.is_defined(mt_ld_prune[mt_filt.row_key]))
    print("\nNumber of SNPs after filtering: {}".format(
        mt_ld_pruned.count_rows()))

    return mt_ld_pruned
Exemplo n.º 23
0
def get_all_sample_metadata(
    mt: hl.MatrixTable, build: int, data_type: str, data_source: str, version: int
) -> hl.Table:
    """
    Annotate MatrixTable with all current metadata: sample sequencing metrics, sample ID mapping,
    and callrate for bi-allelic, high-callrate common SNPs.
    :param MatrixTable mt: VCF converted to a MatrixTable
    :param int build: build for write, 37 or 38
    :param str data_type: WGS or WES for write path and flagging metrics
    :param str data_source: internal or external for write path
    :param int version: Int for write path
    :return: Table with seq metrics and mapping
    :rtype: Table
    """
    logger.info("Importing and annotating with sequencing metrics...")
    meta_ht = hl.import_table(
        seq_metrics_path(build, data_type, data_source, version), impute=True
    ).key_by("SAMPLE")

    logger.info("Importing and annotating seqr ID names...")
    remap_ht = hl.import_table(
        remap_path(build, data_type, data_source, version), impute=True
    ).key_by("s")
    meta_ht = meta_ht.annotate(**remap_ht[meta_ht.key])
    meta_ht = meta_ht.annotate(
        seqr_id=hl.if_else(
            hl.is_missing(meta_ht.seqr_id), meta_ht.SAMPLE, meta_ht.seqr_id
        )
    )

    logger.info(
        "Filtering to bi-allelic, high-callrate, common SNPs to calculate callrate..."
    )
    mt = filter_rows_for_qc(
        mt,
        min_af=0.001,
        min_callrate=0.99,
        bi_allelic_only=True,
        snv_only=True,
        apply_hard_filters=False,
        min_inbreeding_coeff_threshold=None,
        min_hardy_weinberg_threshold=None,
    )
    callrate_ht = mt.select_cols(
        filtered_callrate=hl.agg.fraction(hl.is_defined(mt.GT))
    ).cols()
    meta_ht = meta_ht.annotate(**callrate_ht[meta_ht.key])
    return meta_ht
Exemplo n.º 24
0
def get_an_criteria(
    mt: hl.MatrixTable,
    samples_by_sex: Optional[Dict[str, int]] = None,
    meta_root: str = "meta",
    sex_field: str = "sex_imputation.sex_karyotype",
    xy_str: str = "XY",
    xx_str: str = "XX",
    freq_field: str = "freq",
    freq_index: int = 0,
    an_proportion_cutoff: float = 0.8,
) -> hl.expr.BooleanExpression:
    """
    Generate criteria to filter samples based on allele number (AN).

    Uses allele number as proxy for call rate.

    :param mt: Input MatrixTable.
    :param samples_by_sex: Optional Dictionary containing number of samples (value) for each sample sex (key).
    :param meta_root: Name of field in MatrixTable containing sample metadata information. Default is 'meta'.
    :param sex_field: Name of field in MatrixTable containing sample sex assignment. Defualt is 'sex_imputation.sex_karyotype'.
    :param xy_str: String marking whether a sample has XY sex. Default is 'XY'.
    :param xx_str: String marking whether a sample has XX sex. Default is 'XX'.
    :param freq_field: Name of field in MT that contains frequency information. Default is 'freq'.
    :param freq_index: Which index of frequency struct to use. Default is 0.
    :param an_proportion_cutoff: Desired allele number proportion cutoff. Default is 0.8.
    """
    if samples_by_sex is None:
        samples_by_sex = mt.aggregate_cols(hl.agg.counter(mt[meta_root][sex_field]))
    return (
        hl.case()
        .when(
            mt.locus.in_autosome_or_par(),
            mt[freq_field][freq_index].AN
            >= an_proportion_cutoff * 2 * sum(samples_by_sex.values()),
        )
        .when(
            mt.locus.in_x_nonpar(),
            mt[freq_field][freq_index].AN
            >= an_proportion_cutoff
            * (samples_by_sex[xy_str] + samples_by_sex[xx_str] * 2),
        )
        .when(
            mt.locus.in_y_nonpar(),
            mt[freq_field][freq_index].AN
            >= an_proportion_cutoff * samples_by_sex[xy_str],
        )
        .or_missing()
    )
Exemplo n.º 25
0
def filter_snps_cutoffs(mt: hl.MatrixTable) -> hl.MatrixTable:
    '''
    Filter snps based on Hilary's cutoffs
    :param mt: snps only hail matrixtable
    :return: filtered matrixtable based on the  cutoffs listed in cutoffs.py file

    '''
    mt_snps_filtered = mt.filter_rows(
        (mt.info.QD >= QD_cutoff) & (mt.info.FS < FS_snps_cutoff)
        & (mt.info.MQ > MQ_cutoff) & (mt.info.MQRankSum >= MQRankSum_cutoff)
        & (mt.info.ReadPosRankSum >= ReadPosRankSum_cutoff))
    #additional filtering
    mt_snps_filtered2 = mt_snps_filtered.filter_entries(
        (mt_snps_filtered.GQ > GQ_cutoff))

    return mt_snps_filtered2
Exemplo n.º 26
0
def filter_genotypes_ab(mt: hl.MatrixTable) -> hl.MatrixTable:
    """
    Filter high-quality genotypes based on allelic-balance expression.
    Expected AD and GT in entries fields.
    Rules:
      hom_ref: ab <= 0.1
      hets: 0.2 >= ab <= 0.8
      hom_var: ab >= 0.9

    :param mt: Input MT
    :return: Genotype-filtered MT
    """
    ab = mt.AD[1] / hl.sum(mt.AD)
    filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                           (mt.GT.is_het() & (ab >= 0.2) &
                            (ab <= 0.8)) | (mt.GT.is_hom_var() & (ab >= 0.9)))
    return mt.filter_entries(filter_condition_ab, keep=True)
Exemplo n.º 27
0
def filter_mt_to_trios(mt: hl.MatrixTable, fam_ht: hl.Table) -> hl.MatrixTable:
    """
    Filters a MatrixTable to a set of trios in `fam_ht`, filters to autosomes, and annotates with adj.

    :param mt: A Matrix Table to filter to only trios
    :param fam_ht: A Table of trios to filter to, loaded using `hl.import_fam`
    :return: A MT filtered to trios and adj annotated
    """
    # Filter MT to samples present in any of the trios
    fam_ht = fam_ht.annotate(fam_members=[fam_ht.id, fam_ht.pat_id, fam_ht.mat_id])
    fam_ht = fam_ht.explode("fam_members", name="s")
    fam_ht = fam_ht.key_by("s").select().distinct()

    mt = mt.filter_cols(hl.is_defined(fam_ht[mt.col_key]))
    mt = filter_to_autosomes(mt)
    mt = annotate_adj(mt)

    return mt
Exemplo n.º 28
0
def run_pca(mt: hl.MatrixTable, out_prefix: str, overwrite: bool = False):
    """
    Run PCA on a dataset
    :param mt: dataset to run PCA on
    :param out_prefix: directory and filename prefix for where to put PCA output
    :return:
    """
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        mt.GT, k=20, compute_loadings=True)
    pca_mt = mt.annotate_rows(pca_af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_mt.rows()[pca_loadings.key].pca_af)

    pca_scores.write(out_prefix + 'scores.ht', overwrite)
    pca_scores = hl.read_table(out_prefix + 'scores.ht')
    pca_scores = pca_scores.transmute(
        **{f'PC{i}': pca_scores.scores[i - 1]
           for i in range(1, 21)})
    pca_scores.export(out_prefix + 'scores.txt.bgz')  # individual-level PCs

    pca_loadings.write(out_prefix + 'loadings.ht', overwrite)  # PCA loadings
Exemplo n.º 29
0
def subset_samples_and_variants(
    mt: hl.MatrixTable,
    sample_path: str,
    header: bool = True,
    table_key: str = "s",
    sparse: bool = False,
    gt_expr: str = "GT",
) -> hl.MatrixTable:
    """
    Subset the MatrixTable to the provided list of samples and their variants.

    :param mt: Input MatrixTable
    :param sample_path: Path to a file with list of samples
    :param header: Whether file with samples has a header. Default is True
    :param table_key: Key to sample Table. Default is "s"
    :param sparse: Whether the MatrixTable is sparse. Default is False
    :param gt_expr: Name of field in MatrixTable containing genotype expression. Default is "GT"
    :return: MatrixTable subsetted to specified samples and their variants
    """
    sample_ht = hl.import_table(sample_path,
                                no_header=not header,
                                key=table_key)
    sample_count = sample_ht.count()
    missing_ht = sample_ht.anti_join(mt.cols())
    missing_ht_count = missing_ht.count()
    full_count = mt.count_cols()

    if missing_ht_count != 0:
        missing_samples = missing_ht.s.collect()
        raise DataException(
            f"Only {sample_count - missing_ht_count} out of {sample_count} "
            "subsetting-table IDs matched IDs in the MT.\n"
            f"IDs that aren't in the MT: {missing_samples}\n")

    mt = mt.semi_join_cols(sample_ht)
    if sparse:
        mt = mt.filter_rows(
            hl.agg.any(mt[gt_expr].is_non_ref() | hl.is_defined(mt.END)))
    else:
        mt = mt.filter_rows(hl.agg.any(mt[gt_expr].is_non_ref()))

    logger.info(
        "Finished subsetting samples. Kept %d out of %d samples in MT",
        mt.count_cols(),
        full_count,
    )
    return mt
Exemplo n.º 30
0
def subset_samples(
    input_mt: hl.MatrixTable,
    pedigree: hl.Table,
    sex_ht: hl.Table,
    output_dir: str,
    output_name: str,
) -> Tuple[hl.MatrixTable, hl.Table, list, list]:
    """
    Filter the MatrixTable and sex Table to only samples in the pedigree.

    :param input_mt: MatrixTable
    :param pedigree: Pedigree file from seqr loaded as a Hail Table
    :param sex_ht: Table of inferred sexes for each sample
    :param output_dir: Path to directory to output results
    :param output_name: Output prefix to use for results
    :return: MatrixTable and sex ht subsetted to the samples given in the pedigree, list of samples in the pedigree, list of samples in the VCF
    """
    # Get sample names to subset from the pedigree
    samples_to_subset = hl.set(pedigree.Individual_ID.collect())
    # Subset mt and ht to samples in the pedigree
    mt_subset = input_mt.filter_cols(samples_to_subset.contains(input_mt["s"]))
    sex_ht = sex_ht.filter(samples_to_subset.contains(sex_ht["s"]))

    # Filter to variants that have at least one alt call after the subsetting
    mt_subset = mt_subset.filter_rows(hl.agg.any(mt_subset.GT.is_non_ref()))

    # Check that the samples in the pedigree are present in the VCF subset and output samples that are missing
    out_missing_samples = hl.hadoop_open(
        f"{output_dir}/{output_name}_missing_samples_in_subset.txt", "w")
    expected_samples = pedigree.Individual_ID.collect()
    vcf_samples = mt_subset.s.collect()

    missings = set(expected_samples) - set(vcf_samples)
    for i in missings:
        out_missing_samples.write(i + "\n")
    out_missing_samples.close()

    return (mt_subset, sex_ht, expected_samples, vcf_samples)
Exemplo n.º 31
0
def explode_trio_matrix(tm: hl.MatrixTable, col_keys: List[str] = ['s'], keep_trio_cols: bool = True, keep_trio_entries: bool = False) -> hl.MatrixTable:
    """Splits a trio MatrixTable back into a sample MatrixTable.

    Example
    -------
    >>> # Create a trio matrix from a sample matrix
    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    >>> # Explode trio matrix back into a sample matrix
    >>> exploded_trio_dataset = explode_trio_matrix(trio_dataset)

    Notes
    -----
    The resulting MatrixTable column schema is the same as the proband/father/mother schema,
    and the resulting entry schema is the same as the proband_entry/father_entry/mother_entry schema.
    If the `keep_trio_cols` option is set, then an additional `source_trio` column is added with the trio column data.
    If the `keep_trio_entries` option is set, then an additional `source_trio_entry` column is added with the trio entry data.

    Note
    ----
    This assumes that the input MatrixTable is a trio MatrixTable (similar to the result of :meth:`.methods.trio_matrix`)
    Its entry schema has to contain 'proband_entry`, `father_entry` and `mother_entry` all with the same type.
    Its column schema has to contain 'proband`, `father` and `mother` all with the same type.

    Parameters
    ----------
    tm : :class:`.MatrixTable`
        Trio MatrixTable (entries have to be a Struct with `proband_entry`, `mother_entry` and `father_entry` present)
    col_keys : :obj:`list` of str
        Column key(s) for the resulting sample MatrixTable
    keep_trio_cols: bool
        Whether to add a `source_trio` column with the trio column data (default `True`)
    keep_trio_entries: bool
        Whether to add a `source_trio_entries` column with the trio entry data (default `False`)

    Returns
    -------
    :class:`.MatrixTable`
        Sample MatrixTable"""

    select_entries_expr = {'__trio_entries': hl.array([tm.proband_entry, tm.father_entry, tm.mother_entry])}
    if keep_trio_entries:
        select_entries_expr['source_trio_entry'] = hl.struct(**tm.entry)
    tm = tm.select_entries(**select_entries_expr)

    tm = tm.key_cols_by()
    select_cols_expr = {'__trio_members': hl.zip_with_index(hl.array([tm.proband, tm.father, tm.mother]))}
    if keep_trio_cols:
        select_cols_expr['source_trio'] = hl.struct(**tm.col)
    tm = tm.select_cols(**select_cols_expr)

    mt = tm.explode_cols(tm.__trio_members)

    mt = mt.transmute_entries(
        **mt.__trio_entries[mt.__trio_members[0]]
    )

    mt = mt.key_cols_by()
    mt = mt.transmute_cols(**mt.__trio_members[1])

    if col_keys:
        mt = mt.key_cols_by(*col_keys)

    return mt
Exemplo n.º 32
0
def explode_trio_matrix(tm: hl.MatrixTable, col_keys: List[str] = ['s']) -> hl.MatrixTable:
    """Splits a trio MatrixTable back into a sample MatrixTable.

    Example
    -------
    >>> # Create a trio matrix from a sample matrix
    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    >>> # Explode trio matrix back into a sample matrix
    >>> exploded_trio_dataset = explode_trio_matrix(trio_dataset)

    Notes
    -----
    This assumes that the input MatrixTable is a trio MatrixTable (similar to the result of :meth:`.methods.trio_matrix`)
    In particular, it should have the following entry schema:
    - proband_entry
    - father_entry
    - mother_entry
    And the following column schema:
    - proband
    - father
    - mother

    Note
    ----
    The only entries kept are `proband_entry`, `father_entry` and `mother_entry` are dropped.
    The only columns kepy are `proband`, `father` and `mother`

    Parameters
    ----------
    tm : :class:`.MatrixTable`
        Trio MatrixTable (entries have to be a Struct with `proband_entry`, `mother_entry` and `father_entry` present)
    call_field : :obj:`list` of str
        Column key(s) for the resulting sample MatrixTable

    Returns
    -------
    :class:`.MatrixTable`
        Sample MatrixTable"""

    tm = tm.select_entries(
        __trio_entries=hl.array([tm.proband_entry, tm.father_entry, tm.mother_entry])
    )

    tm = tm.select_cols(
        __trio_members=hl.zip_with_index(hl.array([tm.proband, tm.father, tm.mother]))
    )
    mt = tm.explode_cols(tm.__trio_members)

    mt = mt.select_entries(
        **mt.__trio_entries[mt.__trio_members[0]]
    )

    mt = mt.key_cols_by()
    mt = mt.select_cols(**mt.__trio_members[1])

    if col_keys:
        mt = mt.key_cols_by(*col_keys)

    return mt
Exemplo n.º 33
0
def phase_trio_matrix_by_transmission(tm: hl.MatrixTable, call_field: str = 'GT', phased_call_field: str = 'PBT_GT') -> hl.MatrixTable:
    """Adds a phased genoype entry to a trio MatrixTable based allele transmission in the trio.

    Example
    -------
    >>> # Create a trio matrix
    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    >>> # Phase trios by transmission
    >>> phased_trio_dataset = phase_trio_matrix_by_transmission(trio_dataset)

    Notes
    -----
    Uses only a `Call` field to phase and only phases when all 3 members of the trio are present and have a call.

    In the phased genotypes, the order is as follows:
    - Proband: father_allele | mother_allele
    - Parents: transmitted_allele | untransmitted_allele

    Phasing of sex chromosomes:
    - Sex chromosomes of male individuals should be haploid to be phased correctly.
    - If a proband is diploid on non-par regions of the sex chromosomes, it is assumed to be female.

    Genotypes that cannot be phased are set to `NA`.
    The following genotype calls combinations cannot be phased by transmission (all trio members phased calls set to missing):
    1. One of the calls in the trio is missing
    2. The proband genotype cannot be obtained from the parents alleles (Mendelian violation)
    3. All individuals of the trio are heterozygous for the same two alleles
    4. Father is diploid on non-PAR region of X or Y
    5. Proband is diploid on non-PAR region of Y

    In addition, individual phased genotype calls are returned as missing in the following situations:
    1. All mother genotype calls non-PAR region of Y
    2. Diploid father genotype calls on non-PAR region of X for a male proband (proband and mother are still phased as father doesn't participate in allele transmission)

    Parameters
    ----------
    tm : :class:`.MatrixTable`
        Trio MatrixTable (entries have to be a Struct with `proband_entry`, `mother_entry` and `father_entry` present)
    call_field : str
        genotype field name in the matrix entries to use for phasing
    phased_call_field : str
        name for the phased genotype field in the matrix entries

    Returns
    -------
    :class:`.MatrixTable`
        Trio MatrixTable entry with additional phased genotype field for each individual"""

    tm = tm.annotate_entries(
        __phased_GT=phase_by_transmission(
            tm.locus,
            tm.alleles,
            tm.proband_entry[call_field],
            tm.father_entry[call_field],
            tm.mother_entry[call_field]
        )
    )

    return tm.select_entries(
        proband_entry=hl.struct(
            **tm.proband_entry,
            **{phased_call_field: tm.__phased_GT[0]}
        ),
        father_entry=hl.struct(
            **tm.father_entry,
            **{phased_call_field: tm.__phased_GT[1]}
        ),
        mother_entry=hl.struct(
            **tm.mother_entry,
            **{phased_call_field: tm.__phased_GT[2]}
        )
    )
Exemplo n.º 34
0
def full_outer_join_mt(left: hl.MatrixTable, right: hl.MatrixTable) -> hl.MatrixTable:
    """Performs a full outer join on `left` and `right`.

    Replaces row, column, and entry fields with the following:

     - `left_row` / `right_row`: structs of row fields from left and right.
     - `left_col` / `right_col`: structs of column fields from left and right.
     - `left_entry` / `right_entry`: structs of entry fields from left and right.

    Parameters
    ----------
    left : :class:`.MatrixTable`
    right : :class:`.MatrixTable`

    Returns
    -------
    :class:`.MatrixTable`
    """

    if [x.dtype for x in left.row_key.values()] != [x.dtype for x in right.row_key.values()]:
        raise ValueError(f"row key types do not match:\n"
                         f"  left:  {list(left.row_key.values())}\n"
                         f"  right: {list(right.row_key.values())}")

    if [x.dtype for x in left.col_key.values()] != [x.dtype for x in right.col_key.values()]: 
        raise ValueError(f"column key types do not match:\n"
                         f"  left:  {list(left.col_key.values())}\n"
                         f"  right: {list(right.col_key.values())}")

    left = left.select_rows(left_row=left.row)
    left_t = left.localize_entries('left_entries', 'left_cols')
    right = right.select_rows(right_row=right.row)
    right_t = right.localize_entries('right_entries', 'right_cols')

    ht = left_t.join(right_t, how='outer')
    ht = ht.annotate_globals(
        left_keys=hl.group_by(
            lambda t: t[0],
            hl.zip_with_index(
                ht.left_cols.map(lambda x: hl.tuple([x[f] for f in left.col_key])), index_first=False)).map_values(
            lambda elts: elts.map(lambda t: t[1])),
        right_keys=hl.group_by(
            lambda t: t[0],
            hl.zip_with_index(
                ht.right_cols.map(lambda x: hl.tuple([x[f] for f in right.col_key])), index_first=False)).map_values(
            lambda elts: elts.map(lambda t: t[1])))
    ht = ht.annotate_globals(
        key_indices=hl.array(ht.left_keys.key_set().union(ht.right_keys.key_set()))
            .map(lambda k: hl.struct(k=k, left_indices=ht.left_keys.get(k), right_indices=ht.right_keys.get(k)))
            .flatmap(lambda s: hl.case()
                     .when(hl.is_defined(s.left_indices) & hl.is_defined(s.right_indices),
                           hl.range(0, s.left_indices.length()).flatmap(
                               lambda i: hl.range(0, s.right_indices.length()).map(
                                   lambda j: hl.struct(k=s.k, left_index=s.left_indices[i],
                                                       right_index=s.right_indices[j]))))
                     .when(hl.is_defined(s.left_indices),
                           s.left_indices.map(
                               lambda elt: hl.struct(k=s.k, left_index=elt, right_index=hl.null('int32'))))
                     .when(hl.is_defined(s.right_indices),
                           s.right_indices.map(
                               lambda elt: hl.struct(k=s.k, left_index=hl.null('int32'), right_index=elt)))
                     .or_error('assertion error')))
    ht = ht.annotate(__entries=ht.key_indices.map(lambda s: hl.struct(left_entry=ht.left_entries[s.left_index],
                                                                      right_entry=ht.right_entries[s.right_index])))
    ht = ht.annotate_globals(__cols=ht.key_indices.map(
        lambda s: hl.struct(**{f: s.k[i] for i, f in enumerate(left.col_key)},
                            left_col=ht.left_cols[s.left_index],
                            right_col=ht.right_cols[s.right_index])))
    ht = ht.drop('left_entries', 'left_cols', 'left_keys', 'right_entries', 'right_cols', 'right_keys', 'key_indices')
    return ht._unlocalize_entries('__entries', '__cols', list(left.col_key))