Example #1
def flatten_phased_ht(phased_ht: hl.Table) -> hl.Table:
    phased_ht = phased_ht.key_by()

    # phase_ht = phase_ht.key_by()

    def flatten_phase_dict(
            expr: hl.expr.StructExpression) -> hl.expr.StructExpression:
        return hl.struct(
            raw=flatten_gt_counts(expr.gt_counts.raw),
            adj=flatten_gt_counts(expr.gt_counts.adj),
            em_p_chet_raw=expr.em.raw.p_chet,
            em_p_chet_adj=expr.em.adj.p_chet,
            # em1_p_chet_raw=expr.em_plus_one.raw.p_chet,
            # em1_p_chet_adj=expr.em_plus_one.adj.p_chet
        )

    return phased_ht.transmute(
        chrom=phased_ht.locus1.contig,
        pos1=phased_ht.locus1.position,
        ref1=phased_ht.alleles1[0],
        alt1=phased_ht.alleles1[1],
        pos2=phased_ht.locus2.position,
        ref2=phased_ht.alleles2[0],
        alt2=phased_ht.alleles2[1],
        **{k: v
           for k, v in flatten_phase_dict(phased_ht.phase_info).items()
           }).flatten()
Example #2
def check_sex(
    sex_ht: hl.Table,
    output_dir: str,
    output_name: str,
) -> None:
    """
    Compare inferred to given sex and output file with column added for discrepancies.

    Output directory and name here are used to locate the functioning pedigree with given sexes.

    :param sex_ht: Table of inferred sexes for each sample
    :param output_dir: Path to directory to output results
    :param output_name: Output prefix to use for results
    :return: None
    """
    # Read in functioning pedigree with given sexes
    ped_ht = hl.import_table(
        f"{output_dir}/{output_name}_functioning_pedigree.ped")
    ped_ht = ped_ht.key_by(s=ped_ht.Individual_ID).select("Sex")

    ped_ht = ped_ht.annotate(
        given_sex=hl.case().when(ped_ht.Sex == "M", "male").when(
            ped_ht.Sex == "F", "female").default(ped_ht.Sex)).drop("Sex")

    sex_ht = sex_ht.join(ped_ht, how="outer")
    sex_ht = sex_ht.annotate(discrepant_sex=sex_ht.sex != sex_ht.given_sex)
    sex_ht.export(f"{output_dir}/{output_name}_sex_check.txt")
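
A minimal usage sketch for check_sex; the bucket, prefix, and table path below are hypothetical, and the inferred-sex Table is assumed to be keyed by sample with a 'sex' field:

import hail as hl

sex_ht = hl.read_table("gs://my-bucket/qc/inferred_sex.ht")  # hypothetical inferred-sex Table keyed by 's'
check_sex(sex_ht, output_dir="gs://my-bucket/qc", output_name="my_cohort")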
Example #3
def apply_rf_model(ht: hl.Table,
                   rf_model: pyspark.ml.PipelineModel,
                   features: List[str],
                   label: str,
                   probability_col_name: str = 'rf_probability',
                   prediction_col_name: str = 'rf_prediction') -> hl.Table:
    """
    Applies a Random Forest (RF) pipeline model to a Table and annotates the RF probabilities and predictions.

    :param Table ht: Input Table
    :param PipelineModel rf_model: Random Forest pipeline model
    :param list of str features: List of feature columns in the pipeline. !Should match the model list of features!
    :param str label: Column containing the labels. !Should match the model labels!
    :param str probability_col_name: Name of the column that will store the RF probabilities
    :param str prediction_col_name: Name of the column that will store the RF predictions
    :return: Table with RF columns
    :rtype: Table
    """

    logger.info("Applying RF model.")

    check_ht_fields_for_spark(ht, features + [label])

    index_name = 'rf_idx'
    while index_name in ht.row:
        index_name += '_tmp'
    ht = ht.add_index(name=index_name)

    ht_keys = ht.key
    ht = ht.key_by(index_name)

    df = ht_to_rf_df(ht, features, label, index_name)

    rf_df = rf_model.transform(df)

    def to_array(col):
        def to_array_(v):
            return v.toArray().tolist()

        return udf(to_array_, ArrayType(DoubleType()))(col)

    rf_ht = hl.Table.from_spark(
        rf_df.withColumn("probability", to_array(col("probability"))).select(
            [index_name, 'probability', 'predictedLabel'])).persist()

    rf_ht = rf_ht.key_by(index_name)

    ht = ht.annotate(
        **{
            probability_col_name: {
                label: rf_ht[ht[index_name]]["probability"][i]
                for i, label in enumerate(get_labels(rf_model))
            },
            prediction_col_name: rf_ht[ht[index_name]]["predictedLabel"]
        })

    ht = ht.key_by(*ht_keys)
    ht = ht.drop(index_name)

    return ht
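
A usage sketch for apply_rf_model, assuming the module helpers it calls (check_ht_fields_for_spark, ht_to_rf_df, get_labels) and the pyspark imports (udf, col, ArrayType, DoubleType) are available; the paths and feature names are hypothetical and must match the trained model:

import hail as hl
import pyspark.ml

ht = hl.read_table("gs://my-bucket/variant_qc/annotated.ht")            # hypothetical input Table
rf_model = pyspark.ml.PipelineModel.load("gs://my-bucket/rf/rf.model")  # hypothetical saved pipeline
features = ["QD", "FS", "MQRankSum"]  # must match the features the model was trained on
ht = apply_rf_model(ht, rf_model, features=features, label="rf_label")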
Example #4
def annotate_from_dict(ht: hl.Table, dict_field: str,
                       output_field: str) -> hl.Table:
    """
    Expand a dict field and add new fields.

    :param ht: HailTable
    :param dict_field: The dict field to be expanded
    :param output_field: The output field name (annotated as structure)
    :return: Annotated HailTable
    """

    # retrieve dict keys to be annotated as fields
    dict_keys = ht[dict_field].keys().take(1)[0]

    # structure annotation expression
    struct_expr = hl.struct(
        **{
            dict_keys[i]: ht[dict_field].get(dict_keys[i])
            for i in range(len(dict_keys))
        })

    ht = ht.annotate(_tmp_field_=struct_expr)
    ht = ht.rename({'_tmp_field_': output_field})

    return ht
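
A quick usage sketch for annotate_from_dict on a toy Table (field names are illustrative only):

import hail as hl

ht = hl.utils.range_table(3)
ht = ht.annotate(metrics=hl.dict({'AC': ht.idx, 'AN': ht.idx * 2}))
ht = annotate_from_dict(ht, dict_field='metrics', output_field='metrics_struct')
ht.show()  # metrics_struct.AC and metrics_struct.AN are now struct fields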
Example #5
def compute_phase(variants_ht: hl.Table,
                  least_consequence: str = LEAST_CONSEQUENCE,
                  max_freq: float = MAX_FREQ) -> hl.Table:
    n_variant_pairs = variants_ht.count()
    logger.info(f"Looking up phase for {n_variant_pairs} variant pair(s).")

    # Join with gnomad phased variants
    vp_ht = hl.read_table(phased_vp_count_ht_path('exomes'))
    phased_ht = vp_ht.semi_join(variants_ht)
    n_phased = phased_ht.count()
    phased_ht = explode_phase_info(phased_ht)  # explodes phase_info by pop
    phased_ht = phased_ht.transmute(
        phase_info=phased_ht.phase_info.select('gt_counts', 'em')).repartition(
            ceil(n_variant_pairs / 10000), shuffle=True)
    phased_ht = phased_ht.persist()  # .checkpoint("gs://gnomad-tmp/vp_ht.ht")

    # If not all pairs had at least one carrier of both, then compute phase estimate from single variants
    logger.info(
        f"{n_phased}/{n_variant_pairs} variant pair(s) found with carriers of both in gnomAD."
    )

    if n_phased < n_variant_pairs:
        unphased_ht = variants_ht.anti_join(vp_ht)
        unphased_ht = annotate_unphased_pairs(unphased_ht, n_variant_pairs,
                                              least_consequence, max_freq)
        phased_ht = phased_ht.union(unphased_ht, unify=True)

    return phased_ht
Example #6
def filter_kin_ht(
    ht: hl.Table,
    out_summary: io.TextIOWrapper,
    first_degree_pi_hat: float = 0.40,
    grandparent_pi_hat: float = 0.20,
    grandparent_ibd1: float = 0.25,
    grandparent_ibd2: float = 0.15,
) -> hl.Table:
    """
    Filter the kinship table to relationships of grandparents and above.

    :param ht: hl.Table
    :param out_summary: Summary file with a summary statistics and notes
    :param first_degree_pi_hat: Minimum pi_hat threshold to use to filter the kinship table to first degree relatives
    :param grandparent_pi_hat: Minimum pi_hat threshold to use to filter the kinship table to grandparents
    :param grandparent_ibd1: Minimum IBD1 threshold to use to filter the kinship table to grandparents
    :param grandparent_ibd2: Maximum IBD2 threshold to use to filter the kinship table to grandparents
    :return: Table containing only relationships of grandparents and above
    """
    # Filter to anything above the relationship of a grandparent
    ht = ht.filter((ht.pi_hat > first_degree_pi_hat)
                   | ((ht.pi_hat > grandparent_pi_hat)
                      & (ht.ibd1 > grandparent_ibd1)
                      & (ht.ibd2 < grandparent_ibd2)))
    ht = ht.annotate(pair=hl.sorted([ht.i, ht.j]))

    out_summary.write(
        f"NOTE: kinship table was filtered to:\n(kin > {first_degree_pi_hat}) or (kin > {grandparent_pi_hat} and IBD1 > {grandparent_ibd1} and IBD2 < {grandparent_ibd2})\n"
    )
    out_summary.write(
        "relationships not meeting these criteria were not evaluated\n\n")

    return ht
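
A usage sketch for filter_kin_ht; the kinship Table path is hypothetical, and the Table is assumed to carry 'i', 'j', 'pi_hat', 'ibd1' and 'ibd2' fields:

import hail as hl

kin_ht = hl.read_table("gs://my-bucket/relatedness/kinship.ht")  # hypothetical
with open("kin_summary.txt", "w") as out_summary:
    close_relatives_ht = filter_kin_ht(kin_ht, out_summary)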
Example #7
def generate_allele_data(ht: hl.Table) -> hl.Table:
    """
    Returns bi-allelic sites HT with the following annotations:
     - allele_data (nonsplit_alleles, has_star, variant_type, and n_alt_alleles)

    :param Table ht: Full unsplit HT
    :return: Table with allele data annotations
    :rtype: Table
    """
    ht = ht.select()
    allele_data = hl.struct(nonsplit_alleles=ht.alleles,
                            has_star=hl.any(lambda a: a == "*", ht.alleles))
    ht = ht.annotate(allele_data=allele_data.annotate(
        **add_variant_type(ht.alleles)))

    ht = hl.split_multi_hts(ht)
    ht = ht.filter(hl.len(ht.alleles) > 1)
    allele_type = (hl.case().when(
        hl.is_snp(ht.alleles[0], ht.alleles[1]),
        "snv").when(hl.is_insertion(ht.alleles[0], ht.alleles[1]),
                    "ins").when(hl.is_deletion(ht.alleles[0], ht.alleles[1]),
                                "del").default("complex"))
    ht = ht.annotate(allele_data=ht.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=ht.allele_data.variant_type == "mixed"))
    return ht
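
A usage sketch for generate_allele_data, assuming an unsplit MatrixTable and that add_variant_type from the same module is importable; the dataset path is hypothetical:

import hail as hl

mt = hl.read_matrix_table("gs://my-bucket/raw/my_dataset.mt")  # hypothetical, multi-allelics not yet split
allele_data_ht = generate_allele_data(mt.rows())
allele_data_ht.describe()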
Example #8
def combine(ts):
    def merge_alleles(alleles):
        from hail.expr.functions import _num_allele_type, _allele_ints
        return hl.rbind(
            alleles.map(lambda a: hl.or_else(a[0], ''))
                   .fold(lambda s, t: hl.cond(hl.len(s) > hl.len(t), s, t), ''),
            lambda ref:
            hl.rbind(
                alleles.map(
                    lambda al: hl.rbind(
                        al[0],
                        lambda r:
                        hl.array([ref]).extend(
                            al[1:].map(
                                lambda a:
                                hl.rbind(
                                    _num_allele_type(r, a),
                                    lambda at:
                                    hl.cond(
                                        (_allele_ints['SNP'] == at) |
                                        (_allele_ints['Insertion'] == at) |
                                        (_allele_ints['Deletion'] == at) |
                                        (_allele_ints['MNP'] == at) |
                                        (_allele_ints['Complex'] == at),
                                        a + ref[hl.len(r):],
                                        a)))))),
                lambda lal:
                hl.struct(
                    globl=hl.array([ref]).extend(hl.array(hl.set(hl.flatten(lal)).remove(ref))),
                    local=lal)))

    def renumber_entry(entry, old_to_new) -> StructExpression:
        # global index of alternate (non-ref) alleles
        return entry.annotate(LA=entry.LA.map(lambda lak: old_to_new[lak]))

    if (ts.row.dtype, ts.globals.dtype) not in _merge_function_map:
        f = hl.experimental.define_function(
            lambda row, gbl:
            hl.rbind(
                merge_alleles(row.data.map(lambda d: d.alleles)),
                lambda alleles:
                hl.struct(
                    locus=row.locus,
                    alleles=alleles.globl,
                    rsid=hl.find(hl.is_defined, row.data.map(lambda d: d.rsid)),
                    __entries=hl.bind(
                        lambda combined_allele_index:
                        hl.range(0, hl.len(row.data)).flatmap(
                            lambda i:
                            hl.cond(hl.is_missing(row.data[i].__entries),
                                    hl.range(0, hl.len(gbl.g[i].__cols))
                                      .map(lambda _: hl.null(row.data[i].__entries.dtype.element_type)),
                                    hl.bind(
                                        lambda old_to_new: row.data[i].__entries.map(
                                            lambda e: renumber_entry(e, old_to_new)),
                                        hl.range(0, hl.len(alleles.local[i])).map(
                                            lambda j: combined_allele_index[alleles.local[i][j]])))),
                        hl.dict(hl.range(0, hl.len(alleles.globl)).map(
                            lambda j: hl.tuple([alleles.globl[j], j])))))),
            ts.row.dtype, ts.globals.dtype)
        _merge_function_map[(ts.row.dtype, ts.globals.dtype)] = f
    merge_function = _merge_function_map[(ts.row.dtype, ts.globals.dtype)]
    ts = Table(TableMapRows(ts._tir, Apply(merge_function._name,
                                           merge_function._ret_type,
                                           TopLevelReference('row'),
                                           TopLevelReference('global'))))
    return ts.transmute_globals(__cols=hl.flatten(ts.g.map(lambda g: g.__cols)))
Example #9
def explode_duplicate_samples_ht(dups_ht: hl.Table) -> hl.Table:
    """
    Explodes the result of `get_duplicated_samples_ht`, so that each line contains a single sample.
    An additional annotation is added: `dup_filtered` indicating which of the duplicated samples was kept.
    Requires a field `filtered` whose type is the same as the input duplicated samples Table key.

    :param dups_ht: Input HT
    :return: Flattened HT
    """
    def get_dups_to_keep_expr():
        if dups_ht.filtered.dtype.element_type == dups_ht.key.dtype:
            return (dups_ht.key, False)
        elif (len(dups_ht.key) == 1) & (dups_ht.filtered.dtype.element_type
                                        == dups_ht.key[0].dtype):
            return (dups_ht.key[0], False)
        else:
            raise TypeError(
                f"Cannot explode table as types of the filtered field ({dups_ht.filtered.dtype}) and the key ({dups_ht.key.dtype}) are incompatible."
            )

    dups_ht = dups_ht.annotate(dups=hl.array([get_dups_to_keep_expr()]).extend(
        dups_ht.filtered.map(lambda x: (x, True))))
    dups_ht = dups_ht.explode("dups")
    dups_ht = dups_ht.key_by()
    return dups_ht.select(s=dups_ht.dups[0],
                          dup_filtered=dups_ht.dups[1]).key_by("s")
Example #10
def join_tables(ht: hl.Table, exomes: bool) -> hl.Table:
    '''
    Joins seqr variant table to gnomAD table. NOTE: this code was written assuming the most recent gnomAD release is v3.

    :param Table ht: Table with variants downloaded from seqr
    :param bool exomes: Whether to join with gnomAD exomes table or genomes table
    :return: seqr variants Table joined with gnomAD table
    :rtype: hl.Table
    '''
    if exomes:
        # read in exomes table
        gnomad_ht = hl.read_table(
            get_gnomad_liftover_data_path('exomes', version='2.1.1'))
        gnomad_ht = gnomad_ht.select('freq', 'popmax')
        gnomad_ht = gnomad_ht.select_globals()
        gnomad_ht = gnomad_ht.transmute(
            gnomad_exomes_AC=gnomad_ht.freq[0].AC,
            gnomad_exomes_AN=gnomad_ht.freq[0].AN,
            gnomad_exomes_popmax_AF=gnomad_ht.popmax[0].AF,
            gnomad_exomes_popmax_pop=gnomad_ht.popmax[0].pop)
        gnomad_ht.describe()
    else:
        # read in genomes table
        gnomad_ht = hl.read_table(
            'gs://gnomad-public/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht'
        )
        gnomad_ht = gnomad_ht.select('freq')
        gnomad_ht = gnomad_ht.transmute(gnomad_genomes_AC=gnomad_ht.freq[0].AC,
                                        gnomad_genomes_AN=gnomad_ht.freq[0].AN)
        gnomad_ht = gnomad_ht.select_globals()
        gnomad_ht.describe()

    ht = ht.annotate(**gnomad_ht[ht.key])
    ht.describe()
    return ht
Example #11
def make_sample_rank_table(phe_ht: hl.Table) -> hl.Table:
    """
    Make table with rank of sample sorted by retention priority
    (lower rank has higher priority).
    It mainly uses two bits of information:
      - cases are prioritised over controls
      - samples are preferred based on the cohort info as follows: chd > ddd > ukbb
    :param phe_ht: Table with sample meta-data annotations (e.g. phenotype, cohort info...)
    :return: Hail Table
    """

    phe_ht = (
        phe_ht.annotate(
            case_control_rank=hl.int(
                phe_ht['phe.is_case']),  # 0: control, 1: cases
            cohort_rank=hl.case().when(phe_ht.is_ukbb, 10).when(
                phe_ht.is_ddd, 100).when(phe_ht.is_chd,
                                         1000).or_missing()).key_by())

    phe_ht = (phe_ht.select('ega_id', 'case_control_rank', 'cohort_rank'))

    # sort table (descending)
    tb_rank = (phe_ht.order_by(hl.desc(phe_ht.case_control_rank),
                               hl.desc(phe_ht.cohort_rank)))

    tb_rank = (tb_rank.add_index(name='rank').key_by('ega_id'))

    tb_rank = tb_rank.annotate(rank=tb_rank.rank + 1)

    return tb_rank
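
A usage sketch for make_sample_rank_table; the metadata path is hypothetical, and the Table is assumed to carry 'phe.is_case', 'is_ukbb', 'is_ddd', 'is_chd' and 'ega_id' fields:

import hail as hl

phe_ht = hl.read_table("gs://my-bucket/meta/sample_phenotypes.ht")  # hypothetical
rank_ht = make_sample_rank_table(phe_ht)
rank_ht.show()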
Example #12
def add_global_af(ht: hl.Table, temp: str) -> hl.Table:
    '''
    Adds gnomAD global AF annotation to Table

    :param Table ht: Input Table
    :param str temp: Path to temp bucket (to store intermediary files)
    :return: Table with gnomAD global AF annotation
    :rtype: Table
    '''
    # checkpoint table after completing both gnomAD exomes and gnomAD genomes join
    temp_path = f'{temp}/join.ht'
    ht = ht.checkpoint(temp_path)

    # set gnomAD ACs and ANs to 0 if they are missing after the join
    ht = ht.transmute(
        gnomad_exomes_AC=hl.if_else(hl.is_defined(ht.gnomad_exomes_AC),
                                    ht.gnomad_exomes_AC, 0),
        gnomad_genomes_AC=hl.if_else(hl.is_defined(ht.gnomad_genomes_AC),
                                     ht.gnomad_genomes_AC, 0),
        gnomad_exomes_AN=hl.if_else(hl.is_defined(ht.gnomad_exomes_AN),
                                    ht.gnomad_exomes_AN, 0),
        gnomad_genomes_AN=hl.if_else(hl.is_defined(ht.gnomad_genomes_AN),
                                     ht.gnomad_genomes_AN, 0),
    )

    ht = ht.annotate(gnomad_global_AF=(
        hl.if_else(((ht.gnomad_exomes_AN == 0)
                    & (ht.gnomad_genomes_AN == 0)), 0.0,
                   hl.float((ht.gnomad_exomes_AC + ht.gnomad_genomes_AC) /
                            (ht.gnomad_exomes_AN + ht.gnomad_genomes_AN)))))
    ht.describe()
    return ht
Example #13
def prepare_exomes(exome_ht: hl.Table, groupings: List, impose_high_af_cutoff_upfront: bool = True) -> hl.Table:

    # Manipulate VEP annotations and explode by them
    exome_ht = add_most_severe_csq_to_tc_within_ht(exome_ht)
    exome_ht = exome_ht.transmute(transcript_consequences=exome_ht.vep.transcript_consequences)
    exome_ht = exome_ht.explode(exome_ht.transcript_consequences)
    
    # Annotate variants with grouping variables.
    exome_ht, grouping = annotate_constraint_groupings(exome_ht, groupings)  # This function needs to be adapted
    exome_ht = exome_ht.select(
        'context', 'ref', 'alt', 'methylation_level', 'freq', 'pass_filters', *grouping)

    # Filter by allele count
    # Likely to need to adapt this function as well
    af_cutoff = 0.001
    # NOTE: `dataset` is assumed to be defined in the enclosing scope (e.g. 'gnomad')
    freq_index = exome_ht.freq_index_dict.collect()[0][dataset]

    def keep_criteria(ht):
        crit = (ht.freq[freq_index].AC > 0) & ht.pass_filters & (ht.coverage > 0)
        if impose_high_af_cutoff_upfront:
            crit &= (ht.freq[freq_index].AF <= af_cutoff)
        return crit

    exome_ht = exome_ht.filter(keep_criteria(exome_ht))
    return exome_ht
Example #14
def annotate_relatedness(
    relatedness_ht: hl.Table,
    first_degree_kin_thresholds: Tuple[float, float] = (0.1767767, 0.4),
    second_degree_kin_cutoff: float = 0.1,
    ibd0_0_max: float = 0.05,
) -> hl.Table:
    relatedness_ht = relatedness_ht.annotate(
        relationship=get_relationship_expr(
            kin_expr=relatedness_ht.kin,
            ibd0_expr=relatedness_ht.ibd0,
            ibd1_expr=relatedness_ht.ibd1,
            ibd2_expr=relatedness_ht.ibd2,
            first_degree_kin_thresholds=tuple(first_degree_kin_thresholds),
            second_degree_min_kin=second_degree_kin_cutoff,
            ibd0_0_max=ibd0_0_max,
        )
    )
    relatedness_ht = relatedness_ht.annotate_globals(
        min_individual_maf=0.01,
        min_emission_kinship=0.05,
        ibd0_0_max=ibd0_0_max,
        second_degree_kin_cutoff=second_degree_kin_cutoff,
        first_degree_kin_thresholds=tuple(first_degree_kin_thresholds),
    )
    return relatedness_ht
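
A sketch of producing a compatible input with hl.pc_relate and then annotating it; the MatrixTable path is hypothetical, and statistics="all" is used so that kin, ibd0, ibd1 and ibd2 are all present:

import hail as hl

mt = hl.read_matrix_table("gs://my-bucket/qc/ld_pruned.mt")  # hypothetical QC-ready MT
relatedness_ht = hl.pc_relate(
    mt.GT, min_individual_maf=0.01, k=10, min_kinship=0.05, statistics="all")
relatedness_ht = annotate_relatedness(relatedness_ht)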
Example #15
def add_project_and_family_annotations(ht: hl.Table, seqr_projects: dict,
                                       family_ids: dict) -> hl.Table:
    """
    Add seqr project and family ID annotations to the kinship table.

    :param ht: Hail Table of kinship values
    :param seqr_projects: Dictionary of seqr projects for each sample
    :param family_ids: Dictionary of family ids for each sample
    :return: Table with seqr project and family id annotations added
    """
    # Add annotation for seqr projects of sample i and sample j
    hl_seqr_projects = hl.literal(seqr_projects)
    ht = ht.annotate(
        seqr_proj_i=hl_seqr_projects.get(ht.i),
        seqr_proj_j=hl_seqr_projects.get(ht.j),
    )

    # Add annotation for family ids of sample i and sample j
    hl_family_ids = hl.literal(family_ids)
    ht = ht.annotate(
        fam_id_i=hl_family_ids.get(ht.i),
        fam_id_j=hl_family_ids.get(ht.j),
    )

    return ht
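
A usage sketch for add_project_and_family_annotations, assuming 'i' and 'j' in the kinship Table are plain sample ID strings; the path and mappings are illustrative only:

import hail as hl

kin_ht = hl.read_table("gs://my-bucket/relatedness/kinship.ht")  # hypothetical
seqr_projects = {"sample_1": "seqr_project_A", "sample_2": "seqr_project_B"}  # illustrative mapping
family_ids = {"sample_1": "fam_1", "sample_2": "fam_2"}  # illustrative mapping
kin_ht = add_project_and_family_annotations(kin_ht, seqr_projects, family_ids)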
Example #16
def assign_platform_from_pcs(
    platform_pca_scores_ht: hl.Table,
    pc_scores_ann: str = "scores",
    hdbscan_min_cluster_size: Optional[int] = None,
    hdbscan_min_samples: Optional[int] = None,
) -> hl.Table:
    """
    Assigns platforms using HBDSCAN on the results of call rate PCA.
    :param platform_pca_scores_ht: Input table with the PCA score for each sample
    :param pc_scores_ann: Field containing the scores
    :param hdbscan_min_cluster_size: HDBSCAN `min_cluster_size` parameter. If not specified the smallest of 500 and 0.1*n_samples will be used.
    :param hdbscan_min_samples: HDBSCAN `min_samples` parameter
    :return: A Table with a `qc_platform` annotation containing the platform based on HDBSCAN clustering
    """

    logger.info("Assigning platforms based on platform PCA clustering")

    # Read and format data for clustering
    data = platform_pca_scores_ht.to_pandas()
    callrate_data = np.matrix(data[pc_scores_ann].tolist())
    logger.info("Assigning platforms to {} samples.".format(
        len(callrate_data)))

    # Cluster data
    if hdbscan_min_cluster_size is None:
        hdbscan_min_cluster_size = int(min(500, 0.1 * data.shape[0]))
    clusterer = hdbscan.HDBSCAN(min_cluster_size=hdbscan_min_cluster_size,
                                min_samples=hdbscan_min_samples)
    cluster_labels = clusterer.fit_predict(callrate_data)
    n_clusters = len(set(cluster_labels)) - (
        -1 in cluster_labels
    )  # NOTE: -1 is the label for noisy (un-classifiable) data points
    logger.info("Found {} unique platforms during platform imputation.".format(
        n_clusters))

    data["qc_platform"] = cluster_labels

    # Note: write the pandas DataFrame to disk and re-import it as a Hail Table.
    # This is a temporary workaround until the issue with 'hl.Table.from_pandas'
    # and differing Python versions between driver/executors is sorted out.
    (data.drop(axis=1, labels=pc_scores_ann).to_csv(
        f'{local_dir}/tmp/data_tmp_hdbscan.tsv', index=False, sep='\t'))
    ht_tmp = (hl.import_table(f'{nfs_dir}/tmp/data_tmp_hdbscan.tsv',
                              impute=True).key_by(*platform_pca_scores_ht.key))

    ht = platform_pca_scores_ht.join(ht_tmp)

    # original/elegant solution (TODO: sort issue with 'from_pandas' function)
    # ht = hl.Table.from_pandas(data, key=[*platform_pca_scores_ht.key])

    # expand array structure and annotate scores (PCs) as individual fields.
    # drop array scores field before to export the results.
    n_pcs = len(ht[pc_scores_ann].take(1)[0])
    ht = (ht.annotate(
        **{f'platform_PC{i + 1}': ht[pc_scores_ann][i]
           for i in range(n_pcs)}).drop(pc_scores_ann))

    ht = ht.annotate(qc_platform="platform_" + hl.str(ht.qc_platform))
    return ht
Example #17
def compute_callrate_mt(
    mt: hl.MatrixTable,
    intervals_ht: hl.Table,
    bi_allelic_only: bool = True,
    autosomes_only: bool = True,
    match: bool = True,
) -> hl.MatrixTable:
    """
    Compute a sample/interval MT with each entry containing the call rate for that sample/interval.

    This can be used as input for imputing exome sequencing platforms.

    .. note::

        The input interval HT should have a key of type Interval.
        The resulting table will have a key of the same type as the `intervals_ht` table and
        contain an `interval_info` field containing all non-key fields of the `intervals_ht`.

    :param mt: Input MT
    :param intervals_ht: Table containing the intervals. This table has to be keyed by locus.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation
    :param autosomes_only: If set, only autosomal intervals are used.
    :param match: If set, returns all intervals in intervals_ht that overlap the locus in the input MT.
    :return: Callrate MT
    """
    logger.info("Computing call rate MatrixTable")

    if len(intervals_ht.key) != 1 or not isinstance(
            intervals_ht.key[0], hl.expr.IntervalExpression):
        logger.warning(
            "Call rate matrix computation expects `intervals_ht` with a key of type Interval. Found: %s",
            intervals_ht.key,
        )

    if autosomes_only:
        callrate_mt = filter_to_autosomes(mt)
    else:
        callrate_mt = mt

    if bi_allelic_only:
        callrate_mt = callrate_mt.filter_rows(bi_allelic_expr(callrate_mt))

    intervals_ht = intervals_ht.annotate(_interval_key=intervals_ht.key)
    callrate_mt = callrate_mt.annotate_rows(_interval_key=intervals_ht.index(
        callrate_mt.locus, all_matches=match)._interval_key)

    if match:
        callrate_mt = callrate_mt.explode_rows("_interval_key")

    callrate_mt = callrate_mt.filter_rows(
        hl.is_defined(callrate_mt._interval_key.interval))
    callrate_mt = callrate_mt.select_entries(
        GT=hl.or_missing(hl.is_defined(callrate_mt.GT), hl.struct()))
    callrate_mt = callrate_mt.group_rows_by(
        **callrate_mt._interval_key).aggregate(
            callrate=hl.agg.fraction(hl.is_defined(callrate_mt.GT)))
    intervals_ht = intervals_ht.drop("_interval_key")
    callrate_mt = callrate_mt.annotate_rows(interval_info=hl.struct(
        **intervals_ht[callrate_mt.row_key]))
    return callrate_mt
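
A usage sketch for compute_callrate_mt; hl.import_locus_intervals returns a Table keyed by interval, which satisfies the key check above (the interval list and MatrixTable paths are hypothetical):

import hail as hl

mt = hl.read_matrix_table("gs://my-bucket/raw/my_dataset.mt")  # hypothetical MT with GT entries
intervals_ht = hl.import_locus_intervals(
    "gs://my-bucket/resources/exome_calling_regions.interval_list",
    reference_genome="GRCh38")
callrate_mt = compute_callrate_mt(mt, intervals_ht)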
Example #18
def generate_final_rf_ht(
    ht: hl.Table,
    snp_cutoff: Union[int, float],
    indel_cutoff: Union[int, float],
    inbreeding_coeff_cutoff: float = INBREEDING_COEFF_HARD_CUTOFF,
    determine_cutoff_from_bin: bool = False,
    aggregated_bin_ht: Optional[hl.Table] = None,
    bin_id: Optional[hl.expr.Int32Expression] = None,
) -> hl.Table:
    """
    Prepares finalized RF model given an RF result table from `rf.apply_rf_model` and cutoffs for filtering.
    If `determine_cutoff_from_bin` is True, `aggregated_bin_ht` must be supplied to determine the SNP and indel RF
    probabilities to use as cutoffs from an aggregated quantile bin Table like one created by
    `compute_grouped_binned_ht` in combination with `score_bin_agg`.

    :param ht: RF result table from `rf.apply_rf_model` to prepare as the final RF Table
    :param snp_cutoff: RF probability or bin (if `determine_cutoff_from_bin` True) to use for SNP variant QC filter
    :param indel_cutoff: RF probability or bin (if `determine_cutoff_from_bin` True) to use for indel variant QC filter
    :param inbreeding_coeff_cutoff: InbreedingCoeff hard filter to use for variants
    :param determine_cutoff_from_bin: If True RF probability will be determined using bin info in `aggregated_bin_ht`
    :param aggregated_bin_ht: File with aggregate counts of variants based on quantile bins
    :param bin_id: Name of bin to use in 'bin_id' column of `aggregated_bin_ht` to use to determine probability cutoff
    :return: Finalized random forest Table annotated with variant filters
    """
    # Determine SNP and indel RF cutoffs if given bin instead of RF probability

    snp_cutoff_global = hl.struct(min_score=snp_cutoff)
    indel_cutoff_global = hl.struct(min_score=indel_cutoff)

    # Add filters to RF HT
    filters = dict()

    if ht.any(hl.is_missing(ht.rf_probability["TP"])):
        raise ValueError("Missing RF probability!")

    filters["RF"] = (
        hl.is_snp(ht.alleles[0], ht.alleles[1])
        & (ht.rf_probability["TP"] < snp_cutoff_global.min_score)) | (
            ~hl.is_snp(ht.alleles[0], ht.alleles[1])
            & (ht.rf_probability["TP"] < indel_cutoff_global.min_score))

    # Fix annotations for release
    annotations_expr = {
        "rf_positive_label": hl.or_else(ht.tp, False),
        "rf_negative_label": ht.fail_hard_filters,
        "rf_probability": ht.rf_probability["TP"],
    }

    ht = ht.transmute(filters=add_filters_expr(filters=filters),
                      **annotations_expr)

    ht = ht.annotate_globals(rf_snv_cutoff=snp_cutoff_global,
                             rf_indel_cutoff=indel_cutoff_global)

    return ht
Example #19
def default_generate_sib_stats(
    mt: hl.MatrixTable,
    relatedness_ht: hl.Table,
    sex_ht: hl.Table,
    i_col: str = "i",
    j_col: str = "j",
    relationship_col: str = "relationship",
) -> hl.Table:
    """
    This is meant as a default wrapper for `generate_sib_stats_expr`. It returns a hail table with counts of variants
    shared by pairs of siblings in `relatedness_ht`.

    This function takes a hail Table with a row for each pair of individuals i,j in the data that are related (it's OK to have unrelated samples too).
    The `relationship_col` should be a column specifying the relationship between each two samples as defined by
    the constants in `gnomad.utils.relatedness`. This relationship_col will be used to filter to only pairs of
    samples that are annotated as `SIBLINGS`.

    :param mt: Input Matrix table
    :param relatedness_ht: Input relationship table
    :param sex_ht: A Table containing sex information for the samples
    :param i_col: Column containing the 1st sample of the pair in the relationship table
    :param j_col: Column containing the 2nd sample of the pair in the relationship table
    :param relationship_col: Column containing the relationship for the sample pair as defined in this module constants.
    :return: A Table with the sibling shared variant counts
    """
    sex_ht = sex_ht.annotate(
        is_female=hl.case()
        .when(sex_ht.sex_karyotype == "XX", True)
        .when(sex_ht.sex_karyotype == "XY", False)
        .or_missing()
    )

    # TODO: Change to use SIBLINGS constant when relatedness PR goes in
    sib_ht = relatedness_ht.filter(relatedness_ht[relationship_col] == "Siblings")
    s_to_keep = sib_ht.aggregate(
        hl.agg.explode(
            lambda s: hl.agg.collect_as_set(s), [sib_ht[i_col].s, sib_ht[j_col].s]
        ),
        _localize=False,
    )
    mt = mt.filter_cols(s_to_keep.contains(mt.s))
    mt = annotate_adj(mt)

    mt = mt.annotate_cols(is_female=sex_ht[mt.s].is_female)

    sib_stats_ht = mt.select_rows(
        **generate_sib_stats_expr(
            mt,
            sib_ht,
            i_col=i_col,
            j_col=j_col,
            strata={"raw": True, "adj": mt.adj},
            is_female=mt.is_female,
        )
    ).rows()

    return sib_stats_ht
Example #20
def explode_phase_info(ht: hl.Table, remove_all_ref: bool = True) -> hl.Table:
    ht = ht.transmute(phase_info=hl.array(ht.phase_info))
    ht = ht.explode('phase_info')
    ht = ht.transmute(pop=ht.phase_info[0], phase_info=ht.phase_info[1])

    if remove_all_ref:
        ht = ht.filter(hl.sum(ht.phase_info.gt_counts.raw[1:]) > 0)

    return ht
Example #21
def get_known_populations(
        ht: hl.Table,
        pop: str):  # TODO: bring data into separate file and load here
    if pop == 'eur' or pop == 'nfe':
        known_pops = hl.literal({
            'ICR1000': 'nwe',  # gb, ICR
            'ICR142': 'nwe',  # gb, ICR
            'C1017': 'seu',  # it, ATVB
            'C1568': 'seu',  # es, Regicor
            'Bulgarian_Trios': 'bgr',  # Bulgarians
            'C533': 'bgr',  # Bulgarians
            'C821': 'bgr',  # Bulgarians
            'C952': 'bgr',  # Bulgarians
            'G89634': 'est',  # Estonians
            'G94980': 'est',  # Estonians
            # 'C1830': 'neu',  # gb, Leicester
            'C1972': 'nwe',  # gb, Tayside region of Scotland
            # 'G94051': 'seu',  # es, "United States and Spain"
            # 'C1708': 'deu',  # Germans?
            'C1508': 'swe',  # Swedes
            'C1509': 'swe',  # Swedes
        })
    elif pop == 'eas':
        known_pops = hl.literal({
            'C1397': 'oea',  # 'twn',  # Taiwanese trios
            'C1443': 'oea',  # 'twn',  # Taiwanese trios
            'C1506': 'oea',  # 'twn',  # Taiwanese trios
            'C1867': 'oea',  # 'twn',  # Taiwanese trios
            'C978': 'oea',  # 'twn',  # Taiwanese trios
            'C774': 'kor',  # Korean T2D project
            'C1982': 'kor',  # Korean
            'C1940': 'oea',  # 'sgp',  # Singapore
            'C1980': 'oea',  # 'hkg',  # Hong Kong
            '1kg_JPT': 'jpn'
        })
    elif pop == 'afr':
        known_pops = hl.literal({
            'C773': 't2d',  # African American T2D
            'C1002': 't2d',  # African American T2D
            'C1567': 'jhs',  # African American JHS
            'C1956': 'biome',  # African American BioMe
            # TODO: Add 1kg populations here
        })
    else:
        raise ValueError('pop must be one of eur, nfe, eas, afr')
    ht = ht.annotate(known_pop=known_pops.get(ht.meta.project_id))
    if pop == 'eur':
        finns = hl.import_table(
            'gs://gnomad/sample_qc/input_meta/source/99percent_finns_plus_AD_IBD_NFID.tsv.bgz',
            impute=True)
        finns = finns.filter(
            finns.percent_finnish > 0.99).key_by('sample_name_in_vcf')
        ht = ht.annotate(
            known_pop=hl.cond(hl.is_defined(finns[ht.s]), 'fin', ht.known_pop))
    return ht
Example #22
def add_release_annotations(ht: hl.Table) -> hl.Table:
    """

    :param Table ht: Table containing meta column annotations for the dataset
    :return: Table containing final 'high_quality' and 'release' sample status annotations
    :rtype: Table
    """
    ht = ht.annotate(high_quality=(hl.len(ht.hard_filters) == 0)
                     & (hl.len(ht.pop_platform_filters) == 0))
    return ht.annotate(release=ht.high_quality & (hl.len(ht.perm_filters) == 0)
                       & ~ht.related)
Example #23
def add_rank(
    ht: hl.Table,
    score_expr: hl.expr.NumericExpression,
    subrank_expr: Optional[Dict[str, hl.expr.BooleanExpression]] = None,
) -> hl.Table:
    """
    Adds rank based on the `score_expr`. Rank is added for SNVs and indels separately.
    If one or more `subrank_expr` are provided, then subrank is added based on all sites for which the boolean expression is true.

    In addition, variant counts (SNV and indel separately) are added as a global (`rank_variant_counts`).

    :param ht: input Hail Table containing variants (with QC annotations) to be ranked
    :param score_expr: the Table annotation by which ranking should be scored
    :param subrank_expr: Any subranking to be added in the form name_of_subrank: subrank_filtering_expr
    :return: Table with rankings added
    """

    key = ht.key
    if subrank_expr is None:
        subrank_expr = {}

    temp_expr = {"_score": score_expr}
    temp_expr.update({f"_{name}": expr for name, expr in subrank_expr.items()})
    rank_ht = ht.select(
        **temp_expr, is_snv=hl.is_snp(ht.alleles[0], ht.alleles[1]))

    rank_ht = rank_ht.key_by("_score").persist()
    scan_expr = {
        "rank": hl.cond(
            rank_ht.is_snv,
            hl.scan.count_where(rank_ht.is_snv),
            hl.scan.count_where(~rank_ht.is_snv),
        )
    }
    scan_expr.update(
        {
            name: hl.or_missing(
                rank_ht[f"_{name}"],
                hl.cond(
                    rank_ht.is_snv,
                    hl.scan.count_where(rank_ht.is_snv & rank_ht[f"_{name}"]),
                    hl.scan.count_where(~rank_ht.is_snv & rank_ht[f"_{name}"]),
                ),
            )
            for name in subrank_expr
        }
    )
    rank_ht = rank_ht.annotate(**scan_expr)

    rank_ht = rank_ht.key_by(*key).persist()
    rank_ht = rank_ht.select(*scan_expr.keys())

    ht = ht.annotate(**rank_ht[key])
    return ht
Example #24
def pheno_ht_to_mt(pheno_ht: hl.Table,
                   data_type: str,
                   special_fields: str = ('age', 'sex'),
                   rekey: bool = True):
    """
    Input Hail Table with lots of phenotype row fields, distill into
    MatrixTable with either categorical or continuous data types
    as entries

    :param Table pheno_ht: Input hail Table with phenotypes as row fields
    :param str data_type: one of "categorical" or "continuous"
    :return: Hail MatrixTable with phenotypes as entries
    :rtype: MatrixTable
    """
    if data_type == 'categorical':
        filter_type = {hl.tbool}
        value_type = hl.bool
    else:
        filter_type = {hl.tint, hl.tfloat}
        value_type = hl.float

    special_fields_to_include = []
    fields = set(pheno_ht.row_value)
    for field in special_fields:
        if field in fields:
            fields.remove(field)
            special_fields_to_include.append(field)
    select_fields = {
        x: value_type(pheno_ht[x])
        for x in fields if pheno_ht[x].dtype in filter_type
    }
    pheno_ht = pheno_ht.select(*special_fields_to_include, **select_fields)

    mt = pheno_ht.to_matrix_table_row_major(columns=list(select_fields),
                                            entry_field_name='value',
                                            col_field_name='phesant_pheno')
    if rekey:
        mt = mt.key_cols_by(
            trait_type=data_type,
            phenocode=mt.phesant_pheno.split('_')[0],
            pheno_sex='both_sexes',
            coding=hl.case().when(
                (data_type == 'categorical') &
                (hl.len(mt.phesant_pheno.split('_')) > 1),
                mt.phesant_pheno.split('_', 2)[1]
            )  # TODO: fix to 1 when https://github.com/hail-is/hail/issues/7893 is fixed
            .default(NULL_STR_KEY),
            modifier=hl.case().when(
                (data_type == 'continuous') &
                (hl.len(mt.phesant_pheno.split('_')) > 1),
                mt.phesant_pheno.split('_', 2)[1]
            )  # TODO: fix to 1 when https://github.com/hail-is/hail/issues/7893 is fixed
            .default(NULL_STR_KEY))
    return mt
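
A usage sketch for pheno_ht_to_mt, assuming the module constant NULL_STR_KEY is defined; the phenotype Table path is hypothetical:

import hail as hl

pheno_ht = hl.read_table("gs://my-bucket/pheno/phesant_phenotypes.ht")  # hypothetical
categorical_mt = pheno_ht_to_mt(pheno_ht, data_type='categorical')
continuous_mt = pheno_ht_to_mt(pheno_ht, data_type='continuous')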
Example #25
def densify_sites(
    mt: hl.MatrixTable,
    sites_ht: hl.Table,
    last_END_positions_ht: hl.Table,
    semi_join_rows: bool = True,
) -> hl.MatrixTable:
    """
    Creates a dense version of the input sparse MT at the sites in `sites_ht` reading the minimal amount of data required.

    Note that only rows that appear both in `mt` and `sites_ht` are returned.

    :param mt: Input sparse MT
    :param sites_ht: Desired sites to densify
    :param last_END_positions_ht: Table storing positions of the furthest ref block (END tag)
    :param semi_join_rows: Whether to filter the MT rows based on semi-join (default, better if sites_ht is large) or based on filter_intervals (better if sites_ht only contains a few sites)
    :return: Dense MT filtered to the sites in `sites_ht`
    """
    logger.info("Computing intervals to densify from sites Table.")
    sites_ht = sites_ht.key_by("locus")
    sites_ht = sites_ht.annotate(
        interval=hl.locus_interval(
            sites_ht.locus.contig,
            last_END_positions_ht[sites_ht.key].last_END_position,
            end=sites_ht.locus.position,
            includes_end=True,
            reference_genome=sites_ht.locus.dtype.reference_genome,
        )
    )
    sites_ht = sites_ht.filter(hl.is_defined(sites_ht.interval))

    if semi_join_rows:
        mt = mt.filter_rows(hl.is_defined(sites_ht.key_by("interval")[mt.locus]))
    else:
        logger.info("Collecting intervals to densify.")
        intervals = sites_ht.interval.collect()

        print(
            "Found {0} intervals, totalling {1} bp in the dense Matrix.".format(
                len(intervals),
                sum(
                    [
                        interval_length(interval)
                        for interval in union_intervals(intervals)
                    ]
                ),
            )
        )

        mt = hl.filter_intervals(mt, intervals)

    mt = hl.experimental.densify(mt)

    return mt.filter_rows(hl.is_defined(sites_ht[mt.locus]))
Example #26
def compare_row_counts(ht1: hl.Table, ht2: hl.Table) -> bool:
    """
    Check if the row counts in two Tables are the same.

    :param ht1: First Table to be checked
    :param ht2: Second Table to be checked
    :return: Whether the row counts are the same
    """
    r_count1 = ht1.count()
    r_count2 = ht2.count()
    logger.info(f"{r_count1} rows in left table; {r_count2} rows in right table")
    return r_count1 == r_count2
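
A usage sketch for compare_row_counts as a guard before joining two Tables (paths are hypothetical):

import hail as hl

ht1 = hl.read_table("gs://my-bucket/release/variants_v1.ht")  # hypothetical
ht2 = hl.read_table("gs://my-bucket/release/variants_v2.ht")  # hypothetical
if not compare_row_counts(ht1, ht2):
    raise ValueError("Row counts differ between the two Tables")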
Example #27
def make_hist_bin_edges_expr(
    ht: hl.Table,
    hists: List[str] = HISTS,
    prefix: str = "",
    label_delimiter: str = "_",
    include_age_hists: bool = True,
) -> Dict[str, str]:
    """
    Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string separated by pipe delimiters.

    :param ht: Table containing histogram variant annotations.
    :param hists: List of variant histogram annotations. Default is HISTS.
    :param prefix: Prefix text for age histogram bin edges.  Default is empty string.
    :param label_delimiter: String used as delimiter between prefix and histogram annotation.
    :param include_age_hists: Include age histogram annotations.
    :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values.
    """
    # Add underscore to prefix if it isn't empty
    if prefix != "":
        prefix += label_delimiter

    edges_dict = {}
    if include_age_hists:
        edges_dict.update(
            {
                f"{prefix}{call_type}": "|".join(
                    map(
                        lambda x: f"{x:.1f}",
                        ht.head(1)[f"age_hist_{call_type}"].collect()[0].bin_edges,
                    )
                )
                for call_type in ["het", "hom"]
            }
        )

    for hist in hists:

        # Parse hists calculated on both raw and adj-filtered data
        for hist_type in [f"{prefix}raw_qual_hists", f"{prefix}qual_hists"]:

            hist_name = hist
            if "raw" in hist_type:
                hist_name = f"{prefix}{hist}_raw"

            edges_dict[hist_name] = "|".join(
                map(
                    lambda x: f"{x:.2f}" if "ab" in hist else str(int(x)),
                    ht.head(1)[hist_type][hist].collect()[0].bin_edges,
                )
            )

    return edges_dict
Example #28
    def aggregate_contig(ht: hl.Table, contigs: Set[str] = None):
        """
        Aggregates all contigs together and computes number for bins accross the contigs.
        """
        if contigs:
            ht = ht.filter(hl.literal(contigs).contains(ht.contig))

        return ht.group_by(*[k for k in ht.key if k != 'contig']).aggregate(
            min_score=hl.agg.min(ht.min_score),
            max_score=hl.agg.max(ht.max_score),
            **{
                x: hl.agg.sum(ht[x])
                for x in ht.row_value if x not in ['min_score', 'max_score']
            })
Example #29
def explode_duplicate_samples_ht(dups_ht: hl.Table) -> hl.Table:
    """
    Flattens the result of `filter_duplicate_samples`, so that each line contains a single sample.
    An additional annotation is added: `dup_filtered` indicating which of the duplicated samples was kept.

    :param dups_ht: Input HT
    :return: Flattened HT
    """
    dups_ht = dups_ht.annotate(dups=hl.array([(
        dups_ht.key,
        False)]).extend(dups_ht.filtered.map(lambda x: (x, True))))
    dups_ht = dups_ht.explode('dups')
    dups_ht = dups_ht.key_by()
    return dups_ht.select(s=dups_ht.dups[0],
                          dup_filtered=dups_ht.dups[1]).key_by('s')
Example #30
def annotate_related_pairs(related_pairs: hl.Table,
                           index_col: str) -> hl.Table:
    # NOTE: `case_parents`, `meta_ht` and `sample_qc_ht` are Tables defined in the enclosing scope
    related_pairs = related_pairs.key_by(**related_pairs[index_col])
    related_pairs = related_pairs.filter(
        hl.is_missing(case_parents[related_pairs.key]))
    return related_pairs.annotate(
        **{
            index_col:
            related_pairs[index_col].annotate(
                case_rank=hl.or_else(
                    hl.int(meta_ht[related_pairs.key].is_case), -1),
                dp_mean=hl.or_else(
                    sample_qc_ht[
                        related_pairs.key].sample_qc.dp_stats.mean, -1.0))
        }).key_by()
Example #31
File: vcf_combiner.py, Project: jigold/hail
def combine(ts):
    def merge_alleles(alleles):
        from hail.expr.functions import _num_allele_type, _allele_ints
        return hl.rbind(
            alleles.map(lambda a: hl.or_else(a[0], ''))
                   .fold(lambda s, t: hl.cond(hl.len(s) > hl.len(t), s, t), ''),
            lambda ref:
            hl.rbind(
                alleles.map(
                    lambda al: hl.rbind(
                        al[0],
                        lambda r:
                        hl.array([ref]).extend(
                            al[1:].map(
                                lambda a:
                                hl.rbind(
                                    _num_allele_type(r, a),
                                    lambda at:
                                    hl.cond(
                                        (_allele_ints['SNP'] == at) |
                                        (_allele_ints['Insertion'] == at) |
                                        (_allele_ints['Deletion'] == at) |
                                        (_allele_ints['MNP'] == at) |
                                        (_allele_ints['Complex'] == at),
                                        a + ref[hl.len(r):],
                                        a)))))),
                lambda lal:
                hl.struct(
                    globl=hl.array([ref]).extend(hl.array(hl.set(hl.flatten(lal)).remove(ref))),
                    local=lal)))

    def renumber_entry(entry, old_to_new) -> StructExpression:
        # global index of alternate (non-ref) alleles
        return entry.annotate(LA=entry.LA.map(lambda lak: old_to_new[lak]))

    if (ts.row.dtype, ts.globals.dtype) not in _merge_function_map:
        f = hl.experimental.define_function(
            lambda row, gbl:
            hl.rbind(
                merge_alleles(row.data.map(lambda d: d.alleles)),
                lambda alleles:
                hl.struct(
                    locus=row.locus,
                    alleles=alleles.globl,
                    rsid=hl.find(hl.is_defined, row.data.map(lambda d: d.rsid)),
                    __entries=hl.bind(
                        lambda combined_allele_index:
                        hl.range(0, hl.len(row.data)).flatmap(
                            lambda i:
                            hl.cond(hl.is_missing(row.data[i].__entries),
                                    hl.range(0, hl.len(gbl.g[i].__cols))
                                      .map(lambda _: hl.null(row.data[i].__entries.dtype.element_type)),
                                    hl.bind(
                                        lambda old_to_new: row.data[i].__entries.map(
                                            lambda e: renumber_entry(e, old_to_new)),
                                        hl.range(0, hl.len(alleles.local[i])).map(
                                            lambda j: combined_allele_index[alleles.local[i][j]])))),
                        hl.dict(hl.range(0, hl.len(alleles.globl)).map(
                            lambda j: hl.tuple([alleles.globl[j], j])))))),
            ts.row.dtype, ts.globals.dtype)
        _merge_function_map[(ts.row.dtype, ts.globals.dtype)] = f
    merge_function = _merge_function_map[(ts.row.dtype, ts.globals.dtype)]
    ts = Table(TableMapRows(ts._tir, Apply(merge_function._name,
                                           TopLevelReference('row'),
                                           TopLevelReference('global'))))
    return ts.transmute_globals(__cols=hl.flatten(ts.g.map(lambda g: g.__cols)))