예제 #1
0
def public_release(data_type: str) -> VersionedTableResource:
    """
    Build the versioned resource for the publicly released Tables of one data type.

    :param data_type: One of "exomes" or "genomes"
    :return: Release Table resource, keyed by release version
    :raises DataException: If `data_type` is not listed in `DATA_TYPES`
    """
    if data_type not in DATA_TYPES:
        raise DataException(f"{data_type} not in {DATA_TYPES}")

    # Select the current release and the list of releases as a pair; anything
    # that is not "exomes" falls through to the genome constants.
    selected_release, available_releases = (
        (CURRENT_EXOME_RELEASE, EXOME_RELEASES)
        if data_type == "exomes"
        else (CURRENT_GENOME_RELEASE, GENOME_RELEASES)
    )

    tables_by_release = {}
    for version in available_releases:
        tables_by_release[version] = TableResource(
            path=_public_release_ht_path(data_type, version)
        )

    return VersionedTableResource(selected_release, tables_by_release)
예제 #2
0
def liftover(data_type: str) -> VersionedTableResource:
    """
    Get the 38 liftover of gnomAD v2.1.1.

    :param data_type: One of "exomes" or "genomes"
    :return: Liftover Table resource, keyed by release version
    :raises DataException: If `data_type` is not listed in `DATA_TYPES`
    """
    if data_type not in DATA_TYPES:
        raise DataException(f"{data_type} not in {DATA_TYPES}")

    if data_type == "exomes":
        current_release = CURRENT_EXOME_RELEASE
        # Build a filtered copy instead of calling `.remove("2.1")` on
        # EXOME_RELEASES: removing in place would alter the shared module-level
        # list for every other reader and raise ValueError on a second call.
        releases = [release for release in EXOME_RELEASES if release != "2.1"]
    else:
        current_release = CURRENT_GENOME_RELEASE
        releases = GENOME_RELEASES

    return VersionedTableResource(
        current_release,
        {
            release: TableResource(path=_liftover_data_path(data_type, release))
            for release in releases
        },
    )
예제 #3
0
def get_vqsr_filters(
    model_id: str,
    split: bool = True,
    finalized: bool = False,
) -> VersionedTableResource:
    """
    Get the VQSR filtering annotation resource for a given model.

    :param model_id: VQSR filtering model id
    :param split: Split or multi-allelic version of the filtering file
    :param finalized: Whether to return the raw VQSR table or the finalized VQSR table representing determined cutoffs
    :return: VQSR filtering annotation file
    """
    # Optional file-name markers; the finalized marker comes before the split one.
    finalized_marker = ".finalized" if finalized else ""
    split_marker = ".split" if split else ""

    filters_by_release = {}
    for version in RELEASES:
        ht_path = "{}/filtering/{}{}{}.ht".format(
            _annotations_root(version),
            model_id,
            finalized_marker,
            split_marker,
        )
        filters_by_release[version] = TableResource(ht_path)

    return VersionedTableResource(CURRENT_RELEASE, filters_by_release)
예제 #4
0
def get_rf_annotations(adj: bool = False) -> VersionedTableResource:
    """
    Get the VersionedTableResource for the RF-ready annotated Table.

    Annotations that are included in the Table:

        Features for RF:
            - InbreedingCoeff
            - variant_type
            - allele_type
            - n_alt_alleles
            - has_star
            - AS_QD
            - AS_pab_max
            - AS_MQRankSum
            - AS_SOR
            - AS_ReadPosRankSum

        Training sites (bool):
            - transmitted_singleton
            - fail_hard_filters - (ht.QD < 2) | (ht.FS > 60) | (ht.MQ < 30)

    :param bool adj: Whether to load 'adj' or 'raw'
    :return: Table with RF annotations
    """
    # File-name label distinguishing the 'adj' Table from the 'raw' one.
    label = 'adj' if adj else 'raw'

    annotations_by_release = {}
    for version in RELEASES:
        annotations_by_release[version] = TableResource(
            f"{get_variant_qc_root(version)}/rf/rf_annotations.{label}.ht"
        )

    return VersionedTableResource(CURRENT_RELEASE, annotations_by_release)
예제 #5
0
def get_freq(version: str = CURRENT_RELEASE,
             subset: Optional[str] = None) -> VersionedTableResource:
    """
    Get the frequency annotation resource for a specified release.

    :param version: Version of annotation path to return
    :param subset: One of the official subsets of the specified release (e.g., non_neuro, non_cancer, controls_and_biobanks)
    :return: Versioned Table resource containing subset or overall cohort frequency annotations
    :raises DataException: If a subset is requested for version "3", or the subset is not in `SUBSETS`
    """
    # Validation only applies when a subset was actually requested.
    if subset:
        if version == "3":
            raise DataException("Subsets of gnomAD v3 do not exist")
        if subset not in SUBSETS:
            raise DataException(
                f"{subset} subset is not one of the following official subsets: {SUBSETS}"
            )

    # Empty when no subset is given, so the file name has no extra component.
    subset_suffix = '.' + subset if subset else ''

    freq_by_release = {
        rel: TableResource(
            f"{_annotations_root(rel)}/gnomad_genomes_v{rel}.frequencies{subset_suffix}.ht"
        )
        for rel in RELEASES
    }
    return VersionedTableResource(version, freq_by_release)
예제 #6
0
def get_callset_truth_data(
        truth_sample: str,
        mt: bool = True) -> Union[MatrixTableResource, TableResource]:
    """
    Get resources for the truth sample data that is subset from the full callset.

    If `mt` is set, the truth sample MatrixTable (subset from the callset) is
    returned; otherwise the merged truth sample Table, which includes both the
    truth data and the data from the callset, is returned.

    :param str truth_sample: Name of the truth sample
    :param bool mt: Whether the resource is for a MatrixTable, default is True
    :return: VersionedMatrixTableResource when `mt` is truthy, otherwise a VersionedTableResource
    """
    # Handle the Table case first so the MatrixTable case is the fall-through.
    if not mt:
        merged_tables = {}
        for version in RELEASES:
            merged_tables[version] = TableResource(
                f"{get_variant_qc_root(version)}/truth_samples/{truth_sample}.ht"
            )
        return VersionedTableResource(CURRENT_RELEASE, merged_tables)

    truth_mts = {}
    for version in RELEASES:
        truth_mts[version] = MatrixTableResource(
            f"{get_variant_qc_root(version)}/truth_samples/{truth_sample}.mt"
        )
    return VersionedMatrixTableResource(CURRENT_RELEASE, truth_mts)
예제 #7
0
def coverage(data_type: str) -> VersionedTableResource:
    """
    Retrieve gnomAD's coverage table by data_type.

    :param data_type: One of "exomes" or "genomes"
    :return: Coverage Table
    :raises DataException: If `data_type` is not listed in `DATA_TYPES`
    """
    if data_type not in DATA_TYPES:
        raise DataException(f"{data_type} not in {DATA_TYPES}")

    # Both data types use "2.1" as the current release and skip "2.1.1";
    # only the list of candidate releases differs.
    candidate_releases = (
        EXOME_RELEASES if data_type == "exomes" else GENOME_RELEASES
    )
    current_release = "2.1"

    coverage_by_release = {}
    for version in candidate_releases:
        if version == "2.1.1":
            continue
        coverage_by_release[version] = TableResource(
            path=_public_coverage_ht_path(data_type, version)
        )

    return VersionedTableResource(current_release, coverage_by_release)
예제 #8
0
def coverage(data_type: str) -> VersionedTableResource:
    """
    Retrieve gnomAD's coverage table by data_type.

    :param data_type: One of "exomes" or "genomes"
    :return: Coverage Table
    :raises DataException: If `data_type` is not listed in `DATA_TYPES`
    """
    if data_type not in DATA_TYPES:
        raise DataException(
            f"{data_type} not in {DATA_TYPES}, please select a data type from {DATA_TYPES}"
        )

    # Exomes use the general exome release constants; every other data type
    # uses the genome coverage-specific constants.
    is_exomes = data_type == "exomes"
    selected_release = (
        CURRENT_EXOME_RELEASE if is_exomes else CURRENT_GENOME_COVERAGE_RELEASE
    )
    available_releases = EXOME_RELEASES if is_exomes else GENOME_COVERAGE_RELEASES

    coverage_by_release = {
        version: TableResource(path=_public_coverage_ht_path(data_type, version))
        for version in available_releases
    }
    return VersionedTableResource(selected_release, coverage_by_release)
예제 #9
0
def get_info(split: bool = True) -> TableResource:
    """
    Get the gnomAD v3 info TableResource.

    :param split: Whether to return the split or multi-allelic version of the resource
    :return: gnomAD v3 info TableResource
    """
    # The split version carries a '.split' marker before the extension.
    split_marker = '.split' if split else ''
    return TableResource(
        '{}/gnomad_genomes_v3_info{}.ht'.format(ANNOTATIONS_ROOT, split_marker)
    )
예제 #10
0
def get_filtering_model(model_id: str, split: bool = True) -> TableResource:
    """
    Get the specified filtering annotation resource.

    :param model_id: Filtering model id
    :param split: Split or multi-allelic version of the filtering file
    :return: Filtering annotation file
    """
    # The split version carries a '.split' marker before the extension.
    split_marker = '.split' if split else ''
    model_path = '{}/{}{}.ht'.format(VARIANT_QC_ROOT, model_id, split_marker)
    return TableResource(model_path)
예제 #11
0
def public_pca_loadings(subpop: str = "") -> TableResource:
    """
    Get the TableResource containing sites and loadings from population PCA.

    :param subpop: Can be empty ("") -> global, "eas" or "nfe"
    :return: gnomAD public PCA loadings TableResource
    :raises DataException: If `subpop` is not "", "eas" or "nfe"
    """
    supported_subpops = ("", "eas", "nfe")
    if subpop in supported_subpops:
        return TableResource(path=_public_pca_ht_path(subpop))

    raise DataException(
        'Available subpops are "eas" or "nfe", default value "" for global'
    )
예제 #12
0
def ancestry_pca_eigenvalues(
    include_unreleasable_samples: bool = False,
) -> VersionedTableResource:
    """
    Get the ancestry PCA eigenvalues VersionedTableResource.

    :param include_unreleasable_samples: Whether to get the PCA that included unreleasable in training
    :return: Ancestry PCA eigenvalues
    """
    eigenvalues_by_release = {}
    for version in RELEASES:
        ht_path = _get_ancestry_pca_ht_path(
            "eigenvalues", version, include_unreleasable_samples
        )
        eigenvalues_by_release[version] = TableResource(ht_path)

    return VersionedTableResource(CURRENT_RELEASE, eigenvalues_by_release)
예제 #13
0
def get_sample_qc(strat: str = "all") -> VersionedTableResource:
    """
    Get sample QC annotations generated by Hail for the specified stratification.

    Stratifications listed for this resource:
        - bi_allelic
        - multi_allelic
        - all

    :param strat: Which stratification to return
    :return: Sample QC table
    """
    # NOTE: `strat` is interpolated into the file name as-is; it is not validated here.
    sample_qc_by_release = {}
    for version in RELEASES:
        sample_qc_by_release[version] = TableResource(
            f"{get_sample_qc_root(version)}/sample_qc_{strat}.ht"
        )

    return VersionedTableResource(CURRENT_RELEASE, sample_qc_by_release)
예제 #14
0
def release_sites(public: bool = False) -> VersionedTableResource:
    """
    Retrieve versioned resource for sites-only release Table.

    :param public: Determines whether release sites Table is read from public or private bucket. Defaults to private
    :return: Sites-only release Table
    """
    sites_by_release = {}
    for version in RELEASES:
        sites_path = release_ht_path(release_version=version, public=public)
        sites_by_release[version] = TableResource(path=sites_path)

    return VersionedTableResource(CURRENT_RELEASE, sites_by_release)
예제 #15
0
def get_rf_training(model_id: str) -> VersionedTableResource:
    """
    Get the training data for a given RF run.

    :param model_id: RF run to load
    :return: VersionedTableResource for RF training data
    """
    training_by_release = {}
    for version in RELEASES:
        training_by_release[version] = TableResource(
            f"{get_variant_qc_root(version)}/rf/models/{model_id}/training.ht"
        )

    return VersionedTableResource(CURRENT_RELEASE, training_by_release)
예제 #16
0
def get_rf_result(model_id: Optional[str] = None) -> VersionedTableResource:
    """
    Get the results of RF filtering for a given run.

    :param model_id: RF run to load
    :return: VersionedTableResource for RF filtered data
    """
    # NOTE: `model_id` is interpolated as-is, so the default of None yields a
    # path component of "None".
    results_by_release = {}
    for version in RELEASES:
        results_by_release[version] = TableResource(
            f"{get_variant_qc_root(version)}/rf/models/{model_id}/rf_result.ht"
        )

    return VersionedTableResource(CURRENT_RELEASE, results_by_release)
예제 #17
0
def hgdp_1kg_subset_annotations(sample: bool = True) -> VersionedTableResource:
    """
    Get the HGDP + 1KG subset release sample or variant TableResource.

    Release "3" is skipped, so it has no entry in the returned resource.

    :param sample: If true, will return the sample annotations, otherwise will return the variant annotations
    :return: Table resource with sample/variant annotations for the subset
    """
    # Plain string literals: the original nested f'_sample_meta' had no
    # placeholders, so the f-prefix did nothing. Hoisting the suffix also
    # keeps the path template readable.
    annotation_suffix = "_sample_meta" if sample else "_variant_annotations"

    return VersionedTableResource(
        CURRENT_RELEASE,
        {
            release: TableResource(
                f"gs://gnomad/release/{release}/ht/gnomad.genomes.v{release}.hgdp_1kg_subset{annotation_suffix}.ht"
            )
            for release in RELEASES
            if release != "3"
        },
    )
예제 #18
0
def get_score_bins(model_id: str, aggregated: bool) -> VersionedTableResource:
    """
    Get the resource for a Table containing RF or VQSR scores, annotated with a bin based on rank of the metric scores.

    :param model_id: RF or VQSR model ID for which to return score data.
    :param bool aggregated: Whether to get the aggregated data.
         If True, will return the resource for the Table grouped by bin that contains aggregated variant counts per bin.
    :return: Versioned resource for the desired hail Table
    """
    # File-name component distinguishing the aggregated Table from the per-bin one.
    table_kind = 'aggregated' if aggregated else 'bins'

    bins_by_release = {}
    for version in RELEASES:
        bins_by_release[version] = TableResource(
            f"{get_variant_qc_root(version)}/score_bins/{model_id}.{table_kind}.ht"
        )

    return VersionedTableResource(CURRENT_RELEASE, bins_by_release)
예제 #19
0
def get_info(split: bool = True) -> VersionedTableResource:
    """
    Get the gnomAD v3 info VersionedTableResource.

    :param split: Whether to return the split or multi-allelic version of the resource
    :return: gnomAD v3 info VersionedTableResource
    """
    # The split version carries a ".split" marker before the extension.
    split_marker = ".split" if split else ""

    info_by_release = {}
    for version in RELEASES:
        info_path = "{}/gnomad_genomes_v{}_info{}.ht".format(
            _annotations_root(version), version, split_marker
        )
        info_by_release[version] = TableResource(path=info_path)

    return VersionedTableResource(CURRENT_RELEASE, info_by_release)
def get_rf(
    data: str = "rf_result",
    run_hash: Optional[str] = None,
) -> Union[str, TableResource]:
    """
    Get the path or resource for the desired RF data.

    Data can take the following values:
        - 'training': path to the training data for a given run
        - 'model': path to pyspark pipeline RF model
        - 'rf_result' (default): path to HT containing result of RF filtering

    :param str data: One of 'training', 'model' or 'rf_result' (default)
    :param str run_hash: Hash of RF run to load
    :return: Plain string path when `data` is 'model', otherwise a TableResource for the `.ht` path
    """
    # Everything except the model is a Hail Table wrapped in a TableResource;
    # the model itself is returned as a bare path string.
    if data != "model":
        return TableResource(f"{tmp_dir}/models/{run_hash}/{data}.ht")
    return f"{tmp_dir}/models/{run_hash}/{data}.model"
예제 #21
0
def get_binned_concordance(model_id: str,
                           truth_sample: str) -> VersionedTableResource:
    """
    Get the resource for a truth sample concordance Table.

    The Table contains TP, FP and FN between a truth sample within the callset
    and the sample's truth data, grouped by bins of a metric (RF or VQSR scores).

    :param model_id: RF or VQSR model ID for which to return score data.
    :param truth_sample: Which truth sample concordance to analyze (e.g., "NA12878" or "syndip")
    :return: Versioned resource for the binned truth data concordance Hail Table
    """
    concordance_by_release = {}
    for version in RELEASES:
        concordance_by_release[version] = TableResource(
            f"{get_variant_qc_root(version)}/binned_concordance/{truth_sample}_{model_id}_binned_concordance.ht"
        )

    return VersionedTableResource(CURRENT_RELEASE, concordance_by_release)
예제 #22
0
def ld_index(pop: str) -> TableResource:
    """
    Get the resource for the LD indices of the given population.

    :param pop: Population passed to `_ld_index_path`
    :return: TableResource for the "genomes" LD index path of `pop`
    """
    index_path = _ld_index_path("genomes", pop)
    return TableResource(path=index_path)
예제 #23
0
def get_score_quantile_bins(model_id: str, aggregated: bool) -> TableResource:
    """
    Get the TableResource for the score quantile bins of a model under `tmp_dir`.

    :param model_id: Model ID used as the file-name stem
    :param aggregated: Selects the 'binned' file when True, the 'rank' file otherwise
    :return: TableResource for `<tmp_dir>/<model_id>.<binned|rank>.ht`
    """
    table_kind = 'binned' if aggregated else 'rank'
    bins_path = '{}/{}.{}.ht'.format(f"{tmp_dir}", model_id, table_kind)
    return TableResource(bins_path)
예제 #24
0
# GRCh37 truth / training-site resources. In each definition, `path` is the
# Hail-format location and `import_func` / `import_args` name the importer and
# the arguments for the source VCF.
# NOTE(review): presumably the resource class calls import_func(**import_args)
# to (re)build `path` — confirm against the resource class definition.

# NA12878 GIAB callset ("highconf" v3.3 per the file name), as a MatrixTable
# imported with hl.import_vcf.
na12878_giab = MatrixTableResource(
    path="gs://gnomad-public/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.mt",
    import_func=hl.import_vcf,
    import_args={
        "path": "gs://gnomad-public/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.vcf.bgz",
        "force_bgz": True,
        "min_partitions": 100,
        "reference_genome": "GRCh37",
    },
)

# HapMap 3.3 (b37) sites, as a Table imported with import_sites_vcf.
hapmap = TableResource(
    path="gs://gnomad-public/resources/grch37/hapmap/hapmap_3.3.b37.ht",
    import_func=import_sites_vcf,
    import_args={
        "path": "gs://gnomad-public/resources/grch37/hapmap/hapmap_3.3.b37.vcf.bgz",
        "force_bgz": True,
        "min_partitions": 100,
        "reference_genome": "GRCh37",
    },
)

# 1000 Genomes Omni 2.5 (b37) sites, as a Table imported with import_sites_vcf.
kgp_omni = TableResource(
    path="gs://gnomad-public/resources/grch37/kgp/1000G_omni2.5.b37.ht",
    import_func=import_sites_vcf,
    import_args={
        "path": "gs://gnomad-public/resources/grch37/kgp/1000G_omni2.5.b37.vcf.bgz",
        "force_bgz": True,
        "min_partitions": 100,
        "reference_genome": "GRCh37",
    },
)
예제 #25
0
        },
    )


def get_rf_result(model_id: Optional[str] = None) -> VersionedTableResource:
    """
    Get the results of RF filtering for a given run.

    :param model_id: RF run to load
    :return: VersionedTableResource for RF filtered data
    """
    # NOTE: `model_id` is interpolated as-is, so the default of None yields a
    # path component of "None".
    results_by_release = {}
    for version in RELEASES:
        results_by_release[version] = TableResource(
            f"{get_variant_qc_root(version)}/rf/models/{model_id}/rf_result.ht"
        )

    return VersionedTableResource(CURRENT_RELEASE, results_by_release)


# Versioned resource for the final filter Table: one `final_filter.ht` under
# the variant QC root of every release in RELEASES.
final_filter = VersionedTableResource(
    CURRENT_RELEASE,
    dict(
        (version, TableResource(f"{get_variant_qc_root(version)}/final_filter.ht"))
        for version in RELEASES
    ),
)
예제 #26
0
def _import_clinvar(**kwargs) -> hl.Table:
    """
    Import the ClinVar sites VCF as a Table and run it through VEP.

    :param kwargs: Keyword arguments forwarded unchanged to `import_sites_vcf`
    :return: Table restricted to rows with more than one allele, passed through `vep_or_lookup_vep` with reference "GRCh38"
    """
    sites_ht = import_sites_vcf(**kwargs)
    # Get around problematic single entry in alleles array in the clinvar vcf
    has_alt_allele = hl.len(sites_ht.alleles) > 1
    filtered_ht = sites_ht.filter(has_alt_allele)
    return vep_or_lookup_vep(filtered_ht, reference="GRCh38")


# Resources with no versioning needed

# Purcell 5k intervals on GRCh38: `path` is the Hail Table location, and
# `import_func` / `import_args` name the importer and the source
# `.interval_list` file.
# NOTE(review): presumably the resource class calls import_func(**import_args)
# to (re)build `path` — confirm against the resource class definition.
purcell_5k_intervals = TableResource(
    path=
    "gs://gnomad-public/resources/grch38/purcell_5k_intervals/purcell5k.ht",
    import_func=_import_purcell_5k,
    import_args={
        "path":
        "gs://gnomad-public/resources/grch38/purcell_5k_intervals/purcell5k.interval_list",
    },
)

na12878_giab = MatrixTableResource(
    path=
    "gs://gnomad-public/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.mt",
    import_func=hl.import_vcf,
    import_args={
        "path":
        "gs://gnomad-public/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz",
        "force_bgz": True,
        "min_partitions": 100,
        "reference_genome": "GRCh38",
예제 #27
0
파일: meta.py 프로젝트: edenkal13/gnomad_qc
    """
    Gets the path to the finalized sample metadata information after sample QC

    :param version: gnomAD release version
    :param meta_version: metadata version to return
    :return: String path to the finalized metadata
    """
    return (
        f"{_meta_root_path(version)}/gnomad_v{version}_metadata_v{meta_version}.tsv.gz"
    )


# Maps a gnomAD genomes release version string ("3.1", "3") to the
# TableResource for that release's sample metadata Table.
# NOTE(review): how this mapping is consumed is not visible in this chunk.
_meta_versions = {
    "3.1":
    TableResource(
        path=
        "gs://gnomad/metadata/genomes_v3.1/gnomad_v3.1_sample_qc_metadata.ht"),
    "3":
    TableResource(
        path="gs://gnomad/metadata/genomes_v3/gnomad_v3_metadata_2019-09-27.ht"
    ),
}

_project_meta_versions = {
    "3.1":
    TableResource(
        path="gs://gnomad/metadata/genomes_v3.1/v3.1_project_meta.ht"),
    "3":
    TableResource(
        path="gs://gnomad/metadata/genomes_v3/09-09-2019_v3_project_meta.ht",
        import_func=hl.import_table,
예제 #28
0
def ld_index(pop: str) -> TableResource:
    """
    Get the resource for the LD indices of the given population.

    :param pop: Population passed to `_ld_index_path`
    :return: TableResource for the 'genomes' LD index path of `pop`
    """
    index_path = _ld_index_path('genomes', pop)
    return TableResource(path=index_path)
예제 #29
0
def ld_scores(pop: str) -> TableResource:
    """
    Get the resource for the LD scores of the given population.

    :param pop: Population passed to `_ld_scores_path`
    :return: TableResource for the 'genomes' LD scores path of `pop`
    """
    scores_path = _ld_scores_path('genomes', pop)
    return TableResource(path=scores_path)
예제 #30
0
def ld_scores(pop: str) -> TableResource:
    """
    Get the resource for the LD scores of the given population.

    :param pop: Population passed to `_ld_scores_path`
    :return: TableResource for the "genomes" LD scores path of `pop`
    """
    scores_path = _ld_scores_path("genomes", pop)
    return TableResource(path=scores_path)