def public_release(data_type: str) -> VersionedTableResource:
    """
    Build the versioned resource for the publicly released Table.

    :param data_type: One of "exomes" or "genomes"
    :return: Versioned release Table resource, defaulting to the current
        release of the requested data type
    :raises DataException: If `data_type` is not in `DATA_TYPES`
    """
    if data_type not in DATA_TYPES:
        raise DataException(f"{data_type} not in {DATA_TYPES}")

    # Exomes and genomes each have their own default release and release list.
    is_exomes = data_type == "exomes"
    default_version = CURRENT_EXOME_RELEASE if is_exomes else CURRENT_GENOME_RELEASE
    available = EXOME_RELEASES if is_exomes else GENOME_RELEASES

    versions = {}
    for release in available:
        versions[release] = TableResource(
            path=_public_release_ht_path(data_type, release)
        )
    return VersionedTableResource(default_version, versions)
def liftover(data_type: str) -> VersionedTableResource:
    """
    Get the versioned resource for the 38 liftover of gnomAD v2.1.1.

    For exomes, release "2.1" is excluded from the available versions.

    :param data_type: One of "exomes" or "genomes"
    :return: Versioned liftover Table resource
    :raises DataException: If `data_type` is not in `DATA_TYPES`
    """
    if data_type not in DATA_TYPES:
        raise DataException(f"{data_type} not in {DATA_TYPES}")

    if data_type == "exomes":
        current_release = CURRENT_EXOME_RELEASE
        # Filter into a new list instead of calling EXOME_RELEASES.remove("2.1"):
        # removing in place mutates the shared module-level list, which drops
        # "2.1" for every other user of EXOME_RELEASES and makes a second call
        # to this function raise ValueError.
        releases = [release for release in EXOME_RELEASES if release != "2.1"]
    else:
        current_release = CURRENT_GENOME_RELEASE
        releases = GENOME_RELEASES

    return VersionedTableResource(
        current_release,
        {
            release: TableResource(path=_liftover_data_path(data_type, release))
            for release in releases
        },
    )
def get_vqsr_filters(
    model_id: str,
    split: bool = True,
    finalized: bool = False,
) -> VersionedTableResource:
    """
    Get the VQSR filtering annotation resource for a model.

    :param model_id: VQSR filtering model id
    :param split: Split or multi-allelic version of the filtering file
    :param finalized: Whether to return the raw VQSR table or the finalized
        VQSR table representing determined cutoffs
    :return: Versioned VQSR filtering annotation resource
    """
    # Optional filename suffixes, applied in the order finalized -> split.
    finalized_suffix = ".finalized" if finalized else ""
    split_suffix = ".split" if split else ""

    versions = {}
    for release in RELEASES:
        versions[release] = TableResource(
            f"{_annotations_root(release)}/filtering/{model_id}{finalized_suffix}{split_suffix}.ht"
        )
    return VersionedTableResource(CURRENT_RELEASE, versions)
def get_rf_annotations(adj: bool = False) -> VersionedTableResource:
    """
    Return the VersionedTableResource for the RF-ready annotated Table.

    Annotations that are included in the Table:

        Features for RF:
            - InbreedingCoeff
            - variant_type
            - allele_type
            - n_alt_alleles
            - has_star
            - AS_QD
            - AS_pab_max
            - AS_MQRankSum
            - AS_SOR
            - AS_ReadPosRankSum

        Training sites (bool):
            - transmitted_singleton
            - fail_hard_filters - (ht.QD < 2) | (ht.FS > 60) | (ht.MQ < 30)

    :param bool adj: Whether to load 'adj' or 'raw'
    :return: Versioned resource for the Table with RF annotations
    """
    genotype_set = "adj" if adj else "raw"

    versions = {}
    for release in RELEASES:
        versions[release] = TableResource(
            f"{get_variant_qc_root(release)}/rf/rf_annotations.{genotype_set}.ht"
        )
    return VersionedTableResource(CURRENT_RELEASE, versions)
def get_freq(version: str = CURRENT_RELEASE, subset: Optional[str] = None) -> VersionedTableResource:
    """
    Get the frequency annotation resource, defaulting to the given release.

    :param version: Version used as the default of the returned resource
    :param subset: One of the official subsets of the specified release
        (e.g., non_neuro, non_cancer, controls_and_biobanks)
    :return: Versioned resource for the subset or overall cohort frequency
        annotations
    :raises DataException: If a subset is requested for version "3", or if
        `subset` is not in `SUBSETS`
    """
    # Both validation rules only apply when a subset was actually requested.
    if subset:
        if version == "3":
            raise DataException("Subsets of gnomAD v3 do not exist")
        if subset not in SUBSETS:
            raise DataException(
                f"{subset} subset is not one of the following official subsets: {SUBSETS}"
            )

    subset_suffix = "." + subset if subset else ""

    versions = {}
    for release in RELEASES:
        versions[release] = TableResource(
            f"{_annotations_root(release)}/gnomad_genomes_v{release}.frequencies{subset_suffix}.ht"
        )
    return VersionedTableResource(version, versions)
def get_callset_truth_data(
    truth_sample: str, mt: bool = True
) -> Union[VersionedMatrixTableResource, VersionedTableResource]:
    """
    Get resources for the truth sample data that is subset from the full callset.

    If `mt` is True, this returns the versioned truth sample MatrixTable
    resource (subset from callset); otherwise it returns the versioned resource
    for the merged truth sample Table that includes both the truth data and the
    data from the callset.

    :param str truth_sample: Name of the truth sample
    :param bool mt: Whether to return the MatrixTable resource, default is True
    :return: Versioned MatrixTable resource if `mt` is True, otherwise a
        versioned Table resource (a resource object, not a path string)
    """
    if mt:
        return VersionedMatrixTableResource(
            CURRENT_RELEASE,
            {
                release: MatrixTableResource(
                    f"{get_variant_qc_root(release)}/truth_samples/{truth_sample}.mt"
                )
                for release in RELEASES
            },
        )

    return VersionedTableResource(
        CURRENT_RELEASE,
        {
            release: TableResource(
                f"{get_variant_qc_root(release)}/truth_samples/{truth_sample}.ht"
            )
            for release in RELEASES
        },
    )
def coverage(data_type: str) -> VersionedTableResource:
    """
    Retrieve gnomAD's coverage table by data_type.

    Release "2.1.1" is excluded from the available versions and "2.1" is the
    default version for both data types.

    :param data_type: One of "exomes" or "genomes"
    :return: Versioned coverage Table resource
    :raises DataException: If `data_type` is not in `DATA_TYPES`
    """
    if data_type not in DATA_TYPES:
        raise DataException(f"{data_type} not in {DATA_TYPES}")

    all_releases = EXOME_RELEASES if data_type == "exomes" else GENOME_RELEASES

    versions = {}
    for release in all_releases:
        if release == "2.1.1":
            continue
        versions[release] = TableResource(
            path=_public_coverage_ht_path(data_type, release)
        )
    return VersionedTableResource("2.1", versions)
def coverage(data_type: str) -> VersionedTableResource:
    """
    Build the versioned resource for gnomAD's coverage table.

    :param data_type: One of "exomes" or "genomes"
    :return: Versioned coverage Table resource
    :raises DataException: If `data_type` is not in `DATA_TYPES`
    """
    if data_type not in DATA_TYPES:
        raise DataException(
            f"{data_type} not in {DATA_TYPES}, please select a data type from {DATA_TYPES}"
        )

    # Genome coverage has its own release constants; exomes use the exome ones.
    is_exomes = data_type == "exomes"
    default_version = (
        CURRENT_EXOME_RELEASE if is_exomes else CURRENT_GENOME_COVERAGE_RELEASE
    )
    available = EXOME_RELEASES if is_exomes else GENOME_COVERAGE_RELEASES

    versions = {}
    for release in available:
        versions[release] = TableResource(
            path=_public_coverage_ht_path(data_type, release)
        )
    return VersionedTableResource(default_version, versions)
def get_info(split: bool = True) -> TableResource:
    """
    Get the gnomAD v3 info TableResource.

    :param split: Whether to return the split or multi-allelic version of the
        resource
    :return: gnomAD v3 info TableResource
    """
    split_suffix = ".split" if split else ""
    return TableResource(f"{ANNOTATIONS_ROOT}/gnomad_genomes_v3_info{split_suffix}.ht")
def get_filtering_model(model_id: str, split: bool = True) -> TableResource:
    """
    Get the filtering annotation resource for a model.

    :param model_id: Filtering model id
    :param split: Split or multi-allelic version of the filtering file
    :return: Filtering annotation TableResource
    """
    split_suffix = ".split" if split else ""
    return TableResource(f"{VARIANT_QC_ROOT}/{model_id}{split_suffix}.ht")
def public_pca_loadings(subpop: str = "") -> TableResource:
    """
    Return the TableResource containing sites and loadings from population PCA.

    :param subpop: Can be empty ("") -> global, "eas" or "nfe"
    :return: gnomAD public PCA loadings TableResource
    :raises DataException: If `subpop` is not one of "", "eas" or "nfe"
    """
    allowed_subpops = ("", "eas", "nfe")
    if subpop in allowed_subpops:
        return TableResource(path=_public_pca_ht_path(subpop))

    raise DataException(
        'Available subpops are "eas" or "nfe", default value "" for global'
    )
def ancestry_pca_eigenvalues(
    include_unreleasable_samples: bool = False,
) -> VersionedTableResource:
    """
    Get the ancestry PCA eigenvalues VersionedTableResource.

    :param include_unreleasable_samples: Whether to get the PCA that included
        unreleasable in training
    :return: Versioned ancestry PCA eigenvalues resource
    """
    versions = {}
    for release in RELEASES:
        eigenvalues_path = _get_ancestry_pca_ht_path(
            "eigenvalues", release, include_unreleasable_samples
        )
        versions[release] = TableResource(eigenvalues_path)
    return VersionedTableResource(CURRENT_RELEASE, versions)
def get_sample_qc(strat: str = "all") -> VersionedTableResource:
    """
    Get sample QC annotations generated by Hail for a stratification.

    Stratifications listed for this resource:

        - bi_allelic
        - multi_allelic
        - all

    :param strat: Which stratification to return
    :return: Versioned sample QC Table resource
    """
    versions = {}
    for release in RELEASES:
        versions[release] = TableResource(
            f"{get_sample_qc_root(release)}/sample_qc_{strat}.ht"
        )
    return VersionedTableResource(CURRENT_RELEASE, versions)
def release_sites(public: bool = False) -> VersionedTableResource:
    """
    Retrieve the versioned resource for the sites-only release Table.

    :param public: Determines whether release sites Table is read from public
        or private bucket. Defaults to private
    :return: Versioned sites-only release Table resource
    """
    versions = {}
    for release in RELEASES:
        sites_path = release_ht_path(release_version=release, public=public)
        versions[release] = TableResource(path=sites_path)
    return VersionedTableResource(CURRENT_RELEASE, versions)
def get_rf_training(model_id: str) -> VersionedTableResource:
    """
    Get the training data resource for a given RF run.

    :param model_id: RF run to load
    :return: VersionedTableResource for RF training data
    """
    versions = {}
    for release in RELEASES:
        versions[release] = TableResource(
            f"{get_variant_qc_root(release)}/rf/models/{model_id}/training.ht"
        )
    return VersionedTableResource(CURRENT_RELEASE, versions)
def get_rf_result(model_id: Optional[str] = None) -> VersionedTableResource:
    """
    Get the RF filtering results resource for a given run.

    NOTE(review): `model_id` is interpolated into the path as-is, so the
    default of None yields a ".../rf/models/None/rf_result.ht" path.

    :param model_id: RF run to load
    :return: VersionedTableResource for RF filtered data
    """
    versions = {}
    for release in RELEASES:
        versions[release] = TableResource(
            f"{get_variant_qc_root(release)}/rf/models/{model_id}/rf_result.ht"
        )
    return VersionedTableResource(CURRENT_RELEASE, versions)
def hgdp_1kg_subset_annotations(sample: bool = True) -> VersionedTableResource:
    """
    Get the HGDP + 1KG subset release sample or variant TableResource.

    Release "3" is excluded from the available versions.

    :param sample: If true, will return the sample annotations, otherwise will
        return the variant annotations
    :return: Versioned Table resource with sample/variant annotations for the
        subset
    """
    annotation_suffix = "_sample_meta" if sample else "_variant_annotations"

    versions = {}
    for release in RELEASES:
        if release == "3":
            continue
        versions[release] = TableResource(
            f"gs://gnomad/release/{release}/ht/gnomad.genomes.v{release}.hgdp_1kg_subset{annotation_suffix}.ht"
        )
    return VersionedTableResource(CURRENT_RELEASE, versions)
def get_score_bins(model_id: str, aggregated: bool) -> VersionedTableResource:
    """
    Return the resource for a Table of RF or VQSR scores annotated with bins.

    The bin is based on rank of the metric scores.

    :param model_id: RF or VQSR model ID for which to return score data.
    :param bool aggregated: Whether to get the aggregated data. If True, will
        return the resource for the Table grouped by bin that contains
        aggregated variant counts per bin.
    :return: Versioned resource for the desired Hail Table
    """
    table_kind = "aggregated" if aggregated else "bins"

    versions = {}
    for release in RELEASES:
        versions[release] = TableResource(
            f"{get_variant_qc_root(release)}/score_bins/{model_id}.{table_kind}.ht"
        )
    return VersionedTableResource(CURRENT_RELEASE, versions)
def get_info(split: bool = True) -> VersionedTableResource:
    """
    Get the versioned gnomAD v3 info resource.

    :param split: Whether to return the split or multi-allelic version of the
        resource
    :return: gnomAD v3 info VersionedTableResource
    """
    split_suffix = ".split" if split else ""

    versions = {}
    for release in RELEASES:
        versions[release] = TableResource(
            path=f"{_annotations_root(release)}/gnomad_genomes_v{release}_info{split_suffix}.ht"
        )
    return VersionedTableResource(CURRENT_RELEASE, versions)
def get_rf(
    data: str = "rf_result",
    run_hash: Optional[str] = None,
) -> Union[str, TableResource]:
    """
    Get the path or resource for the desired RF data.

    Data can take the following values:

        - 'training': training data for a given run
        - 'model': path to pyspark pipeline RF model
        - 'rf_result' (default): HT containing result of RF filtering

    :param str data: One of 'training', 'model' or 'rf_result' (default)
    :param str run_hash: Hash of RF run to load
    :return: A string path when `data` is 'model', otherwise a TableResource
    """
    run_dir = f"{tmp_dir}/models/{run_hash}"

    # Only the model is returned as a plain path; everything else is a Table.
    if data != "model":
        return TableResource(f"{run_dir}/{data}.ht")
    return f"{run_dir}/{data}.model"
def get_binned_concordance(model_id: str, truth_sample: str) -> VersionedTableResource:
    """
    Return the resource for a binned truth sample concordance Table.

    The Table holds concordance (TP, FP, FN) between a truth sample within the
    callset and the sample's truth data, grouped by bins of a metric (RF or
    VQSR scores).

    :param model_id: RF or VQSR model ID for which to return score data.
    :param truth_sample: Which truth sample concordance to analyze (e.g.,
        "NA12878" or "syndip")
    :return: Versioned resource for the binned truth data concordance Hail Table
    """
    versions = {}
    for release in RELEASES:
        versions[release] = TableResource(
            f"{get_variant_qc_root(release)}/binned_concordance/{truth_sample}_{model_id}_binned_concordance.ht"
        )
    return VersionedTableResource(CURRENT_RELEASE, versions)
def ld_index(pop: str) -> TableResource:
    """
    Get the resource for the LD indices of a population.

    :param pop: Population passed to the "genomes" LD index path helper
    :return: TableResource for the LD indices
    """
    index_path = _ld_index_path("genomes", pop)
    return TableResource(path=index_path)
def get_score_quantile_bins(model_id: str, aggregated: bool) -> TableResource:
    """
    Get the score quantile bin TableResource for a model under `tmp_dir`.

    :param model_id: Model ID used as the file name stem
    :param aggregated: If True, use the 'binned' table; otherwise the 'rank'
        table
    :return: TableResource at `<tmp_dir>/<model_id>.<binned|rank>.ht`
    """
    table_kind = "binned" if aggregated else "rank"
    return TableResource(f"{tmp_dir}/{model_id}.{table_kind}.ht")
# Unversioned GRCh37 resources under gs://gnomad-public/resources/grch37.
# Each one pairs a Hail path (.mt / .ht) with a source VCF given in
# `import_args`.
# NOTE(review): `import_func`/`import_args` presumably describe how the
# resource at `path` is generated from the source VCF; confirm against the
# resource class.

# NA12878 GIAB high-confidence calls, imported with hl.import_vcf as a
# MatrixTable.
na12878_giab = MatrixTableResource(
    path="gs://gnomad-public/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.mt",
    import_func=hl.import_vcf,
    import_args={
        "path": "gs://gnomad-public/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.vcf.bgz",
        "force_bgz": True,
        "min_partitions": 100,
        "reference_genome": "GRCh37",
    },
)

# HapMap 3.3 (b37) sites Table, imported with import_sites_vcf.
hapmap = TableResource(
    path="gs://gnomad-public/resources/grch37/hapmap/hapmap_3.3.b37.ht",
    import_func=import_sites_vcf,
    import_args={
        "path": "gs://gnomad-public/resources/grch37/hapmap/hapmap_3.3.b37.vcf.bgz",
        "force_bgz": True,
        "min_partitions": 100,
        "reference_genome": "GRCh37",
    },
)

# 1000 Genomes Omni 2.5 (b37) sites Table, imported with import_sites_vcf.
kgp_omni = TableResource(
    path="gs://gnomad-public/resources/grch37/kgp/1000G_omni2.5.b37.ht",
    import_func=import_sites_vcf,
    import_args={
        "path": "gs://gnomad-public/resources/grch37/kgp/1000G_omni2.5.b37.vcf.bgz",
        "force_bgz": True,
        "min_partitions": 100,
        "reference_genome": "GRCh37",
    },
)
}, ) def get_rf_result(model_id: Optional[str] = None) -> VersionedTableResource: """ Get the results of RF filtering for a given run :param model_id: RF run to load :return: VersionedTableResource for RF filtered data """ return VersionedTableResource( CURRENT_RELEASE, { release: TableResource( f"{get_variant_qc_root(release)}/rf/models/{model_id}/rf_result.ht" ) for release in RELEASES }, ) final_filter = VersionedTableResource( CURRENT_RELEASE, { release: TableResource(f"{get_variant_qc_root(release)}/final_filter.ht") for release in RELEASES }, )
def _import_clinvar(**kwargs) -> hl.Table: clinvar = import_sites_vcf(**kwargs) clinvar = clinvar.filter( hl.len(clinvar.alleles) > 1 ) # Get around problematic single entry in alleles array in the clinvar vcf clinvar = vep_or_lookup_vep(clinvar, reference="GRCh38") return clinvar # Resources with no versioning needed purcell_5k_intervals = TableResource( path= "gs://gnomad-public/resources/grch38/purcell_5k_intervals/purcell5k.ht", import_func=_import_purcell_5k, import_args={ "path": "gs://gnomad-public/resources/grch38/purcell_5k_intervals/purcell5k.interval_list", }, ) na12878_giab = MatrixTableResource( path= "gs://gnomad-public/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.mt", import_func=hl.import_vcf, import_args={ "path": "gs://gnomad-public/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz", "force_bgz": True, "min_partitions": 100, "reference_genome": "GRCh38",
""" Gets the path to the finalized sample metadata information after sample QC :param version: gnomAD release version :param meta_version: metadata version to return :return: String path to the finalized metadata """ return ( f"{_meta_root_path(version)}/gnomad_v{version}_metadata_v{meta_version}.tsv.gz" ) _meta_versions = { "3.1": TableResource( path= "gs://gnomad/metadata/genomes_v3.1/gnomad_v3.1_sample_qc_metadata.ht"), "3": TableResource( path="gs://gnomad/metadata/genomes_v3/gnomad_v3_metadata_2019-09-27.ht" ), } _project_meta_versions = { "3.1": TableResource( path="gs://gnomad/metadata/genomes_v3.1/v3.1_project_meta.ht"), "3": TableResource( path="gs://gnomad/metadata/genomes_v3/09-09-2019_v3_project_meta.ht", import_func=hl.import_table,
def ld_index(pop: str) -> TableResource:
    """
    Get the resource for the LD indices of a population.

    :param pop: Population passed to the 'genomes' LD index path helper
    :return: TableResource for the LD indices
    """
    index_path = _ld_index_path('genomes', pop)
    return TableResource(path=index_path)
def ld_scores(pop: str) -> TableResource:
    """
    Get the resource for the LD scores of a population.

    :param pop: Population passed to the 'genomes' LD scores path helper
    :return: TableResource for the LD scores
    """
    scores_path = _ld_scores_path('genomes', pop)
    return TableResource(path=scores_path)
def ld_scores(pop: str) -> TableResource:
    """
    Get the resource for the LD scores of a population.

    :param pop: Population passed to the "genomes" LD scores path helper
    :return: TableResource for the LD scores
    """
    scores_path = _ld_scores_path("genomes", pop)
    return TableResource(path=scores_path)