def coverage_tsv_path(data_type: str, version: Optional[str] = None) -> str:
    """
    Retrieve the path to gnomAD's coverage summary TSV by data_type.

    :param data_type: One of "exomes" or "genomes"
    :param version: One of the coverage release versions; defaults to the current coverage release for the data_type
    :return: Path to coverage summary TSV
    """
    if data_type not in DATA_TYPES:
        raise DataException(
            f"{data_type} not in {DATA_TYPES}, please select a data type from {DATA_TYPES}"
        )

    if data_type == "exomes":
        if version is None:
            version = CURRENT_EXOME_RELEASE
        elif version not in EXOME_RELEASES:
            raise DataException(
                f"Version {version} of gnomAD exomes for GRCh38 does not exist"
            )
    else:
        if version is None:
            version = CURRENT_GENOME_COVERAGE_RELEASE
        elif version not in GENOME_COVERAGE_RELEASES:
            raise DataException(
                f"Version {version} of gnomAD genomes for GRCh38 does not exist"
            )

    return f"gs://gnomad-public/release/{version}/coverage/{data_type}/gnomad.{data_type}.r{version}.coverage.summary.tsv.bgz"
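# Usage sketch (hypothetical, for illustration only): resolve coverage TSV
# paths. Assumes the module-level release constants above are defined.
def _example_coverage_tsv_path() -> None:
    # Defaults to CURRENT_EXOME_RELEASE when no version is given.
    exome_tsv = coverage_tsv_path("exomes")
    # An explicit version must be one of GENOME_COVERAGE_RELEASES.
    genome_tsv = coverage_tsv_path("genomes", version=CURRENT_GENOME_COVERAGE_RELEASE)
    print(exome_tsv, genome_tsv, sep="\n")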
def get_freq(
    version: str = CURRENT_RELEASE, subset: Optional[str] = None
) -> VersionedTableResource:
    """
    Get the frequency annotation table for a specified release.

    :param version: Version of annotation path to return
    :param subset: One of the official subsets of the specified release (e.g., non_neuro, non_cancer, controls_and_biobanks)
    :return: VersionedTableResource for the Hail Table containing subset or overall cohort frequency annotations
    """
    if version == "3" and subset:
        raise DataException("Subsets of gnomAD v3 do not exist")

    if subset and subset not in SUBSETS:
        raise DataException(
            f"{subset} subset is not one of the following official subsets: {SUBSETS}"
        )

    return VersionedTableResource(
        version,
        {
            release: TableResource(
                f"{_annotations_root(release)}/gnomad_genomes_v{release}.frequencies{'.' + subset if subset else ''}.ht"
            )
            for release in RELEASES
        },
    )
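# Usage sketch (hypothetical): fetch the current-release frequency resource for
# the overall cohort and for one subset. Assumes "non_neuro" is listed in
# SUBSETS for this release and that Hail has been initialized.
def _example_get_freq() -> None:
    overall_freq_ht = get_freq().ht()
    non_neuro_freq_ht = get_freq(subset="non_neuro").ht()
    non_neuro_freq_ht.describe()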
def coverage(data_type: str) -> VersionedTableResource:
    """
    Retrieve gnomAD's coverage table by data_type.

    :param data_type: One of "exomes" or "genomes"
    :return: VersionedTableResource for the coverage Table
    """
    if data_type not in DATA_TYPES:
        raise DataException(
            f"{data_type} not in {DATA_TYPES}, please select a data type from {DATA_TYPES}"
        )

    if data_type == "exomes":
        current_release = CURRENT_EXOME_RELEASE
        releases = EXOME_RELEASES
    else:
        current_release = CURRENT_GENOME_COVERAGE_RELEASE
        releases = GENOME_COVERAGE_RELEASES

    return VersionedTableResource(
        current_release,
        {
            release: TableResource(path=_public_coverage_ht_path(data_type, release))
            for release in releases
        },
    )
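# Usage sketch (hypothetical): load the current genome coverage Table from the
# versioned resource. Assumes Hail has been initialized.
def _example_coverage() -> None:
    coverage_ht = coverage("genomes").ht()  # defaults to the current coverage release
    coverage_ht.describe()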
def liftover(data_type: str) -> VersionedTableResource:
    """
    Get the GRCh38 liftover of gnomAD v2.1.1.

    :param data_type: One of "exomes" or "genomes"
    :return: VersionedTableResource for the liftover release Table
    """
    if data_type not in DATA_TYPES:
        raise DataException(f"{data_type} not in {DATA_TYPES}")

    if data_type == "exomes":
        current_release = CURRENT_EXOME_RELEASE
        # Build a new list rather than calling .remove() on the module-level
        # EXOME_RELEASES constant, which would mutate it for all callers.
        releases = [release for release in EXOME_RELEASES if release != "2.1"]
    else:
        current_release = CURRENT_GENOME_RELEASE
        releases = GENOME_RELEASES

    return VersionedTableResource(
        current_release,
        {
            release: TableResource(path=_liftover_data_path(data_type, release))
            for release in releases
        },
    )
def public_release(data_type: str) -> VersionedTableResource:
    """
    Retrieve the publicly released versioned table resource.

    :param data_type: One of "exomes" or "genomes"
    :return: VersionedTableResource for the release Table
    """
    if data_type not in DATA_TYPES:
        raise DataException(f"{data_type} not in {DATA_TYPES}")

    if data_type == "exomes":
        current_release = CURRENT_EXOME_RELEASE
        releases = EXOME_RELEASES
    else:
        current_release = CURRENT_GENOME_RELEASE
        releases = GENOME_RELEASES

    return VersionedTableResource(
        current_release,
        {
            release: TableResource(path=_public_release_ht_path(data_type, release))
            for release in releases
        },
    )
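# Usage sketch (hypothetical): read the current public exome release sites
# Table. Assumes Hail has been initialized.
def _example_public_release() -> None:
    release_ht = public_release("exomes").ht()
    release_ht.show(5)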
def coverage(data_type: str) -> VersionedTableResource:
    """
    Retrieve gnomAD's coverage table by data_type.

    :param data_type: One of "exomes" or "genomes"
    :return: VersionedTableResource for the coverage Table
    """
    if data_type not in DATA_TYPES:
        raise DataException(f"{data_type} not in {DATA_TYPES}")

    # Coverage Tables exist for release 2.1 only, so pin the current version to
    # "2.1" and drop "2.1.1" from the available releases.
    if data_type == "exomes":
        current_release = "2.1"
        releases = [r for r in EXOME_RELEASES if r != "2.1.1"]
    else:
        current_release = "2.1"
        releases = [r for r in GENOME_RELEASES if r != "2.1.1"]

    return VersionedTableResource(
        current_release,
        {
            release: GnomadPublicTableResource(
                path=_public_coverage_ht_path(data_type, release)
            )
            for release in releases
        },
    )
def pre_process_subset_freq(
    subset: str, global_ht: hl.Table, test: bool = False
) -> hl.Table:
    """
    Prepare subset frequency Table by filling in missing frequency fields for loci present only in the global cohort.

    .. note::

        The resulting final `freq` array will be as long as the subset `freq_meta` global (i.e., one `freq` entry for each `freq_meta` entry)

    :param subset: Subset ID
    :param global_ht: Hail Table containing all variants discovered in the overall release cohort
    :param test: If True, filter to small region on chr20
    :return: Table containing subset frequencies with missing freq structs filled in
    """
    # Read in subset HTs
    subset_ht_path = get_freq(subset=subset).path
    subset_chr20_ht_path = qc_temp_prefix() + f"chr20_test_freq.{subset}.ht"
    not_found_msg = (
        f"Hail Table containing {subset} subset frequencies not found. You may need "
        "to run the script generate_freq_data.py to generate frequency annotations first."
    )

    if test:
        if file_exists(subset_chr20_ht_path):
            logger.info(
                "Loading chr20 %s subset frequency data for testing: %s",
                subset,
                subset_chr20_ht_path,
            )
            subset_ht = hl.read_table(subset_chr20_ht_path)
        elif file_exists(subset_ht_path):
            logger.info(
                "Loading %s subset frequency data for testing: %s",
                subset,
                subset_ht_path,
            )
            subset_ht = hl.read_table(subset_ht_path)
            subset_ht = hl.filter_intervals(
                subset_ht, [hl.parse_locus_interval("chr20:1-1000000")]
            )
        else:
            # Raise here too; otherwise subset_ht would be unbound below.
            raise DataException(not_found_msg)
    elif file_exists(subset_ht_path):
        logger.info("Loading %s subset frequency data: %s", subset, subset_ht_path)
        subset_ht = hl.read_table(subset_ht_path)
    else:
        raise DataException(not_found_msg)

    # Fill in missing freq structs
    ht = subset_ht.join(global_ht.select().select_globals(), how="right")
    ht = ht.annotate(
        freq=hl.if_else(
            hl.is_missing(ht.freq),
            hl.map(lambda x: missing_callstats_expr(), hl.range(hl.len(ht.freq_meta))),
            ht.freq,
        )
    )

    return ht
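# Usage sketch (hypothetical): fill in missing freq structs for one subset
# against the global frequency Table. Assumes "non_cancer" is in SUBSETS and
# that the corresponding frequency Tables have been generated.
def _example_pre_process_subset_freq() -> hl.Table:
    global_freq_ht = (
        hl.read_table(get_freq().path).select("freq").select_globals("freq_meta")
    )
    return pre_process_subset_freq("non_cancer", global_freq_ht)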
def public_pca_loadings(subpop: str = "") -> TableResource:
    """
    Return the TableResource containing sites and loadings from population PCA.

    :param subpop: Can be empty ("") -> global, "eas" or "nfe"
    :return: gnomAD public PCA loadings TableResource
    """
    if subpop not in ["", "eas", "nfe"]:
        raise DataException(
            'Available subpops are "eas" or "nfe"; use the default value "" for global loadings'
        )

    return TableResource(path=_public_pca_ht_path(subpop))
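# Usage sketch (hypothetical): load the global and the NFE-specific PCA
# loadings. Assumes Hail has been initialized.
def _example_public_pca_loadings() -> None:
    global_loadings_ht = public_pca_loadings().ht()
    nfe_loadings_ht = public_pca_loadings("nfe").ht()
    nfe_loadings_ht.describe()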
def release_vcf_path(data_type: str, version: str, contig: str) -> str:
    """
    Publicly released VCF. Provide a specific contig, e.g. "20", to retrieve the contig-specific VCF.

    :param data_type: One of "exomes" or "genomes"
    :param version: One of the release versions of gnomAD on GRCh37
    :param contig: Single contig "1" to "Y"
    :return: Path to VCF
    """
    if not version.startswith("2"):
        raise DataException(
            f"gnomAD version {version} is not available on reference genome GRCh37"
        )

    contig = f".{contig}" if contig else ""
    return f"gs://gcp-public-data--gnomad/release/{version}/vcf/{data_type}/gnomad.{data_type}.r{version}.sites{contig}.vcf.bgz"
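# Usage sketch (hypothetical): resolve the chromosome 20 exome VCF for release
# 2.1.1, and the genome-wide sites VCF when contig is empty.
def _example_release_vcf_path() -> None:
    chr20_vcf = release_vcf_path("exomes", version="2.1.1", contig="20")
    all_sites_vcf = release_vcf_path("genomes", version="2.1.1", contig="")
    print(chr20_vcf, all_sites_vcf, sep="\n")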
def subset_samples_and_variants(
    mt: hl.MatrixTable,
    sample_path: str,
    header: bool = True,
    table_key: str = "s",
    sparse: bool = False,
    gt_expr: str = "GT",
) -> hl.MatrixTable:
    """
    Subset the MatrixTable to the provided list of samples and their variants.

    :param mt: Input MatrixTable
    :param sample_path: Path to a file with list of samples
    :param header: Whether file with samples has a header. Default is True
    :param table_key: Key to sample Table. Default is "s"
    :param sparse: Whether the MatrixTable is sparse. Default is False
    :param gt_expr: Name of field in MatrixTable containing genotype expression. Default is "GT"
    :return: MatrixTable subsetted to specified samples and their variants
    """
    sample_ht = hl.import_table(sample_path, no_header=not header, key=table_key)
    sample_count = sample_ht.count()
    missing_ht = sample_ht.anti_join(mt.cols())
    missing_ht_count = missing_ht.count()
    full_count = mt.count_cols()

    if missing_ht_count != 0:
        missing_samples = missing_ht.s.collect()
        raise DataException(
            f"Only {sample_count - missing_ht_count} out of {sample_count} "
            "subsetting-table IDs matched IDs in the MT.\n"
            f"IDs that aren't in the MT: {missing_samples}\n"
        )

    mt = mt.semi_join_cols(sample_ht)
    if sparse:
        mt = mt.filter_rows(
            hl.agg.any(mt[gt_expr].is_non_ref() | hl.is_defined(mt.END))
        )
    else:
        mt = mt.filter_rows(hl.agg.any(mt[gt_expr].is_non_ref()))

    logger.info(
        "Finished subsetting samples. Kept %d out of %d samples in MT",
        mt.count_cols(),
        full_count,
    )
    return mt
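# Usage sketch (hypothetical path): subset a dense MatrixTable to a sample list
# with a header line and a key column named "s".
def _example_subset_mt(mt: hl.MatrixTable) -> hl.MatrixTable:
    return subset_samples_and_variants(
        mt,
        sample_path="gs://my-bucket/samples.tsv",  # hypothetical sample list
        header=True,
        table_key="s",
    )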
def subset_samples_and_variants(
    mtds: Union[hl.MatrixTable, hl.vds.VariantDataset],
    sample_path: str,
    header: bool = True,
    table_key: str = "s",
    sparse: bool = False,
    gt_expr: str = "GT",
    remove_dead_alleles: bool = False,
) -> Union[hl.MatrixTable, hl.vds.VariantDataset]:
    """
    Subset the MatrixTable or VariantDataset to the provided list of samples and their variants.

    :param mtds: Input MatrixTable or VariantDataset
    :param sample_path: Path to a file with list of samples
    :param header: Whether file with samples has a header. Default is True
    :param table_key: Key to sample Table. Default is "s"
    :param sparse: Whether the MatrixTable is sparse. Default is False
    :param gt_expr: Name of field in MatrixTable containing genotype expression. Default is "GT"
    :param remove_dead_alleles: Remove alleles observed in no samples. This option is currently only relevant when `mtds` is a VariantDataset. Default is False
    :return: MatrixTable or VariantDataset subsetted to specified samples and their variants
    """
    sample_ht = hl.import_table(sample_path, no_header=not header, key=table_key)
    sample_count = sample_ht.count()

    is_vds = isinstance(mtds, hl.vds.VariantDataset)
    if is_vds:
        mt = mtds.variant_data
    else:
        if remove_dead_alleles:
            raise ValueError(
                "Removal of alleles observed in no samples is currently only "
                "implemented when the input dataset is a VariantDataset."
            )
        mt = mtds

    missing_ht = sample_ht.anti_join(mt.cols())
    missing_ht_count = missing_ht.count()
    full_count = mt.count_cols()

    if missing_ht_count != 0:
        missing_samples = missing_ht.s.collect()
        dataset_name = "VariantDataset" if is_vds else "MatrixTable"
        raise DataException(
            f"Only {sample_count - missing_ht_count} out of {sample_count} "
            f"subsetting-table IDs matched IDs in the {dataset_name}.\n"
            f"IDs that aren't in the {dataset_name}: {missing_samples}\n"
        )

    if is_vds:
        mtds = hl.vds.filter_samples(
            mtds, sample_ht, keep=True, remove_dead_alleles=remove_dead_alleles
        )
        n_cols = mtds.variant_data.count_cols()
    else:
        mtds = mtds.semi_join_cols(sample_ht)
        if sparse:
            mtds = mtds.filter_rows(
                hl.agg.any(mtds[gt_expr].is_non_ref() | hl.is_defined(mtds.END))
            )
        else:
            mtds = mtds.filter_rows(hl.agg.any(mtds[gt_expr].is_non_ref()))
        n_cols = mtds.count_cols()

    logger.info(
        "Finished subsetting samples. Kept %d out of %d samples in %s",
        n_cols,
        full_count,
        "VariantDataset" if is_vds else "MatrixTable",
    )
    return mtds
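# Usage sketch (hypothetical path): subset a VariantDataset and drop alleles no
# longer observed in any retained sample.
def _example_subset_vds(vds: hl.vds.VariantDataset) -> hl.vds.VariantDataset:
    return subset_samples_and_variants(
        vds,
        sample_path="gs://my-bucket/samples.tsv",  # hypothetical sample list
        remove_dead_alleles=True,
    )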
def filter_low_conf_regions(
    mt: Union[hl.MatrixTable, hl.Table],
    filter_lcr: bool = True,
    filter_decoy: bool = True,
    filter_segdup: bool = True,
    filter_exome_low_coverage_regions: bool = False,
    filter_telomeres_and_centromeres: bool = False,
    high_conf_regions: Optional[List[str]] = None,
) -> Union[hl.MatrixTable, hl.Table]:
    """
    Filter low-confidence regions.

    :param mt: MatrixTable or Table to filter
    :param filter_lcr: Whether to filter LCR regions
    :param filter_decoy: Whether to filter decoy regions
    :param filter_segdup: Whether to filter Segdup regions
    :param filter_exome_low_coverage_regions: Whether to filter exome low confidence regions
    :param filter_telomeres_and_centromeres: Whether to filter telomeres and centromeres
    :param high_conf_regions: Paths to set of high confidence regions to restrict to (union of regions)
    :return: MatrixTable or Table with low confidence regions removed
    """
    build = get_reference_genome(mt.locus).name
    if build == "GRCh37":
        import gnomad.resources.grch37.reference_data as resources
    elif build == "GRCh38":
        import gnomad.resources.grch38.reference_data as resources

    criteria = []
    if filter_lcr:
        lcr = resources.lcr_intervals.ht()
        criteria.append(hl.is_missing(lcr[mt.locus]))

    if filter_decoy:
        decoy = resources.decoy_intervals.ht()
        criteria.append(hl.is_missing(decoy[mt.locus]))

    if filter_segdup:
        segdup = resources.seg_dup_intervals.ht()
        criteria.append(hl.is_missing(segdup[mt.locus]))

    if filter_exome_low_coverage_regions:
        high_cov = resources.high_coverage_intervals.ht()
        criteria.append(hl.is_missing(high_cov[mt.locus]))

    if filter_telomeres_and_centromeres:
        if build != "GRCh38":
            raise DataException(
                "The telomeres_and_centromeres resource only exists for GRCh38"
            )
        telomeres_and_centromeres = resources.telomeres_and_centromeres.ht()
        criteria.append(hl.is_missing(telomeres_and_centromeres[mt.locus]))

    if high_conf_regions is not None:
        for region in high_conf_regions:
            region = hl.import_locus_intervals(region)
            criteria.append(hl.is_defined(region[mt.locus]))

    if criteria:
        filter_criteria = functools.reduce(operator.iand, criteria)
        if isinstance(mt, hl.MatrixTable):
            mt = mt.filter_rows(filter_criteria)
        else:
            mt = mt.filter(filter_criteria)

    return mt
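# Usage sketch (hypothetical): apply the default LCR/decoy/segdup filters and
# additionally drop telomeres and centromeres (GRCh38 input assumed).
def _example_filter_low_conf_regions(ht: hl.Table) -> hl.Table:
    return filter_low_conf_regions(ht, filter_telomeres_and_centromeres=True)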
def vep_config_path(ref: str = "GRCh37") -> str:
    """
    Retrieve the path to the VEP config for the given reference genome build.

    :param ref: Reference genome build; one of the keys of VEP_REFERENCE_DATA
    :return: Path to VEP config
    """
    if ref not in VEP_REFERENCE_DATA.keys():
        raise DataException(
            "Select reference as one of: {}".format(",".join(VEP_REFERENCE_DATA.keys()))
        )

    return VEP_REFERENCE_DATA[ref]["vep_config"]
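# Usage sketch (hypothetical): look up the VEP config for each supported build.
def _example_vep_config_path() -> None:
    grch37_config = vep_config_path()  # defaults to GRCh37
    grch38_config = vep_config_path("GRCh38")
    print(grch37_config, grch38_config, sep="\n")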
def main(args):
    hl.init(default_reference="GRCh38", log="/variant_histograms.log")

    logger.info("Loading ANNOTATIONS_HISTS dictionary...")
    if not file_exists(annotation_hists_path()):
        raise DataException(
            "Annotation hists JSON file not found. Need to create this JSON before running script!"
        )

    with hl.hadoop_open(annotation_hists_path()) as a:
        ANNOTATIONS_HISTS = json.loads(a.read())

    # NOTE: histogram aggregations on these metrics are done on the entire callset (not just PASS variants), on raw data
    ht = hl.read_table(release_ht_path(public=False))
    ht = ht.select(freq=ht.freq, info=ht.info.select(*ANNOTATIONS_HISTS))

    inbreeding_bin_ranges = ANNOTATIONS_HISTS["InbreedingCoeff"]

    # Remove InbreedingCoeff from ANNOTATIONS_HISTS. It requires different ranges by allele frequency and needs to be
    # handled differently. ANNOTATIONS_HISTS is a dictionary (loaded from annotation_hists_path), so use del rather
    # than list-style .remove()
    del ANNOTATIONS_HISTS["InbreedingCoeff"]

    logger.info("Getting info annotation histograms...")
    hist_ranges_expr = get_annotations_hists(ht, ANNOTATIONS_HISTS, LOG10_ANNOTATIONS)

    # Evaluate minimum and maximum values for each metric of interest to help determine the bounds of the hists
    # NOTE: Run this first, then update values in annotation_hists_path JSON as necessary
    if args.determine_bounds:
        logger.info(
            "Evaluating minimum and maximum values for each metric of interest. Maximum values capped at 1e10."
        )
        minmax_dict = {}
        for metric in ANNOTATIONS_HISTS:
            minmax_dict[metric] = hl.struct(
                min=hl.agg.min(ht.info[metric]),
                max=hl.if_else(
                    hl.agg.max(ht.info[metric]) < 1e10,
                    hl.agg.max(ht.info[metric]),
                    1e10,
                ),
            )
        minmax = ht.aggregate(hl.struct(**minmax_dict))
        logger.info(f"Metrics bounds: {minmax}")
    else:
        logger.info(
            "Aggregating hists over ranges defined in the annotation_hists_path JSON file. --determine_bounds can "
            "be used to help define these ranges..."
        )
        hists = ht.aggregate(
            hl.array(
                [
                    hist_expr.annotate(metric=hist_metric)
                    for hist_metric, hist_expr in hist_ranges_expr.items()
                ]
            )
            .extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(
                            hl.log10(ht.info.QUALapprox),
                            *ANNOTATIONS_HISTS["QUALapprox"],
                        ),
                    )
                ).map(lambda x: x[1].annotate(metric="QUALapprox-" + x[0]))
            )
            .extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(
                            hl.log10(ht.info.AS_QUALapprox),
                            *ANNOTATIONS_HISTS["AS_QUALapprox"],
                        ),
                    )
                ).map(lambda x: x[1].annotate(metric="AS_QUALapprox-" + x[0]))
            ),
            _localize=False,
        )

        # Defining hist range and bins for allele frequency groups because they needed different ranges
        ht = ht.annotate(af_bin=create_frequency_bins_expr_inbreeding(AF=ht.freq[1].AF))
        inbreeding_hists = [
            ht.aggregate(
                hl.agg.filter(
                    ht.af_bin == x,
                    hl.agg.hist(ht.info.InbreedingCoeff, *inbreeding_bin_ranges[x]),
                )
            ).annotate(metric="InbreedingCoeff-" + x)
            for x in inbreeding_bin_ranges
        ]

        hists = hl.eval(hl.json(hists))
        inbreeding_hists = hl.eval(hl.json(inbreeding_hists))

        # NOTE: The following removes the trailing ']' from the JSON array stored in hists and the leading '[' from
        # the JSON array stored in inbreeding_hists, then joins them together to be written out as a single JSON array
        hists = hists[:-1] + "," + inbreeding_hists[1:]

        logger.info("Writing output")
        with hl.hadoop_open(qual_hists_json_path(), "w") as f:
            f.write(hists)
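# Hypothetical invocation (the script's argparse block is not shown here; the
# --determine_bounds flag is inferred from args.determine_bounds and the log
# message above, and the script name is assumed):
#
#     python variant_histograms.py --determine_bounds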
def main(args):
    hl.init(log="/create_release_ht.log", default_reference="GRCh38")

    # The concatenated HT contains all subset frequency annotations, plus the overall cohort frequency annotations,
    # concatenated together in a single freq annotation ('freq')

    global_freq_not_found_msg = (
        "Hail Table containing global callset frequencies not found. You may need "
        "to run the script to generate frequency annotations first."
    )

    # Load global frequency Table
    if args.test:
        global_freq_chr20_ht_path = "gs://gnomad-tmp/gnomad_freq/chr20_test_freq.ht"
        if file_exists(global_freq_chr20_ht_path):
            logger.info(
                "Loading chr20 global frequency data for testing: %s",
                global_freq_chr20_ht_path,
            )
            global_freq_ht = (
                hl.read_table(global_freq_chr20_ht_path)
                .select("freq")
                .select_globals("freq_meta")
            )
        elif file_exists(get_freq().path):
            logger.info(
                "Loading global frequency data for testing: %s", get_freq().path
            )
            global_freq_ht = (
                hl.read_table(get_freq().path)
                .select("freq")
                .select_globals("freq_meta")
            )
            global_freq_ht = hl.filter_intervals(
                global_freq_ht, [hl.parse_locus_interval("chr20:1-1000000")]
            )
        else:
            # Raise here too; otherwise global_freq_ht would be unbound below.
            raise DataException(global_freq_not_found_msg)
    elif file_exists(get_freq().path):
        logger.info("Loading global frequency data: %s", get_freq().path)
        global_freq_ht = (
            hl.read_table(get_freq().path).select("freq").select_globals("freq_meta")
        )
    else:
        raise DataException(global_freq_not_found_msg)

    # Load subset frequency Table(s)
    if args.test:
        test_subsets = args.test_subsets
        subset_freq_hts = [
            pre_process_subset_freq(subset, global_freq_ht, test=True)
            for subset in test_subsets
        ]
    else:
        subset_freq_hts = [
            pre_process_subset_freq(subset, global_freq_ht) for subset in SUBSETS
        ]

    logger.info("Concatenating subset frequencies...")
    freq_ht = hl.Table.multi_way_zip_join(
        [global_freq_ht] + subset_freq_hts,
        data_field_name="freq",
        global_field_name="freq_meta",
    )
    freq_ht = freq_ht.transmute(freq=freq_ht.freq.flatmap(lambda x: x.freq))
    freq_ht = freq_ht.transmute_globals(
        freq_meta=freq_ht.freq_meta.flatmap(lambda x: x.freq_meta)
    )

    # Create frequency index dictionary on concatenated array (i.e., including all subsets)
    # NOTE: non-standard downsampling values are created in the frequency script corresponding to population totals,
    # so callset-specific DOWNSAMPLINGS must be used instead of the generic DOWNSAMPLING values
    global_freq_ht = hl.read_table(get_freq().path)
    freq_ht = freq_ht.annotate_globals(
        freq_index_dict=make_freq_index_dict(
            freq_meta=hl.eval(freq_ht.freq_meta),
            pops=POPS,
            downsamplings=hl.eval(global_freq_ht.downsamplings),
        )
    )

    # Add back in all global frequency annotations not present in concatenated frequencies HT
    row_fields = global_freq_ht.row_value.keys() - freq_ht.row_value.keys()
    logger.info(
        "Adding back the following row annotations onto concatenated frequencies: %s",
        row_fields,
    )
    freq_ht = freq_ht.annotate(**global_freq_ht[freq_ht.key].select(*row_fields))

    global_fields = global_freq_ht.globals.keys() - freq_ht.globals.keys()
    global_fields.remove("downsamplings")
    logger.info(
        "Adding back the following global annotations onto concatenated frequencies: %s",
        global_fields,
    )
    freq_ht = freq_ht.annotate_globals(
        **global_freq_ht.index_globals().select(*global_fields)
    )

    logger.info("Preparing release Table annotations...")
    ht = add_release_annotations(freq_ht)

    logger.info("Removing chrM and sites without filter...")
    ht = hl.filter_intervals(ht, [hl.parse_locus_interval("chrM")], keep=False)
    ht = ht.filter(hl.is_defined(ht.filters))

    ht = ht.checkpoint(
        qc_temp_prefix() + "release/gnomad.genomes.v3.1.sites.chr20.ht"
        if args.test
        else release_sites().path,
        args.overwrite,
    )

    logger.info("Final variant count: %d", ht.count())
    ht.describe()
    ht.show()
    ht.summarize()
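# Hypothetical invocation (the argparse block is not shown here; flags are
# inferred from the args.* attributes used above, and the script and subset
# names are assumed):
#
#     python create_release_ht.py --test --test_subsets non_cancer --overwrite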