def get_callset_truth_data(
    truth_sample: str, mt: bool = True
) -> Union[MatrixTableResource, TableResource]:
    """
    Get the versioned resource for a truth sample's data within the callset.

    When `mt` is truthy, the returned resource points at the truth sample
    MatrixTable (the truth sample subset from the full callset). Otherwise it
    points at the merged truth sample Table, which includes both the truth
    data and the data from the callset.

    Both variants live under `<variant QC root>/truth_samples/` and carry one
    entry per release in `RELEASES`, with `CURRENT_RELEASE` as the default
    version.

    :param truth_sample: Name of the truth sample
    :param mt: Whether to return the MatrixTable resource (default) instead of
        the Table resource
    :return: Versioned MatrixTable resource if `mt`, else versioned Table
        resource
    """
    # Handle the Table case first so the MatrixTable path reads straight through.
    if not mt:
        table_by_release = {}
        for release in RELEASES:
            table_by_release[release] = TableResource(
                f"{get_variant_qc_root(release)}/truth_samples/{truth_sample}.ht"
            )
        return VersionedTableResource(CURRENT_RELEASE, table_by_release)

    matrix_table_by_release = {}
    for release in RELEASES:
        matrix_table_by_release[release] = MatrixTableResource(
            f"{get_variant_qc_root(release)}/truth_samples/{truth_sample}.mt"
        )
    return VersionedMatrixTableResource(CURRENT_RELEASE, matrix_table_by_release)
def hgdp_1kg_subset(dense: bool = False) -> VersionedMatrixTableResource:
    """
    Get the HGDP + 1KG subset release MatrixTableResource.

    One MatrixTableResource is registered per entry of `RELEASES`, except
    release "3", which is filtered out (presumably no subset was published
    for it -- confirm against the release bucket). `CURRENT_RELEASE` is passed
    as the default version.

    :param dense: If True, return the dense MT; if False, return the sparse MT
    :return: Versioned MatrixTableResource for the requested subset flavour
    """
    # The dense/sparse choice does not depend on the release, so resolve it
    # once here instead of re-evaluating a nested f-string for every release.
    density_suffix = "_dense" if dense else "_sparse"

    return VersionedMatrixTableResource(
        CURRENT_RELEASE,
        {
            release: MatrixTableResource(
                f"gs://gnomad/release/{release}/mt/gnomad.genomes.v{release}.hgdp_1kg_subset{density_suffix}.mt"
            )
            for release in RELEASES
            if release != "3"
        },
    )
# --- Release bookkeeping -----------------------------------------------------
# No exome release is registered here: the current exome release is the empty
# string and the exome release list is empty.
CURRENT_EXOME_RELEASE = ""
CURRENT_GENOME_RELEASE = "3.0"
CURRENT_GENOME_COVERAGE_RELEASE = "3.0.1"

EXOME_RELEASES = []
GENOME_RELEASES = ["3.0"]
# Coverage releases are every genome release plus the extra "3.0.1".
GENOME_COVERAGE_RELEASES = [*GENOME_RELEASES, "3.0.1"]

# Data types for which resources are defined.
DATA_TYPES = ["genomes"]

# Population labels for the genomes data (per the constant's name).
GENOME_POPS = [
    "AFR",
    "AMI",
    "AMR",
    "ASJ",
    "EAS",
    "FIN",
    "NFE",
    "SAS",
    "OTH",
]

# --- Truth-set MatrixTables (stored under the truth-sets/hail-0.2 path) ------
gnomad_syndip = VersionedMatrixTableResource(
    default_version="3.0",
    versions={
        "3.0": MatrixTableResource(
            path="gs://gnomad-public/truth-sets/hail-0.2/gnomad_v3_syndip.b38.mt"
        ),
    },
)

na12878 = VersionedMatrixTableResource(
    default_version="3.0",
    versions={
        "3.0": MatrixTableResource(
            path="gs://gnomad-public/truth-sets/hail-0.2/gnomad_v3_na12878.mt"
        ),
    },
)
GnomadPublicTableResource( path= "gs://gnomad-public-requester-pays/resources/context/grch38_context_vep_annotated.v101.ht", ), }, ) syndip = VersionedMatrixTableResource( default_version="20180222", versions={ "20180222": GnomadPublicMatrixTableResource( path= "gs://gnomad-public-requester-pays/resources/grch38/syndip/syndip.b38_20180222.mt", import_func=hl.import_vcf, import_args={ "path": "gs://gnomad-public-requester-pays/resources/grch38/syndip/full.38.20180222.vcf.gz", "force_bgz": True, "min_partitions": 100, "reference_genome": "GRCh38", }, ) }, ) syndip_hc_intervals = VersionedTableResource( default_version="20180222", versions={ "20180222": GnomadPublicTableResource( path=
mixed_site=(hl.len(mt.alleles) > 2) & hl.any(lambda a: hl.is_indel(mt.alleles[0], a), mt.alleles[1:]) & hl.any(lambda a: hl.is_snp(mt.alleles[0], a), mt.alleles[1:]), ) mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True) return mt gnomad_v3_genotypes = VersionedMatrixTableResource( CURRENT_RELEASE, { "3": MatrixTableResource( "gs://gnomad/raw/hail-0.2/mt/genomes_v3/gnomad_genomes_v3.repartitioned.mt" ), "3.1": MatrixTableResource( "gs://gnomad/raw/genomes/3.1/gnomad_v3.1_sparse_unsplit.repartitioned.mt" ), }, ) def qc_temp_prefix(version: str = CURRENT_RELEASE) -> str: """ Returns path to temporary QC bucket. :param version: Version of annotation path to return :return: Path to bucket with temporary QC data """
) kgp_phase_3 = VersionedMatrixTableResource( default_version="phase_3_split", versions={ "phase_3_split": GnomadPublicMatrixTableResource( path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000Genomes_phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.split.mt", import_func=hl.import_vcf, import_args={ "path": "gs://genomics-public-data/1000-genomes-phase-3/vcf-20150220/ALL.chr*.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf", "force_bgz": True, "skip_invalid_loci": True, "min_partitions": 300, "reference_genome": "GRCh37", }, ), "phase_3": GnomadPublicMatrixTableResource( path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000Genomes_phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.mt", import_func=hl.import_vcf, import_args={ "path": "gs://genomics-public-data/1000-genomes-phase-3/vcf-20150220/ALL.chr*.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf", "force_bgz": True, "skip_invalid_loci": True, "min_partitions": 300, "reference_genome": "GRCh37", }, ), }, ) kgp = VersionedTableResource(
kin_expr=relatedness_ht.kin, ibd0_expr=relatedness_ht.ibd0, ibd1_expr=relatedness_ht.ibd1, ibd2_expr=relatedness_ht.ibd2, )) # QC Sites (gnomAD v2 QC sites, lifted over) gnomad_v2_qc_sites = TableResource( "gs://gnomad-public/resources/grch38/gnomad_v2_qc_sites_b38.ht") # Dense MT of samples at QC sites qc = VersionedMatrixTableResource( CURRENT_RELEASE, { release: MatrixTableResource( f"gs://gnomad/sample_qc/mt/genomes_v{release}/gnomad_v{release}_qc_mt_v2_sites_dense.mt" ) for release in RELEASES }) # PC relate PCA scores pc_relate_pca_scores = VersionedTableResource( CURRENT_RELEASE, { release: TableResource( f"{get_sample_qc_root(release)}/gnomad_v{release}_qc_mt_v2_sites_pc_scores.ht" ) for release in RELEASES }) # PC relate results relatedness = VersionedTableResource(