def annotate_sex(mt: hl.MatrixTable,
                 out_internal_mt_prefix: str,
                 male_threshold: float = 0.8,
                 female_threshold: float = 0.5) -> hl.MatrixTable:
    """
    Impute sex from chrX genotypes, export the result, and annotate ``mt``.

    The input is restricted to the 'chrX' interval and its genotypes are
    rebuilt as unphased diploid calls before being handed to
    ``hl.impute_sex`` (with ``aaf_threshold=0.05``). The full imputation
    table is exported to ``<out_internal_mt_prefix>.sex_check.txt.bgz``.

    NOTE: the default cutoffs (F < 0.5 for females, F > 0.8 for males) were
    chosen for genomes after evaluating plots in R.

    :param MatrixTable mt: MT containing samples to be ascertained for sex
    :param str out_internal_mt_prefix: file path prefix for the exported
        table of samples and sex imputation annotations
    :param float male_threshold: forwarded to ``hl.impute_sex``
    :param float female_threshold: forwarded to ``hl.impute_sex``
    :return: the input MatrixTable with column fields 'f_stat' and
        'is_female' added
    :rtype: MatrixTable
    """
    chrx_interval = hl.parse_locus_interval('chrX')
    chrx_mt = hl.filter_intervals(mt, [chrx_interval])

    # Rebuild each call from its alt-allele count so that phasing is dropped.
    unphased_call = hl.unphased_diploid_gt_index_call(
        chrx_mt.GT.n_alt_alleles())
    chrx_unphased = chrx_mt.select_entries(GT=unphased_call)

    imputed = hl.impute_sex(
        chrx_unphased.GT,
        aaf_threshold=0.05,
        female_threshold=female_threshold,
        male_threshold=male_threshold,
    )
    imputed.export(out_internal_mt_prefix + '.sex_check.txt.bgz')

    # Only the F statistic and the sex call are copied onto the columns.
    kept_fields = imputed.select('f_stat', 'is_female')
    return mt.annotate_cols(**kept_fields[mt.col_key])
def annotate_sex(mt: hl.MatrixTable,
                 male_threshold: float = 0.6,
                 female_threshold: float = 0.4) -> hl.MatrixTable:
    """
    Impute sex on an unphased copy of ``mt`` and annotate its columns.

    The MatrixTable is first passed through ``unphase_mt``; the returned
    MatrixTable is that unphased version with column fields 'f_stat' and
    'is_female' added.

    :param MatrixTable mt: MT containing samples to be ascertained for sex
    :param float male_threshold: forwarded to ``hl.impute_sex``
    :param float female_threshold: forwarded to ``hl.impute_sex``
    :return: unphased MatrixTable annotated with 'f_stat' and 'is_female'
    :rtype: MatrixTable
    """
    unphased = unphase_mt(mt)

    imputed = hl.impute_sex(
        unphased.GT,
        aaf_threshold=0.05,
        female_threshold=female_threshold,
        male_threshold=male_threshold,
        include_par=False,
    )

    # Only the F statistic and the sex call are copied onto the columns.
    kept_fields = imputed.select('f_stat', 'is_female')
    return unphased.annotate_cols(**kept_fields[unphased.col_key])
def test_impute_sex_same_as_plink(self):
    """Check hl.impute_sex against ``plink --check-sex`` on the X-chromosome VCF.

    Also checks that passing a precomputed alternate-allele-frequency row
    field ('aaf') gives the same table as letting impute_sex compute it.
    """
    import subprocess as sp

    ds = hl.import_vcf(resource('x-chromosome.vcf'))
    sex = hl.impute_sex(ds.GT, include_par=True)

    vcf_file = utils.uri_path(utils.new_temp_file(prefix="plink", suffix="vcf"))
    out_file = utils.uri_path(utils.new_temp_file(prefix="plink"))
    hl.export_vcf(ds, vcf_file)

    plink_cmd = [
        "plink", "--vcf", vcf_file, "--const-fid", "--check-sex",
        "--silent", "--out", out_file,
    ]
    try:
        sp.check_output(plink_cmd, stderr=sp.STDOUT)
    except sp.CalledProcessError as e:
        # Surface plink's own output before failing the test.
        print(e.output)
        raise e

    plink_sex = hl.import_table(
        out_file + '.sexcheck',
        delimiter=' +',
        types={'SNPSEX': hl.tint32, 'F': hl.tfloat64},
    )
    plink_sex = plink_sex.select('IID', 'SNPSEX', 'F')

    # SNPSEX 2 -> female, 1 -> male, anything else -> missing.
    snpsex = plink_sex.SNPSEX
    plink_is_female = hl.cond(
        snpsex == 2,
        True,
        hl.cond(snpsex == 1, False, hl.null(hl.tbool)),
    )
    plink_sex = plink_sex.select(
        s=plink_sex.IID,
        is_female=plink_is_female,
        f_stat=plink_sex.F,
    ).key_by('s')

    sex = sex.select(s=sex.s, is_female=sex.is_female, f_stat=sex.f_stat)
    self.assertTrue(plink_sex._same(sex.select_globals(), tolerance=1e-3))

    ds = ds.annotate_rows(aaf=(agg.call_stats(ds.GT, ds.alleles)).AF[1])
    computed_aaf = hl.impute_sex(ds.GT)
    supplied_aaf = hl.impute_sex(ds.GT, aaf='aaf')
    self.assertTrue(computed_aaf._same(supplied_aaf))
def check_sex(mt):
    '''
    Impute sex and flag samples whose imputed sex disagrees with the reported one.

    The imputation struct from hl.impute_sex is stored in the column field
    "imputedSex". "sex_filter" compares imputedSex.is_female with
    (reported_sex == 'F'), so any reported value other than 'F'
    (including 'U') is treated as "not female" in the comparison.

    :param mt: hail matrix table which contains a reported sex column of
        'F', 'M', or 'U' named "reported_sex"
    :return: hail matrix table with the new columns "imputedSex" and
        "sex_filter" (the sex discrepancy flag)
    '''
    imputed = hl.impute_sex(mt.GT)
    with_imputed = mt.annotate_cols(imputedSex=imputed[mt.s])

    reported_female = with_imputed.reported_sex == 'F'
    discrepancy = with_imputed.imputedSex.is_female != reported_female
    return with_imputed.annotate_cols(sex_filter=discrepancy)
def impute_sex(mt):
    """Return a dict mapping each sample ID to its imputed ``is_female`` value.

    The mapping is built from the ``s`` field carried by every row of the
    ``hl.impute_sex`` result, rather than by zipping the rows with a
    separately collected list of sample IDs: the two collections are not
    guaranteed to come back in the same order, and a positional zip would
    then silently attach a sex call to the wrong sample.

    :param mt: hail matrix table with a ``GT`` entry field, keyed by sample ``s``
    :return: dict of sample ID -> is_female, exactly as reported by
        hl.impute_sex
    """
    imputed_rows = hl.impute_sex(mt.GT).collect()
    # Each row also carries f_stat, observed_homs and expected_homs for ad-hoc debugging.
    return {row.s: row.is_female for row in imputed_rows}
def impute_sex_plot(mt, args, mt_to_annotate=None):
    """
    Impute sex of individuals and save a histogram of the resulting F statistics.

    The histogram is written to "<date>_imputed_sex_fstat_hist.html".

    :param mt: maf pruned matrix table used to calculate f stat values
    :param args: namespace providing ``female_threshold`` and
        ``male_threshold``; when ``mt_to_annotate`` is given, ``sex_col``,
        ``male_tag`` and ``female_tag`` are set on it as a side effect
    :param mt_to_annotate: optional matrix table to add sex information to
    :return: ``(mt, imputed_sex)`` when ``mt_to_annotate`` is None (``mt`` is
        returned unchanged); otherwise ``(mt, imputed_sex, mt_to_annotate)``
        with both matrix tables annotated
    """
    datestr = time.strftime("%Y.%m.%d")

    imputed_sex = hl.impute_sex(
        mt.GT,
        female_threshold=args.female_threshold,
        male_threshold=args.male_threshold,
    )

    sex_count = imputed_sex.aggregate(hl.agg.counter(imputed_sex.is_female))
    logging.info(f'Imputed sex count: {sex_count}')

    # The histogram range is taken from the observed min/max of the F statistic.
    fstat_stats = imputed_sex.aggregate(hl.agg.stats(imputed_sex.f_stat))
    fstat_hist = imputed_sex.aggregate(
        hl.agg.hist(imputed_sex.f_stat, fstat_stats.min, fstat_stats.max, 50))

    output_file(f"{datestr}_imputed_sex_fstat_hist.html")
    save(hl.plot.histogram(fstat_hist, legend='F stat',
                           title='F stat histogram'))

    if mt_to_annotate is None:
        return mt, imputed_sex

    thresholds = {
        'female_threshold': args.female_threshold,
        'male_threshold': args.male_threshold,
    }

    target_sex = imputed_sex[mt_to_annotate.s]
    mt_to_annotate = mt_to_annotate.annotate_cols(
        is_female_imputed=target_sex.is_female,
        f_stat=target_sex.f_stat,
    )
    mt_to_annotate = mt_to_annotate.annotate_globals(
        sex_imputation_thresholds=thresholds)

    mt = mt.annotate_cols(is_female_imputed=imputed_sex[mt.s].is_female)
    mt = mt.annotate_globals(sex_imputation_thresholds=thresholds)

    # Point the caller's configuration at the imputed sex column.
    args.sex_col = "is_female_imputed"
    args.male_tag = False
    args.female_tag = True

    return mt, imputed_sex, mt_to_annotate
def sex_violations(mt, input_type):
    """Drop samples whose reported sex disagrees with the imputed sex (step 4).

    The reported sex is read from ``mt.is_female`` when ``input_type`` is
    "plink" and from ``mt.annotations.Sex`` otherwise. Samples with a missing
    reported sex are never excluded.

    :param mt: hail matrix table with a ``GT`` entry field and sample key ``s``
    :param input_type: "plink" or any other value (metadata-file input)
    :return: tuple ``(mt, results)`` where ``results['sex_excluded']`` is the
        number of removed samples
    """
    imputed_sex = hl.impute_sex(mt.GT)
    imputed_is_female = imputed_sex[mt.s].is_female

    if input_type == "plink":
        # TODO (from original): verify that missing sex info is set to None.
        reported = mt.is_female
    else:
        # TODO (from original): verify that column formatting is kept when
        # the meta file is read in.
        # NOTE(review): this compares annotations.Sex directly with a boolean;
        # confirm the field's type matches is_female.
        reported = mt.annotations.Sex

    # `expr is not None` is a Python identity check and is always True for a
    # Hail expression; missingness has to be tested with hl.is_defined.
    mismatch = (reported != imputed_is_female) & hl.is_defined(reported)
    sex_exclude = mt.filter_cols(mismatch).s.collect()

    if sex_exclude:
        mt = mt.filter_cols(hl.literal(sex_exclude).contains(mt['s']),
                            keep=False)

    results = {'sex_excluded': len(sex_exclude)}
    return mt, results
def filter_sex_check(mt, fhet_y, fhet_x):
    """Remove samples whose chrX F statistic contradicts ``is_female`` (step 3).

    A sample is removed when ``is_female`` is False and its F statistic is
    below ``fhet_x``, or when ``is_female`` is True and its F statistic is
    above ``fhet_y``.

    :param mt: hail matrix table with ``GT``, sample key ``s`` and a column
        field ``is_female``
    :param fhet_y: F statistic bound applied to samples with is_female True
    :param fhet_x: F statistic bound applied to samples with is_female False
    :return: tuple ``(mt, results)``; ``results`` holds the removed-sample
        count, the plot returned by ``fstat_plt`` and a DataFrame of removed
        sample IDs
    """
    imputed_sex = hl.impute_sex(mt.GT)
    f_stat = imputed_sex[mt.s].f_stat

    # Explicit comparisons are kept: these are Hail expressions, not Python bools.
    not_female_low_f = (f_stat < fhet_x) & (mt.is_female == False)
    female_high_f = (f_stat > fhet_y) & (mt.is_female == True)

    f_stat_out = mt.filter_cols(not_female_low_f | female_high_f).s.collect()
    if f_stat_out:
        removed = hl.literal(f_stat_out)
        mt = mt.filter_cols(removed.contains(mt['s']), keep=False)

    from .test_plots import fstat_plt
    import pandas as pd

    results = {
        'sex_check_removed': len(f_stat_out),
        'sex_check_plot': fstat_plt(imputed_sex, fhet_y, fhet_x),
        'sex_check_table': pd.DataFrame(f_stat_out, columns=['SampleID']),
    }
    return mt, results
def annotate_sex(
    mtds: Union[hl.MatrixTable, hl.vds.VariantDataset],
    is_sparse: bool = True,
    excluded_intervals: Optional[hl.Table] = None,
    included_intervals: Optional[hl.Table] = None,
    normalization_contig: str = "chr20",
    reference_genome: str = "GRCh38",
    sites_ht: Optional[hl.Table] = None,
    aaf_expr: Optional[str] = None,
    gt_expr: str = "GT",
    f_stat_cutoff: float = 0.5,
    aaf_threshold: float = 0.001,
) -> hl.Table:
    """
    Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.

    Return Table with the following fields:
        - s (str): Sample
        - chr20_mean_dp (float32): Sample's mean coverage over chromosome 20.
        - chrX_mean_dp (float32): Sample's mean coverage over chromosome X.
        - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y.
        - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X.
        - chrY_ploidy (float32): Sample's imputed ploidy over chromosome Y.
        - f_stat (float64): Sample f-stat. Calculated using hl.impute_sex.
        - n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex.
        - expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex.
        - observed_homs (int64): Observed number of homozygotes. Calculated using hl.impute_sex.
        - X_karyotype (str): Sample's chromosome X karyotype.
        - Y_karyotype (str): Sample's chromosome Y karyotype.
        - sex_karyotype (str): Sample's sex karyotype.

    :param mtds: Input MatrixTable or VariantDataset
    :param bool is_sparse: Whether input MatrixTable is in sparse data format
    :param excluded_intervals: Optional table of intervals to exclude from the computation.
        Not supported when the input is a VariantDataset.
    :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes.
    :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies.
    :param reference_genome: Reference genome used for constructing interval list. Default: 'GRCh38'
    :param sites_ht: Optional Table to use. If present, filters input MatrixTable to sites in this Table prior to imputing sex,
        and pulls alternate allele frequency from this Table.
    :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency.
    :param gt_expr: Name of entry field storing the genotype. Default: 'GT'
    :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff.
    :param float aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations.
    :return: Table of samples and their imputed sex karyotypes.
    :raises NotImplementedError: if `excluded_intervals` is given with a VariantDataset,
        or if a dense (non-sparse) MatrixTable is supplied.
    """
    logger.info("Imputing sex chromosome ploidies...")

    is_vds = isinstance(mtds, hl.vds.VariantDataset)
    if is_vds:
        if excluded_intervals is not None:
            raise NotImplementedError(
                "excluded_intervals is not used when imputing sex chromosome ploidy for VDS"
            )
        ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
            mtds,
            calling_intervals=included_intervals,
            normalization_contig=normalization_contig,
        )
        # Align the Hail field names with the ones produced for MatrixTable input.
        ploidy_ht = ploidy_ht.rename(
            {"x_ploidy": "chrX_ploidy", "y_ploidy": "chrY_ploidy"}
        )
        mt = mtds.variant_data
    else:
        mt = mtds
        if is_sparse:
            ploidy_ht = impute_sex_ploidy(
                mt, excluded_intervals, included_intervals, normalization_contig
            )
        else:
            raise NotImplementedError(
                "Imputing sex ploidy does not exist yet for dense data."
            )

    x_contigs = get_reference_genome(mt.locus).x_contigs
    logger.info("Filtering mt to biallelic SNPs in X contigs: %s", x_contigs)
    if "was_split" in list(mt.row):
        # Already split: keep only sites that were biallelic before splitting.
        mt = mt.filter_rows((~mt.was_split) & hl.is_snp(mt.alleles[0], mt.alleles[1]))
    else:
        mt = mt.filter_rows(
            (hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1])
        )

    mt = hl.filter_intervals(
        mt,
        [
            hl.parse_locus_interval(contig, reference_genome=reference_genome)
            for contig in x_contigs
        ],
        keep=True,
    )

    if sites_ht is not None:
        if aaf_expr is None:
            logger.warning(
                "sites_ht was provided, but aaf_expr is missing. Assuming name of field with alternate allele frequency is 'AF'."
            )
            aaf_expr = "AF"
        logger.info("Filtering to provided sites")
        mt = mt.annotate_rows(**sites_ht[mt.row_key])
        # Rows absent from sites_ht have a missing frequency and are dropped here.
        mt = mt.filter_rows(hl.is_defined(mt[aaf_expr]))

    logger.info("Calculating inbreeding coefficient on chrX")
    # A single cutoff is used for both thresholds.
    sex_ht = hl.impute_sex(
        mt[gt_expr],
        aaf_threshold=aaf_threshold,
        male_threshold=f_stat_cutoff,
        female_threshold=f_stat_cutoff,
        aaf=aaf_expr,
    )

    logger.info("Annotating sex ht with sex chromosome ploidies")
    sex_ht = sex_ht.annotate(**ploidy_ht[sex_ht.key])

    logger.info("Inferring sex karyotypes")
    x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs(sex_ht, f_stat_cutoff)
    sex_ht = sex_ht.annotate_globals(
        x_ploidy_cutoffs=hl.struct(
            upper_cutoff_X=x_ploidy_cutoffs[0],
            lower_cutoff_XX=x_ploidy_cutoffs[1][0],
            upper_cutoff_XX=x_ploidy_cutoffs[1][1],
            lower_cutoff_XXX=x_ploidy_cutoffs[2],
        ),
        y_ploidy_cutoffs=hl.struct(
            lower_cutoff_Y=y_ploidy_cutoffs[0][0],
            upper_cutoff_Y=y_ploidy_cutoffs[0][1],
            lower_cutoff_YY=y_ploidy_cutoffs[1],
        ),
        f_stat_cutoff=f_stat_cutoff,
    )
    return sex_ht.annotate(
        **get_sex_expr(
            sex_ht.chrX_ploidy, sex_ht.chrY_ploidy, x_ploidy_cutoffs, y_ploidy_cutoffs
        )
    )
# (Continuation of a call that begins above this view: the parsed variant
# fields from column f0 are splatted in as keyword arguments.)
**hl.parse_variant(ht_pruned_chrx_variants.f0, reference_genome='GRCh38'))
ht_pruned_chrx_variants = ht_pruned_chrx_variants.key_by(
    ht_pruned_chrx_variants.locus, ht_pruned_chrx_variants.alleles)

# Restrict the hard-call MatrixTable to the initial samples and to the
# pruned chrX variant list.
mt = hl.read_matrix_table(MT_HARDCALLS)
mt = mt.filter_cols(hl.is_defined(ht_initial_samples[mt.col_key]))
mt = mt.filter_rows(hl.is_defined(ht_pruned_chrx_variants[mt.row_key]))

# count() returns (n_rows, n_cols), i.e. (variants, samples).
n = mt.count()

print('n samples:')
print(n[1])
print('n variants:')
print(n[0])

# The same cutoff (0.6) is used for both the female and male thresholds.
imputed_sex = hl.impute_sex(mt.GT, female_threshold=0.6, male_threshold=0.6)
mt = mt.annotate_cols(phenotype=sample_annotations[mt.s])
mt = mt.annotate_cols(impute_sex=imputed_sex[mt.s])

# Export a flattened TSV of the imputation struct and phenotype per sample.
mt.cols().select('impute_sex', 'phenotype').flatten().export(IMPUTESEX_FILE)
# Want to change this to reflect the dataset that I have.
mt.cols().write(IMPUTESEX_TABLE, overwrite=True)

# Determine non-missing allele count on the y.
# The hard calls are re-read so the chrX variant filter above does not apply.
mt = hl.read_matrix_table(MT_HARDCALLS)
mt = mt.filter_cols(hl.is_defined(ht_initial_samples[mt.col_key]))
mt = mt.filter_rows(mt.locus.in_y_nonpar() | mt.locus.in_y_par())
mt = hl.sample_qc(mt, name='qc')

# Export the per-sample number of called chrY genotypes.
mt_cols = mt.cols()
mt_cols.select(n_called=mt_cols.qc.n_called).export(Y_NCALLED)
pprint(a)
# Keep variants whose first alternate allele has frequency >= 1%.
mt_AF = mt.filter_rows(mt.variant_qc.AF[1] >= 0.01)

######## 3. QUALITY CONTROL SAMPLES
######## 3.1 Filter samples for outliers more than (6 * SD) from mean (Part 1)
# Calculate sample statistics
mt = hl.sample_qc(mt)
# Calculate statistics on sample statistics
stats_singleton = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.n_singleton))
stats_ti_tv = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.r_ti_tv))
stats_het_hom_var = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.r_het_hom_var))
stats_het = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.n_het))

######## 3.2 Sex check on chromosome X (inbreeding coefficient)
# Determine sex from GT calls in sex chromosomes
t = hl.impute_sex(mt.GT)
# Only keep those where genetic sex matches self-reported Sex
mt = mt.filter_cols(t[mt.s].is_female == mt.is_female)

######## 3.3 Check for genetic relationship / "duplicates"
# Calculate identity-by-descent table
mt_relatedness = hl.identity_by_descent(mt)
# Keep pairs of samples with PI_HAT > 0.2.
# (The original referenced an undefined name `relatedness` here.)
t_ibd = mt_relatedness.filter(mt_relatedness.ibd.PI_HAT > 0.2)
# Hail tables are immutable: the re-keyed objects must be assigned back,
# otherwise these calls have no effect.
t_ibd = t_ibd.key_by('i')
mt = mt.key_cols_by("s")
# Collect the IDs of the related samples in t_ibd (the `i` side of each pair)
ibd_idx = t_ibd.aggregate(hl.agg.collect_as_set(t_ibd.i))
# Keep only the columns whose sample ID is in the related set. The explicit
# dtype lets the literal be built even when the set is empty.
# hl.is_defined() on the collected Python set did not filter anything.
mt_ibd = mt.filter_cols(hl.literal(ibd_idx, 'set<str>').contains(mt.s))

######### 3.3 Filter samples for outliers more than (6 * SD) from mean (Part 2)
#vds5 = hl.read_matrix_table(vds_common_file) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # sex imputation #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ print("sex imputation...") vdsnopar = vds5.filter_rows(hl.is_defined(par[vds5.locus]), keep=False) vdsnopar = vdsnopar.annotate_cols( ydp=hl.agg.count_where((vdsnopar.locus.contig == 'chrY') & (hl.is_defined(vdsnopar.GT)))) vdsx = vdsnopar.filter_rows((vdsnopar.locus.contig == "chrX") & (vdsnopar.variant_qc.AF >= 0.05) & (vdsnopar.variant_qc.AF <= 0.95)) ct = hl.impute_sex(vdsx.GT, female_threshold=0.6, male_threshold=0.7) vdsct = vdsnopar.cols() ct = ct.annotate(ydp=vdsct[ct.s].ydp) (ct.select(ID=ct.s, sexFstat=ct.f_stat, isFemale=ct.is_female, ydp=ct.ydp).export(sample_sex_fstat_file)) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # ld pruning #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ print("LD pruning...") vds5_ldp = hl.ld_prune(vds5, n_cores=1600, r2=0.1) #vds5_ldp = hl.ld_prune(vds5, n_cores=60, r2=0.2, window=1000000, memory_per_core=512) print("writing LD pruned VDS...")
def checkSex(mt):
    """Impute sex and flag samples whose imputed sex disagrees with ``reported_sex``.

    Adds the hl.impute_sex struct as column field "imputedSex" and a boolean
    column "sex_filter" that compares imputedSex.is_female with
    (reported_sex == 'F').

    :param mt: hail matrix table with ``GT``, sample key ``s`` and a column
        field ``reported_sex``
    :return: hail matrix table with "imputedSex" and "sex_filter" columns
    """
    imputed = hl.impute_sex(mt.GT)
    with_imputed = mt.annotate_cols(imputedSex=imputed[mt.s])

    reported_female = with_imputed.reported_sex == 'F'
    discrepancy = with_imputed.imputedSex.is_female != reported_female
    return with_imputed.annotate_cols(sex_filter=discrepancy)
def annotate_sex(
    mtds: Union[hl.MatrixTable, hl.vds.VariantDataset],
    is_sparse: bool = True,
    excluded_intervals: Optional[hl.Table] = None,
    included_intervals: Optional[hl.Table] = None,
    normalization_contig: str = "chr20",
    sites_ht: Optional[hl.Table] = None,
    aaf_expr: Optional[str] = None,
    gt_expr: str = "GT",
    f_stat_cutoff: float = 0.5,
    aaf_threshold: float = 0.001,
    variants_only_x_ploidy: bool = False,
    variants_only_y_ploidy: bool = False,
) -> hl.Table:
    """
    Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.

    Return Table with the following fields:
        - s (str): Sample
        - `normalization_contig`_mean_dp (float32): Sample's mean coverage over the specified `normalization_contig`.
        - chrX_mean_dp (float32): Sample's mean coverage over chromosome X.
        - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y.
        - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X.
        - chrY_ploidy (float32): Sample's imputed ploidy over chromosome Y.
        - f_stat (float64): Sample f-stat. Calculated using hl.impute_sex.
        - n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex.
        - expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex.
        - observed_homs (int64): Observed number of homozygotes. Calculated using hl.impute_sex.
        - X_karyotype (str): Sample's chromosome X karyotype.
        - Y_karyotype (str): Sample's chromosome Y karyotype.
        - sex_karyotype (str): Sample's sex karyotype.

    When a ploidy estimate uses only variant data, the normalization contig mean DP derived
    from it is stored as `var_data_<normalization_contig>_mean_dp`.

    :param mtds: Input MatrixTable or VariantDataset
    :param bool is_sparse: Whether input MatrixTable is in sparse data format
    :param excluded_intervals: Optional table of intervals to exclude from the computation.
        Not supported when the input is a VariantDataset.
    :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes.
    :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies.
    :param sites_ht: Optional Table to use. If present, filters input MatrixTable to sites in this Table prior to imputing sex,
        and pulls alternate allele frequency from this Table.
    :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency.
    :param gt_expr: Name of entry field storing the genotype. Default: 'GT'
    :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff.
    :param float aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations.
    :param variants_only_x_ploidy: Whether to use depth of only variant data for the x ploidy estimation.
    :param variants_only_y_ploidy: Whether to use depth of only variant data for the y ploidy estimation.
    :return: Table of samples and their imputed sex karyotypes.
    :raises NotImplementedError: if `excluded_intervals` is given with a VariantDataset,
        or if a dense (non-sparse) MatrixTable is supplied.
    """
    logger.info("Imputing sex chromosome ploidies...")

    is_vds = isinstance(mtds, hl.vds.VariantDataset)
    if is_vds:
        if excluded_intervals is not None:
            raise NotImplementedError(
                "The use of the parameter 'excluded_intervals' is currently not implemented for imputing sex chromosome ploidy on a VDS!"
            )
        # Begin by creating a ploidy estimate HT using the method defined by 'variants_only_x_ploidy'
        ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
            mtds,
            calling_intervals=included_intervals,
            normalization_contig=normalization_contig,
            use_variant_dataset=variants_only_x_ploidy,
        )
        ploidy_ht = ploidy_ht.rename({
            "x_ploidy": "chrX_ploidy",
            "y_ploidy": "chrY_ploidy",
            "x_mean_dp": "chrX_mean_dp",
            "y_mean_dp": "chrY_mean_dp",
            "autosomal_mean_dp": f"var_data_{normalization_contig}_mean_dp"
            if variants_only_x_ploidy
            else f"{normalization_contig}_mean_dp",
        })
        # If 'variants_only_y_ploidy' is different from 'variants_only_x_ploidy' then re-run the ploidy estimation using
        # the method defined by 'variants_only_y_ploidy' and re-annotate with the modified ploidy estimates.
        if variants_only_y_ploidy != variants_only_x_ploidy:
            y_ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
                mtds,
                calling_intervals=included_intervals,
                normalization_contig=normalization_contig,
                use_variant_dataset=variants_only_y_ploidy,
            )
            y_ploidy_idx = y_ploidy_ht[ploidy_ht.key]
            ploidy_ht = ploidy_ht.annotate(
                chrY_ploidy=y_ploidy_idx.y_ploidy,
                chrY_mean_dp=y_ploidy_idx.y_mean_dp,
            )
            # If the `variants_only_y_ploidy' is True modify the name of the normalization contig mean DP to indicate
            # that this is the variant dataset only mean DP (this will have already been added if
            # 'variants_only_x_ploidy' was also True).
            if variants_only_y_ploidy:
                ploidy_ht = ploidy_ht.annotate(
                    **{
                        f"var_data_{normalization_contig}_mean_dp":
                        y_ploidy_idx.autosomal_mean_dp
                    })
        mt = mtds.variant_data
    else:
        mt = mtds
        if is_sparse:
            ploidy_ht = impute_sex_ploidy(
                mt,
                excluded_intervals,
                included_intervals,
                normalization_contig,
                use_only_variants=variants_only_x_ploidy,
            )
            # NOTE(review): this rename expects impute_sex_ploidy to emit 'autosomal_mean_dp', while the select
            # below expects f"{normalization_contig}_mean_dp" -- confirm the field name against the helper.
            ploidy_ht = ploidy_ht.rename({
                "autosomal_mean_dp": f"var_data_{normalization_contig}_mean_dp"
                if variants_only_x_ploidy
                else f"{normalization_contig}_mean_dp",
            })
            # If 'variants_only_y_ploidy' is different from 'variants_only_x_ploidy' then re-run the ploidy estimation
            # using the method defined by 'variants_only_y_ploidy' and re-annotate with the modified ploidy estimates.
            if variants_only_y_ploidy != variants_only_x_ploidy:
                y_ploidy_ht = impute_sex_ploidy(
                    mt,
                    excluded_intervals,
                    included_intervals,
                    normalization_contig,
                    use_only_variants=variants_only_y_ploidy,
                )
                # Hail tables are immutable, so the selection must be assigned back. Without it, the annotate
                # below would also overwrite the chrX fields with the values computed for the Y method.
                y_ploidy_ht = y_ploidy_ht.select(
                    "chrY_ploidy",
                    "chrY_mean_dp",
                    f"{normalization_contig}_mean_dp",
                )
                # If the `variants_only_y_ploidy' is True modify the name of the normalization contig mean DP to indicate
                # that this is the variant dataset only mean DP (this will have already been added if
                # 'variants_only_x_ploidy' was also True). The rename is applied to the Y table, since that is the
                # one computed from variant data only, mirroring the VDS branch above.
                if variants_only_y_ploidy:
                    y_ploidy_ht = y_ploidy_ht.rename({
                        f"{normalization_contig}_mean_dp":
                        f"var_data_{normalization_contig}_mean_dp"
                    })
                # Re-annotate the ploidy HT with modified Y ploidy annotations
                ploidy_ht = ploidy_ht.annotate(**y_ploidy_ht[ploidy_ht.key])
        else:
            raise NotImplementedError(
                "Imputing sex ploidy does not exist yet for dense data.")

    x_contigs = get_reference_genome(mt.locus).x_contigs
    logger.info("Filtering mt to biallelic SNPs in X contigs: %s", x_contigs)
    if "was_split" in list(mt.row):
        # Already split: keep only sites that were biallelic before splitting.
        mt = mt.filter_rows((~mt.was_split)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))
    else:
        mt = mt.filter_rows((hl.len(mt.alleles) == 2)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))

    # Parse the X contig intervals against the dataset's own reference genome.
    build = get_reference_genome(mt.locus).name
    mt = hl.filter_intervals(
        mt,
        [
            hl.parse_locus_interval(contig, reference_genome=build)
            for contig in x_contigs
        ],
        keep=True,
    )

    if sites_ht is not None:
        if aaf_expr is None:
            logger.warning(
                "sites_ht was provided, but aaf_expr is missing. Assuming name of field with alternate allele frequency is 'AF'."
            )
            aaf_expr = "AF"
        logger.info("Filtering to provided sites")
        mt = mt.annotate_rows(**sites_ht[mt.row_key])
        # Rows absent from sites_ht have a missing frequency and are dropped here.
        mt = mt.filter_rows(hl.is_defined(mt[aaf_expr]))

    logger.info("Calculating inbreeding coefficient on chrX")
    # A single cutoff is used for both thresholds.
    sex_ht = hl.impute_sex(
        mt[gt_expr],
        aaf_threshold=aaf_threshold,
        male_threshold=f_stat_cutoff,
        female_threshold=f_stat_cutoff,
        aaf=aaf_expr,
    )

    logger.info("Annotating sex ht with sex chromosome ploidies")
    sex_ht = sex_ht.annotate(**ploidy_ht[sex_ht.key])

    logger.info("Inferring sex karyotypes")
    x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs(
        sex_ht, f_stat_cutoff)
    sex_ht = sex_ht.annotate_globals(
        x_ploidy_cutoffs=hl.struct(
            upper_cutoff_X=x_ploidy_cutoffs[0],
            lower_cutoff_XX=x_ploidy_cutoffs[1][0],
            upper_cutoff_XX=x_ploidy_cutoffs[1][1],
            lower_cutoff_XXX=x_ploidy_cutoffs[2],
        ),
        y_ploidy_cutoffs=hl.struct(
            lower_cutoff_Y=y_ploidy_cutoffs[0][0],
            upper_cutoff_Y=y_ploidy_cutoffs[0][1],
            lower_cutoff_YY=y_ploidy_cutoffs[1],
        ),
        f_stat_cutoff=f_stat_cutoff,
        variants_only_x_ploidy=variants_only_x_ploidy,
        variants_only_y_ploidy=variants_only_y_ploidy,
    )
    return sex_ht.annotate(
        **get_sex_expr(sex_ht.chrX_ploidy, sex_ht.chrY_ploidy,
                       x_ploidy_cutoffs, y_ploidy_cutoffs))
# (Continuation of a call that begins above this view: the split
# MatrixTable is written to a checkpoint path, overwriting any existing one.)
f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-split-multi_checkpoint.mt", overwrite=True)
print("Finished splitting and writing mt. ")

# Label every row as "SNP", "INDEL" (insertion or deletion) or "Other",
# based on its first two alleles.
mt = mt_split.annotate_rows(
    Variant_Type=hl.cond((hl.is_snp(mt_split.alleles[0], mt_split.alleles[1])), "SNP",
                         hl.cond(
        hl.is_insertion(
            mt_split.alleles[0], mt_split.alleles[1]),
        "INDEL",
        hl.cond(hl.is_deletion(mt_split.alleles[0],
                               mt_split.alleles[1]), "INDEL", "Other"))))

mt_sampleqc = hl.sample_qc(mt, name='sample_QC_Hail')

# NOTE(review): this table is taken from the columns BEFORE the `sex`
# annotation below is added, so the file exported at the end does not
# contain a sex column even though its name says "sex_annotated". Confirm
# whether that is intended.
panda_df_unfiltered_table = mt_sampleqc.cols().flatten()

print("Sex imputation:")

#mt2_sex = mt2.select_entries(GT=hl.unphased_diploid_gt_index_call(mt2.GT.n_alt_alleles()))
imputed_sex = hl.impute_sex(mt_sampleqc.GT)

# Annotate samples male or female:
mt = mt_sampleqc.annotate_cols(sex=hl.cond(
    imputed_sex[mt_sampleqc.s].is_female, "female", "male"))

print("Outputting table of sample qc")
panda_df_unfiltered_table.export(
    f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}_sampleQC_unfiltered_sex_annotated.tsv.bgz", header=True)

# mt2 = hl.variant_qc(mt_sampleqc, name='variant_QC_Hail')
#print('Exporting variant qc pandas table to disk')
# mt_rows = mt2.rows()
# mt_rows.select(mt_rows.variant_QC_Hail).flatten().export(f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}_variantQC_unfiltered.tsv.bgz",
#                                                         header=True)