def test_annotate_intervals(self):
    ds = get_dataset()

    bed1 = hl.import_bed(resource('example1.bed'), reference_genome='GRCh37')
    bed2 = hl.import_bed(resource('example2.bed'), reference_genome='GRCh37')
    bed3 = hl.import_bed(resource('example3.bed'), reference_genome='GRCh37')
    self.assertTrue(list(bed2.key.dtype) == ['interval'])
    self.assertTrue(list(bed2.row.dtype) == ['interval', 'target'])

    interval_list1 = hl.import_locus_intervals(resource('exampleAnnotation1.interval_list'))
    interval_list2 = hl.import_locus_intervals(resource('exampleAnnotation2.interval_list'))
    self.assertTrue(list(interval_list2.key.dtype) == ['interval'])
    self.assertTrue(list(interval_list2.row.dtype) == ['interval', 'target'])

    ann = ds.annotate_rows(in_interval=bed1[ds.locus]).rows()
    self.assertTrue(ann.all((ann.locus.position <= 14000000)
                            | (ann.locus.position >= 17000000)
                            | (hl.is_missing(ann.in_interval))))

    for bed in [bed2, bed3]:
        ann = ds.annotate_rows(target=bed[ds.locus].target).rows()
        expr = (hl.case()
                .when(ann.locus.position <= 14000000, ann.target == 'gene1')
                .when(ann.locus.position >= 17000000, ann.target == 'gene2')
                .default(ann.target == hl.null(hl.tstr)))
        self.assertTrue(ann.all(expr))

    self.assertTrue(ds.annotate_rows(in_interval=interval_list1[ds.locus]).rows()
                    ._same(ds.annotate_rows(in_interval=bed1[ds.locus]).rows()))
    self.assertTrue(ds.annotate_rows(target=interval_list2[ds.locus].target).rows()
                    ._same(ds.annotate_rows(target=bed2[ds.locus].target).rows()))
def test_import_locus_intervals_badly_defined_intervals(self):
    interval_file = resource('example3.interval_list')
    t = hl.import_locus_intervals(interval_file,
                                  reference_genome='GRCh37',
                                  skip_invalid_intervals=True)
    self.assertTrue(t.count() == 21)

    t = hl.import_locus_intervals(interval_file,
                                  reference_genome=None,
                                  skip_invalid_intervals=True)
    self.assertTrue(t.count() == 22)
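# Note on the two counts above: with reference_genome='GRCh37' attached, Hail
# validates each interval's contig and coordinates against the reference, and
# skip_invalid_intervals=True drops the one line that fails validation (21 rows);
# with reference_genome=None there are no bounds to check, so all 22 lines parse.
# A minimal sketch of inspecting what survives (the file path is hypothetical):
t = hl.import_locus_intervals('data/my.interval_list',
                              reference_genome='GRCh37',
                              skip_invalid_intervals=True)
t.show()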
def annotate_in_segdup(mt, genome_version="GRCh38"):
    if genome_version == "GRCh37":
        segdup_regions = hl.import_locus_intervals(
            "gs://broad-dsp-spec-ops/scratch/weisburd/ref/GRCh37/GRCh37GenomicSuperDup.bed",
            reference_genome="GRCh37")
    elif genome_version == "GRCh38":
        segdup_regions = hl.import_locus_intervals(
            "gs://broad-dsp-spec-ops/scratch/weisburd/ref/GRCh38/GRCh38GenomicSuperDup.without_decoys.bed",
            reference_genome="GRCh38")
    else:
        raise ValueError(f"Invalid genome version: {genome_version}")

    return mt.annotate_rows(info=mt.info.annotate(
        in_segdup=hl.is_defined(segdup_regions[mt.locus])))
def annotate_in_LCR(mt, genome_version="GRCh38"):
    if genome_version == "GRCh37":
        lcr_regions = hl.import_locus_intervals(
            "gs://broad-dsp-spec-ops/scratch/weisburd/ref/GRCh37/grch37_LCRs_without_decoys.bed",
            reference_genome="GRCh37")
    elif genome_version == "GRCh38":
        lcr_regions = hl.import_locus_intervals(
            "gs://broad-dsp-spec-ops/scratch/weisburd/ref/GRCh38/grch38_LCRs_without_decoys.bed",
            reference_genome="GRCh38")
    else:
        raise ValueError(f"Invalid genome version: {genome_version}")

    return mt.annotate_rows(info=mt.info.annotate(
        in_LCR=hl.is_defined(lcr_regions[mt.locus])))
def run_platform_imputation(
    mt: hl.MatrixTable,
    plat_min_cluster_size: int,
    plat_min_sample_size: int,
    plat_assignment_pcs: int,
) -> hl.Table:
    """
    Run PCA using sample callrate across Broad's evaluation intervals and create
    a Hail Table with platform PCs and the assigned platform.

    :param MatrixTable mt: QC MatrixTable
    :param plat_min_cluster_size: Minimum cluster size for HDBSCAN clustering
    :param plat_min_sample_size: Minimum sample size for HDBSCAN clustering
    :param plat_assignment_pcs: Number of PCs used for HDBSCAN clustering
    :return: Table with platform PCs and assigned platform
    :rtype: Table
    """
    intervals = hl.import_locus_intervals(
        "gs://gcp-public-data--broad-references/hg38/v0/exome_evaluation_regions.v1.interval_list"
    )
    callrate_mt = compute_callrate_mt(mt, intervals)
    eigenvalues, scores_ht, ignore = run_platform_pca(callrate_mt)
    plat_ht = assign_platform_from_pcs(
        scores_ht,
        hdbscan_min_cluster_size=plat_min_cluster_size,
        hdbscan_min_samples=plat_min_sample_size,
    )
    plat_pcs = {
        f"plat_PC{i+1}": scores_ht.scores[i]
        for i in range(plat_assignment_pcs)
    }
    scores_ht = scores_ht.annotate(**plat_pcs).drop("scores")
    plat_ht = plat_ht.annotate(**scores_ht[plat_ht.key])
    return plat_ht
def run_pipeline(args):
    hl.init(log='./hail_annotation_pipeline.log')
    '''
    rg = hl.get_reference('GRCh37')
    grch37_contigs = [x for x in rg.contigs
                      if not x.startswith('GL') and not x.startswith('M')]
    contig_dict = dict(zip(grch37_contigs, ['chr' + x for x in grch37_contigs]))
    '''
    exome_intervals = hl.import_locus_intervals(
        '/gpfs/ycga/project/lek/shared/resources/hg38/exome_evaluation_regions.v1.interval_list',
        reference_genome='GRCh38')

    #mt = hl.import_vcf(args.vcf, reference_genome='GRCh38', contig_recoding=contig_dict,
    #                   array_elements_required=False, force_bgz=True, filter='MONOALLELIC')
    mt = hl.import_vcf(args.vcf,
                       reference_genome='GRCh38',
                       array_elements_required=False,
                       force_bgz=True,
                       filter='MONOALLELIC')
    mt = mt.filter_rows(hl.is_defined(exome_intervals[mt.locus]))

    pprint.pprint(mt.describe())
    pprint.pprint(mt.show())

    mt = mt.repartition(hl.eval(hl.int(mt.n_partitions() / 10)))
    mt.write(args.out, overwrite=True)
def filter_out_segdups(mt, genome_version="GRCh38"):
    if genome_version == "GRCh38":
        segdup_regions = hl.import_locus_intervals(
            "gs://broad-dsp-spec-ops/scratch/weisburd/ref/GRCh38/GRCh38GenomicSuperDup.without_decoys.bed",
            reference_genome="GRCh38")
    else:
        raise ValueError(f"Invalid genome version: {genome_version}")

    return mt.filter_rows(hl.is_missing(segdup_regions[mt.locus]))
def get_lcr_ht(overwrite: bool = False) -> hl.Table:
    lcr_interval = hl.import_locus_intervals(
        f'{nfs_dir}/resources/grch38/LCRFromHengHg38.txt.bgz',
        skip_invalid_intervals=True,
        min_partitions=50,
        reference_genome='GRCh38')
    return lcr_interval.checkpoint(
        f'{nfs_dir}/resources/grch38/LCRFromHengHg38.ht',
        overwrite=overwrite,
        _read_if_exists=not overwrite)
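# Usage sketch for get_lcr_ht above: checkpoint() writes the Table on the first
# call, and because _read_if_exists=not overwrite, later calls with
# overwrite=False simply read the existing .ht instead of re-importing the
# interval file (assumes nfs_dir is defined in the enclosing module).
lcr_ht = get_lcr_ht(overwrite=False)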
def filter_low_conf_regions(
    mt: Union[hl.MatrixTable, hl.Table],
    filter_lcr: bool = True,
    filter_decoy: bool = True,
    filter_segdup: bool = True,
    filter_exome_low_coverage_regions: bool = False,
    high_conf_regions: Optional[List[str]] = None,
) -> Union[hl.MatrixTable, hl.Table]:
    """
    Filters low-confidence regions.

    :param mt: MatrixTable or Table to filter
    :param filter_lcr: Whether to filter LCR regions
    :param filter_decoy: Whether to filter decoy regions
    :param filter_segdup: Whether to filter segdup regions
    :param filter_exome_low_coverage_regions: Whether to filter exome low-confidence regions
    :param high_conf_regions: Paths to set of high-confidence regions to restrict to (union of regions)
    :return: MatrixTable or Table with low-confidence regions removed
    """
    build = get_reference_genome(mt.locus).name
    if build == "GRCh37":
        import gnomad.resources.grch37.reference_data as resources
    elif build == "GRCh38":
        import gnomad.resources.grch38.reference_data as resources

    criteria = []
    if filter_lcr:
        lcr = resources.lcr_intervals.ht()
        criteria.append(hl.is_missing(lcr[mt.locus]))

    if filter_decoy:
        decoy = resources.decoy_intervals.ht()
        criteria.append(hl.is_missing(decoy[mt.locus]))

    if filter_segdup:
        segdup = resources.seg_dup_intervals.ht()
        criteria.append(hl.is_missing(segdup[mt.locus]))

    if filter_exome_low_coverage_regions:
        high_cov = resources.high_coverage_intervals.ht()
        criteria.append(hl.is_missing(high_cov[mt.locus]))

    if high_conf_regions is not None:
        for region in high_conf_regions:
            region = hl.import_locus_intervals(region)
            criteria.append(hl.is_defined(region[mt.locus]))

    if criteria:
        filter_criteria = functools.reduce(operator.iand, criteria)
        if isinstance(mt, hl.MatrixTable):
            mt = mt.filter_rows(filter_criteria)
        else:
            mt = mt.filter(filter_criteria)

    return mt
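# A minimal usage sketch for filter_low_conf_regions above, assuming a
# locus-keyed MatrixTable and the gnomad resource package on the path; both
# bucket paths are hypothetical:
mt = hl.read_matrix_table('gs://my-bucket/callset.mt')
mt = filter_low_conf_regions(
    mt,
    filter_lcr=True,
    filter_decoy=True,
    filter_segdup=True,
    high_conf_regions=['gs://my-bucket/high_conf.interval_list'])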
def test_import_locus_intervals(self):
    interval_file = resource('annotinterall.interval_list')
    intervals = hl.import_locus_intervals(interval_file, reference_genome='GRCh37')
    nint = intervals.count()

    i = 0
    with open(interval_file) as f:
        for line in f:
            if len(line.strip()) != 0:
                i += 1

    self.assertEqual(nint, i)
    self.assertEqual(intervals.interval.dtype.point_type, hl.tlocus('GRCh37'))
def _import_purcell_5k(path) -> hl.Table:
    p5k = hl.import_locus_intervals(path, reference_genome='GRCh37')
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    if not rg37.has_liftover('GRCh38'):
        rg37.add_liftover(
            'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)
    p5k = p5k.annotate(start=hl.liftover(p5k.interval.start, 'GRCh38'),
                       end=hl.liftover(p5k.interval.start, 'GRCh38'))
    p5k = p5k.filter((p5k.start.contig == 'chr' + p5k.interval.start.contig)
                     & (p5k.end.contig == 'chr' + p5k.interval.end.contig))
    p5k = p5k.key_by()
    p5k = p5k.select(locus=p5k.start, locus_b37=p5k.interval.start)
    return p5k.key_by('locus')
def filter_low_conf_regions(
        mt: hl.MatrixTable,
        filter_lcr: bool = True,
        filter_decoy: bool = True,
        filter_segdup: bool = True,
        high_conf_regions: Optional[List[str]] = None) -> hl.MatrixTable:
    """
    Filters low-confidence regions.

    :param MatrixTable mt: MT to filter
    :param bool filter_lcr: Whether to filter LCR regions
    :param bool filter_decoy: Whether to filter decoy regions
    :param bool filter_segdup: Whether to filter segdup regions
    :param list of str high_conf_regions: Paths to set of high-confidence regions to restrict to (union of regions)
    :return: MT with low-confidence regions removed
    :rtype: MatrixTable
    """
    from gnomad_hail.resources import lcr_intervals_path, decoy_intervals_path, segdup_intervals_path

    if filter_lcr:
        lcr = hl.import_locus_intervals(lcr_intervals_path)
        mt = mt.filter_rows(hl.is_defined(lcr[mt.locus]), keep=False)

    if filter_decoy:
        decoy = hl.import_bed(decoy_intervals_path)
        mt = mt.filter_rows(hl.is_defined(decoy[mt.locus]), keep=False)

    if filter_segdup:
        segdup = hl.import_bed(segdup_intervals_path)
        mt = mt.filter_rows(hl.is_defined(segdup[mt.locus]), keep=False)

    if high_conf_regions is not None:
        for region in high_conf_regions:
            region = hl.import_locus_intervals(region)
            mt = mt.filter_rows(hl.is_defined(region[mt.locus]), keep=True)

    return mt
def test_import_locus_intervals(self):
    interval_file = resource('annotinterall.interval_list')
    t = hl.import_locus_intervals(interval_file, reference_genome='GRCh37')
    nint = t.count()

    i = 0
    with open(interval_file) as f:
        for line in f:
            if len(line.strip()) != 0:
                i += 1

    self.assertEqual(nint, i)
    self.assertEqual(t.interval.dtype.point_type, hl.tlocus('GRCh37'))

    tmp_file = new_temp_file(prefix="test", suffix="interval_list")
    start = t.interval.start
    end = t.interval.end
    (t.key_by(interval=hl.locus_interval(start.contig, start.position,
                                         end.position, True, True))
     .select()
     .export(tmp_file, header=False))

    t2 = hl.import_locus_intervals(tmp_file)
    self.assertTrue(t.select()._same(t2))
def _import_purcell_5k(path) -> hl.Table:
    p5k = hl.import_locus_intervals(path, reference_genome="GRCh37")
    rg37 = hl.get_reference("GRCh37")
    rg38 = hl.get_reference("GRCh38")
    if not rg37.has_liftover("GRCh38"):
        rg37.add_liftover("references/grch37_to_grch38.over.chain.gz", rg38)
    p5k = p5k.annotate(
        start=hl.liftover(p5k.interval.start, "GRCh38"),
        end=hl.liftover(p5k.interval.start, "GRCh38"),
    )
    p5k = p5k.filter((p5k.start.contig == "chr" + p5k.interval.start.contig)
                     & (p5k.end.contig == "chr" + p5k.interval.end.contig))
    p5k = p5k.key_by()
    p5k = p5k.select(locus=p5k.start, locus_b37=p5k.interval.start)
    return p5k.key_by("locus")
def test_skat(self):
    ds2 = hl.import_vcf(resource('sample2.vcf'))

    covariatesSkat = (hl.import_table(resource("skat.cov"), impute=True)
                      .key_by("Sample"))

    phenotypesSkat = (hl.import_table(resource("skat.pheno"),
                                      types={"Pheno": hl.tfloat64},
                                      missing="0")
                      .key_by("Sample"))

    intervalsSkat = hl.import_locus_intervals(resource("skat.interval_list"))

    weightsSkat = (hl.import_table(resource("skat.weights"),
                                   types={"locus": hl.tlocus(),
                                          "weight": hl.tfloat64})
                   .key_by("locus"))

    ds = hl.split_multi_hts(ds2)
    ds = ds.annotate_rows(gene=intervalsSkat[ds.locus],
                          weight=weightsSkat[ds.locus].weight)
    ds = ds.annotate_cols(pheno=phenotypesSkat[ds.s].Pheno,
                          cov=covariatesSkat[ds.s])
    ds = ds.annotate_cols(pheno=hl.cond(ds.pheno == 1.0,
                                        False,
                                        hl.cond(ds.pheno == 2.0,
                                                True,
                                                hl.null(hl.tbool))))

    hl.skat(ds,
            key_expr=ds.gene,
            weight_expr=ds.weight,
            y=ds.pheno,
            x=ds.GT.n_alt_alleles(),
            covariates=[ds.cov.Cov1, ds.cov.Cov2],
            logistic=False).count()

    hl.skat(ds,
            key_expr=ds.gene,
            weight_expr=ds.weight,
            y=ds.pheno,
            x=hl.pl_dosage(ds.PL),
            covariates=[ds.cov.Cov1, ds.cov.Cov2],
            logistic=True).count()
def main(args):
    hl.init(log='/platform_pca.log')

    if not args.skip_prepare_data_for_platform_pca:
        # ~1 hour on 800 cores (3/8/18)
        logger.info('Preparing data for platform PCA...')
        mt = get_gnomad_data('exomes', adj=True, raw=False, meta_root=None,
                             fam_root=None, split=False)
        mt = filter_to_autosomes(mt)
        intervals = hl.import_locus_intervals(evaluation_intervals_path)
        mt = mt.annotate_rows(interval=intervals[mt.locus].target)
        mt = mt.filter_rows(hl.is_defined(mt.interval) & (hl.len(mt.alleles) == 2))
        mt = mt.select_entries(GT=hl.or_missing(hl.is_defined(mt.GT), hl.struct()))
        callrate_mt = mt.group_rows_by(mt.interval).aggregate(
            callrate=hl.agg.fraction(hl.is_defined(mt.GT)))
        callrate_mt.write(exome_callrate_mt_path, args.overwrite)

    if not args.skip_run_platform_pca:
        logger.info('Running platform PCA...')
        qc_ht = hl.read_table(qc_ht_path('exomes', 'hard_filters')).key_by('s')
        callrate_mt = hl.read_matrix_table(exome_callrate_mt_path)
        callrate_mt = callrate_mt.filter_cols(
            hl.len(qc_ht[callrate_mt.col_key].hard_filters) == 0)
        callrate_mt = callrate_mt.annotate_entries(
            callrate=hl.int(callrate_mt.callrate > 0.25))
        # Center until Hail's PCA does it for you
        callrate_mt = callrate_mt.annotate_rows(
            mean_callrate=hl.agg.mean(callrate_mt.callrate))
        callrate_mt = callrate_mt.annotate_entries(
            callrate=callrate_mt.callrate - callrate_mt.mean_callrate)
        eigenvalues, scores, _ = hl.pca(callrate_mt.callrate, compute_loadings=False)
        logger.info('Eigenvalues: {}'.format(eigenvalues))
        # [731282566.2824697, 78687228.90071851, 43837650.51729764, 33969298.61827205,
        #  26308703.539534636, 21102437.512725923, 16949828.555817757,
        #  12994894.187041137, 8372332.274295175, 8128326.814388647]
        scores.write(exome_callrate_scores_ht_path)

    logger.info('Annotating with platform PCs and known platform annotations...')
    scores = hl.read_table(exome_callrate_scores_ht_path).annotate(data_type='exomes')
    if args.pc_scores_in_separate_fields:
        scores = scores.transmute(scores=[
            scores[ann] for ann in sorted(
                [ann for ann in scores.row if ann.startswith("PC")],
                key=lambda x: int(x[2:])
            )
        ])
    platform_pcs = assign_platform_pcs(scores)
    platform_pcs.write(qc_ht_path('exomes', 'platforms'), overwrite=args.overwrite)
def write_truth_concordance(data_type: str, truth_sample: str, overwrite: bool) -> None:
    sample_mapping = {
        'NA12878': {
            'exomes': 'C1975::NA12878',
            'genomes': 'G94982_NA12878'
        },
        'syndip': {
            'exomes': 'CHMI_CHMI3_Nex1',
            'genomes': 'CHMI_CHMI3_WGS1'
        }
    }

    mt = get_qc_samples_filtered_gnomad_data(data_type, autosomes_only=False)
    mt = mt.filter_cols(mt.s == sample_mapping[truth_sample][data_type])
    mt = mt.annotate_entries(GT=unphase_call_expr(mt.GT))
    mt = mt.key_cols_by(s=hl.str(truth_sample))
    mt = mt.repartition(1000 if data_type == 'genomes' else 100, shuffle=False)

    truth_mt = hl.read_matrix_table(
        NA12878_mt_path() if truth_sample == 'NA12878' else syndip_mt_path())
    truth_mt = truth_mt.key_cols_by(s=hl.str(truth_sample))
    if data_type == 'exomes':
        exome_calling_intervals = hl.import_locus_intervals(
            exome_calling_intervals_path, skip_invalid_intervals=True)
        truth_mt = truth_mt.filter_rows(
            hl.is_defined(exome_calling_intervals[truth_mt.locus]))
    truth_mt = hl.split_multi_hts(truth_mt, left_aligned=False)
    truth_mt = truth_mt.annotate_entries(GT=unphase_call_expr(truth_mt.GT))

    sample_concordance_ht, sites_concordance_ht = compute_concordance(
        mt, truth_mt, name=truth_sample)
    sites_concordance_ht.write(
        annotations_ht_path(data_type, f'{truth_sample}_concordance'),
        overwrite=overwrite)
    sample_concordance_ht.write(
        sample_annotations_table_path(data_type, f'{truth_sample}_concordance'),
        overwrite=overwrite)
def hailthread(cond1, q, cond2, qcm, inputDir, outputDir, qaws_size):
    #Load id_conversion file
    #table_idconv=hl.import_table('id_conversion')
    #Load markers files
    #table_makers_pos=hl.import_table('800k_to_extract_indexed2.txt',delimiter=':',no_header=True,impute=True)
    #table_markers_all=hl.import_table('800k_to_extract_indexed_alleles_gt2.txt',delimiter=':',no_header=True,impute=True)
    #cut -f 1 -d',' 800k_to_extract_indexed2.txt > interval_table
    #awk -F':' '{print $1"\t"$2"\t"$2}' interval_table > interval_table2
    hl.init()

    cond1.acquire()
    while not an_item_is_available(q):
        #print("Thread hail to sleep")
        #time.sleep(300)
        print("Thread hail to wait")
        cond1.wait()
    file = get_an_available_item(q)
    print("Thread hail get item " + file)
    qaws_size = qaws_size - 1
    cond1.release()

    interval_table = hl.import_locus_intervals('interval_table2',
                                               reference_genome='GRCh38')

    while file != "END":
        fileParts = file.split("/")[-1]
        fileName = fileParts.replace(".vcf.gz", "").replace(".gvcf.gz", "")
        chrName = fileName.split("_")[-3]
        #myFNAL=fileName.split("\\.")
        #myTempId=myFNAL[0]
        #Load gVCF file
        #data=hl.import_vcf("/mnt/vol1/java/gel_test.vcf",force_bgz=True,reference_genome='GRCh38')
        #data=hl.import_vcf("/mnt/vol1/java/gel_mainProgramme_aggV2_chr10_129040437_131178399.vcf.gz",force_bgz=True,reference_genome='GRCh38')
        try:
            #Extract INFO fields
            data = hl.import_vcf(inputDir + "/" + fileParts,
                                 force_bgz=True,
                                 reference_genome='GRCh38',
                                 drop_samples=True)
            #Filters PASS
            if chrName != "chrY":
                data = data.filter_rows(data.filters.size() > 0, keep=False)
            #Multiallelic
            data = hl.split_multi_hts(data)
            #Join with markers
            data_filtered = data.filter_rows(
                hl.is_defined(interval_table[data.locus]))
            data_sr = data_filtered.select_rows(
                data_filtered.info.medianDepthAll,
                data_filtered.info.medianDepthNonMiss,
                data_filtered.info.medianGQ,
                data_filtered.info.missingness,
                data_filtered.info.completeGTRatio,
                data_filtered.info.ABratio,
                data_filtered.info.MendelSite,
                data_filtered.info.AN,
                data_filtered.info.AC,
                data_filtered.info.AC_Hom,
                data_filtered.info.AC_Het)
            ht = data_sr.make_table()
            ht.export(outputDir + "/" + fileName + "_INFO.tsv")
            os.system("sed -i 's/\[//g' " + outputDir + "/" + fileName + "_INFO.tsv")
            os.system("sed -i 's/]//g' " + outputDir + "/" + fileName + "_INFO.tsv")
            os.system("cat " + outputDir + "/" + fileName + "_INFO.tsv | grep -v locus " +
                      " >> " + outputDir + "/INFO_" + chrName)
            os.system("rm " + inputDir + "/" + fileParts)

            cond2.acquire()
            print("Thread hail make item available " + fileName)
            make_an_item_available(qcm, file)
            cond2.notify_all()
            cond2.release()
        except FatalError as e:
            print("Exception2 in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)
        except AssertionError as e:
            print("Exception3 in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)
        except Exception as e:
            print("Exception in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)
            #raise Exception

        cond1.acquire()
        while not an_item_is_available(q):
            #print("Thread hail to sleep")
            #time.sleep(300)
            print("Thread hail to wait")
            cond1.wait()
        file = get_an_available_item(q)
        print("Thread hail get item " + file)
        qaws_size = qaws_size - 1
        cond1.release()

    time.sleep(300)
    cond2.acquire()
    print("Thread hail make END available")
    make_an_item_available(qcm, "END")
    cond2.notify_all()
    cond2.release()
def write_omes_concordance(data_type: str, dup_version: str, by_platform: bool,
                           overwrite: bool) -> None:
    other_data_type = 'exomes' if data_type == 'genomes' else 'genomes'

    mt = get_qc_samples_filtered_gnomad_data(data_type)
    other_mt = get_qc_samples_filtered_gnomad_data(other_data_type)

    dup_ht = hl.import_table(
        genomes_exomes_duplicate_ids_tsv_path(dup_version), impute=True)
    dup_ht = dup_ht.filter(dup_ht.dup_pair_rank == 0)

    # Unify sample names based on inferred duplicates (from pc_relate)
    mt = mt.filter_cols(hl.is_defined(dup_ht.key_by(f'{data_type}_s')[mt.s]))
    mt = mt.annotate_entries(GT=unphase_call_expr(mt.GT))
    other_mt = other_mt.key_cols_by(
        s=dup_ht.key_by(f'{other_data_type}_s')[other_mt.s][f'{data_type}_s'])
    other_mt = other_mt.filter_cols(hl.is_defined(other_mt.s))
    other_mt = other_mt.annotate_entries(GT=unphase_call_expr(other_mt.GT))

    exome_calling_intervals = hl.import_locus_intervals(
        exome_calling_intervals_path, skip_invalid_intervals=True)
    mt = mt.filter_rows(hl.is_defined(exome_calling_intervals[mt.locus]))
    other_mt = other_mt.filter_rows(
        hl.is_defined(exome_calling_intervals[other_mt.locus]))

    if by_platform:
        omes_conc = hl.read_table(
            annotations_ht_path(data_type, "omes_concordance"))
        omes_conc = omes_conc.transmute(concordance=[
            hl.struct(concordance_matrix=omes_conc.concordance,
                      n_discordant=omes_conc.n_discordant,
                      meta={'data_type': 'omes', 'platform': 'all'})
        ])
        platforms = mt.aggregate_cols(
            hl.agg.collect_as_set(mt.meta.qc_platform))
        logger.info(
            "Computing concordance by platform for platforms: {}".format(
                ",".join([str(x) for x in platforms])))

        for platform in platforms:
            plat_mt = mt.filter_cols(mt.meta.qc_platform == platform)
            _, sites_concordance_ht = compute_concordance(
                plat_mt, other_mt, name=f'omes (platform: {platform})')
            omes_conc = omes_conc.annotate(
                concordance=omes_conc.concordance.append(
                    hl.struct(
                        concordance_matrix=sites_concordance_ht[
                            omes_conc.key].concordance,
                        n_discordant=sites_concordance_ht[
                            omes_conc.key].n_discordant,
                        meta={'data_type': 'omes',
                              'platform': hl.str(platform)})))

        omes_conc.write(
            annotations_ht_path(data_type, "omes_by_platform_concordance"),
            overwrite=overwrite)
    else:
        sample_concordance_ht, sites_concordance_ht = compute_concordance(
            mt, other_mt, name='omes')
        sites_concordance_ht.write(
            annotations_ht_path(data_type, "omes_concordance"),
            overwrite=overwrite)
        sample_concordance_ht.write(
            sample_annotations_table_path(data_type, "omes_concordance"),
            overwrite=overwrite)
def create_binned_concordance(data_type: str, truth_sample: str, metric: str,
                              nbins: int, overwrite: bool) -> None:
    """
    Creates and writes a concordance table binned by rank (both absolute and
    relative) for a given data type, truth sample and metric.

    :param str data_type: One of 'exomes' or 'genomes'
    :param str truth_sample: Which truth sample concordance to load
    :param str metric: One of the evaluation metrics (or a RF hash)
    :param int nbins: Number of bins for the rank
    :param bool overwrite: Whether to overwrite existing table
    :return: Nothing -- just writes the table
    :rtype: None
    """

    if hl.hadoop_exists(
            binned_concordance_path(data_type, truth_sample, metric) +
            '/_SUCCESS') and not overwrite:
        logger.warn(
            f"Skipping binned concordance creation as {binned_concordance_path(data_type, truth_sample, metric)} exists and overwrite=False"
        )
    else:
        ht = hl.read_table(
            annotations_ht_path(data_type, f'{truth_sample}_concordance'))
        # Remove 1bp indels for syndip as these cannot be trusted
        if truth_sample == 'syndip':
            ht = ht.filter(
                hl.is_indel(ht.alleles[0], ht.alleles[1]) &
                (hl.abs(hl.len(ht.alleles[0]) - hl.len(ht.alleles[1])) == 1),
                keep=False)
            high_conf_intervals = hl.import_locus_intervals(
                syndip_high_conf_regions_bed_path)
        else:
            high_conf_intervals = hl.import_locus_intervals(
                NA12878_high_conf_regions_bed_path)

        lcr = hl.import_locus_intervals(lcr_intervals_path)
        segdup = hl.import_locus_intervals(segdup_intervals_path)
        ht = ht.filter(
            hl.is_defined(high_conf_intervals[ht.locus]) &
            hl.is_missing(lcr[ht.locus]) &
            hl.is_missing(segdup[ht.locus]))

        if metric in ['vqsr', 'rf_2.0.2', 'rf_2.0.2_beta', 'cnn']:
            metric_ht = hl.read_table(score_ranking_path(data_type, metric))
        else:
            metric_ht = hl.read_table(
                rf_path(data_type, 'rf_result', run_hash=metric))

        metric_snvs, metrics_indels = metric_ht.aggregate([
            hl.agg.count_where(
                hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1])),
            hl.agg.count_where(
                ~hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1]))
        ])

        snvs, indels = ht.aggregate([
            hl.agg.count_where(hl.is_snp(ht.alleles[0], ht.alleles[1])),
            hl.agg.count_where(~hl.is_snp(ht.alleles[0], ht.alleles[1]))
        ])

        ht = ht.annotate_globals(
            global_counts=hl.struct(snvs=metric_snvs, indels=metrics_indels),
            counts=hl.struct(snvs=snvs, indels=indels))

        ht = ht.annotate(
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            score=metric_ht[ht.key].score,
            global_rank=metric_ht[ht.key].rank,
            # TP => allele is found in both data sets
            n_tp=ht.concordance[3][3] + ht.concordance[3][4] +
            ht.concordance[4][3] + ht.concordance[4][4],
            # FP => allele is found only in test data set
            n_fp=hl.sum(ht.concordance[3][:2]) + hl.sum(ht.concordance[4][:2]),
            # FN => allele is found only in truth data set
            n_fn=hl.sum(ht.concordance[:2].map(lambda x: x[3] + x[4])))

        ht = add_rank(ht, -1.0 * ht.score)

        ht = ht.annotate(rank=[
            hl.tuple([
                'global_rank',
                (ht.global_rank + 1) / hl.cond(
                    ht.snv, ht.globals.global_counts.snvs,
                    ht.globals.global_counts.indels)
            ]),
            hl.tuple([
                'truth_sample_rank',
                (ht.rank + 1) / hl.cond(
                    ht.snv, ht.globals.counts.snvs, ht.globals.counts.indels)
            ])
        ])

        ht = ht.explode(ht.rank)
        ht = ht.annotate(rank_name=ht.rank[0], bin=hl.int(ht.rank[1] * nbins))

        ht = ht.group_by('rank_name', 'snv', 'bin').aggregate(
            # Look at site-level metrics -> tp > fp > fn -- only important
            # for multi-sample comparisons
            tp=hl.agg.count_where(ht.n_tp > 0),
            fp=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp > 0)),
            fn=hl.agg.count_where(
                (ht.n_tp == 0) & (ht.n_fp == 0) & (ht.n_fn > 0)),
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n_alleles=hl.agg.count()).repartition(5)

        ht.write(binned_concordance_path(data_type, truth_sample, metric),
                 overwrite=overwrite)
ds = ds.annotate_cols(is_case=True,
                      pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                      is_female=hl.rand_bool(0.5),
                                      age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
                      cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                      cov1=hl.rand_norm(0, 1),
                      cov2=hl.rand_norm(0, 1),
                      cohort="SIGMA")
ds = ds.annotate_globals(global_field_1=5,
                         global_field_2=10,
                         pli={'SCN1A': 0.999, 'SONIC': 0.014},
                         populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
ds = ds.annotate_rows(gene=['TTN'])
ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
ds.write('data/example.vds', overwrite=True)

lmmreg_ds = hl.variant_qc(hl.split_multi_hts(hl.import_vcf('data/sample.vcf.bgz')))
lmmreg_tsv = hl.import_table('data/example_lmmreg.tsv', 'Sample', impute=True)
lmmreg_ds = lmmreg_ds.annotate_cols(**lmmreg_tsv[lmmreg_ds['s']])
lmmreg_ds = lmmreg_ds.annotate_rows(use_in_kinship=lmmreg_ds.variant_qc.AF[1] > 0.05)
lmmreg_ds.write('data/example_lmmreg.vds', overwrite=True)

burden_ds = hl.import_vcf('data/example_burden.vcf')
burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
burden_ds = hl.variant_qc(burden_ds)
genekt = hl.import_locus_intervals('data/gene.interval_list')
burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
burden_ds.write('data/example_burden.vds', overwrite=True)
def get_lcr_intervals() -> hl.Table:
    return hl.import_locus_intervals(
        'gs://gnomad-public/resources/grch38/LCRFromHengHg38.txt',
        reference_genome='GRCh38',
        skip_invalid_intervals=True)
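# The pattern shared by most snippets here: indexing an interval-keyed Table
# with a locus yields a missing value when the locus falls outside every
# interval, so hl.is_defined/hl.is_missing act as membership tests. A sketch
# (the MatrixTable path is hypothetical):
lcr_ht = get_lcr_intervals()
mt = hl.read_matrix_table('gs://my-bucket/variants.mt')
mt = mt.filter_rows(hl.is_missing(lcr_ht[mt.locus]))  # keep rows outside LCRs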
pprint(n_6_multi)

# 02_prefilter_variants.py
INITIAL_VARIANT_LIST = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/02_prefilter.keep.variant_list'
INITIAL_VARIANT_QC_FILE = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/02_prefilter_metrics.tsv'

# Read in the target intervals
TARGET_INTERVALS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/inputs/ice_coding_v1_targets.interval_list'
# Read in the padded target intervals (50bp padding)
PADDED_TARGET_INTERVALS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/inputs/ice_coding_v1_padded_targets.interval_list'
# Low complexity regions in the data.
LCRs = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/inputs/LCR-hs38.bed'

# Import the interval lists for the target intervals.
target_intervals = hl.import_locus_intervals(TARGET_INTERVALS, reference_genome='GRCh38')
# Import the interval lists for the padded target intervals.
padded_target_intervals = hl.import_locus_intervals(PADDED_TARGET_INTERVALS, reference_genome='GRCh38')
# Import the interval lists for the LCRs.
LCR_intervals = hl.import_locus_intervals(LCRs, reference_genome='GRCh38')

# Annotate variants with flag indicating if they are in LCR or failed VQSR.
mt = mt.annotate_rows(fail_VQSR=hl.len(mt.filters) != 0)
mt = mt.annotate_rows(in_LCR=hl.is_defined(LCR_intervals[mt.locus]))
mt = mt.annotate_rows(not_in_target_intervals=~hl.is_defined(target_intervals[mt.locus]))
mt = mt.annotate_rows(not_in_padded_target_intervals=~hl.is_defined(padded_target_intervals[mt.locus]))

# Get information about the number of variants that were excluded.
fail_VQSR = mt.filter_rows(mt.fail_VQSR).count_rows()
in_LCR = mt.filter_rows(mt.in_LCR).count_rows()
not_in_target_intervals = mt.filter_rows(mt.not_in_target_intervals).count_rows()
def test_import_locus_intervals_no_reference_specified(self):
    interval_file = resource('annotinterall.interval_list')
    t = hl.import_locus_intervals(interval_file, reference_genome=None)
    self.assertTrue(t.count() == 2)
    self.assertEqual(t.interval.dtype.point_type,
                     hl.tstruct(contig=hl.tstr, position=hl.tint32))
def create_binned_data(ht: hl.Table, data: str, data_type: str, n_bins: int) -> hl.Table:
    """
    Creates binned data from a rank Table grouped by rank_id (rank, biallelic,
    etc.), contig, snv, bi_allelic and singleton containing the information
    needed for evaluation plots.

    :param Table ht: Input rank table
    :param str data: Which data/run hash is being created
    :param str data_type: One of 'exomes' or 'genomes'
    :param int n_bins: Number of bins.
    :return: Binned Table
    :rtype: Table
    """

    # Count variants for ranking
    count_expr = {
        x: hl.agg.filter(
            hl.is_defined(ht[x]),
            hl.agg.counter(
                hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')))
        for x in ht.row if x.endswith('rank')
    }
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(
        f"Found the following variant counts:\n {pformat(rank_variant_counts)}")
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)

    # Load external evaluation data
    clinvar_ht = hl.read_table(clinvar_ht_path)
    denovo_ht = get_validated_denovos_ht()
    if data_type == 'exomes':
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_exomes.high_quality)
    else:
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_genomes.high_quality)
    denovo_ht = denovo_ht.select(
        validated_denovo=denovo_ht.validated,
        high_confidence_denovo=denovo_ht.Confidence == 'HIGH')
    ht_truth_data = hl.read_table(annotations_ht_path(data_type, 'truth_data'))
    fam_ht = hl.read_table(annotations_ht_path(data_type, 'family_stats'))
    fam_ht = fam_ht.select(family_stats=fam_ht.family_stats[0])
    gnomad_ht = get_gnomad_data(data_type).rows()
    gnomad_ht = gnomad_ht.select(
        vqsr_negative_train_site=gnomad_ht.info.NEGATIVE_TRAIN_SITE,
        vqsr_positive_train_site=gnomad_ht.info.POSITIVE_TRAIN_SITE,
        fail_hard_filters=(gnomad_ht.info.QD < 2) | (gnomad_ht.info.FS > 60) |
        (gnomad_ht.info.MQ < 30))
    lcr_intervals = hl.import_locus_intervals(lcr_intervals_path)

    ht = ht.annotate(
        **ht_truth_data[ht.key],
        **fam_ht[ht.key],
        **gnomad_ht[ht.key],
        **denovo_ht[ht.key],
        clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length() - ht.alleles[1].length()),
        rank_bins=hl.array([
            hl.Struct(
                rank_id=rank_name,
                bin=hl.int(
                    hl.ceil(
                        hl.float(ht[rank_name] + 1) / hl.floor(
                            ht.globals.rank_variant_counts[rank_name][hl.cond(
                                hl.is_snp(ht.alleles[0], ht.alleles[1]),
                                'snv', 'indel')] / n_bins))))
            for rank_name in rank_variant_counts
        ]),
        lcr=hl.is_defined(lcr_intervals[ht.locus]))

    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(rank_id=ht.rank_bins.rank_id, bin=ht.rank_bins.bin)
    ht = ht.filter(hl.is_defined(ht.bin))

    ht = ht.checkpoint(
        f'gs://gnomad-tmp/gnomad_score_binning_{data_type}_tmp_{data}.ht',
        overwrite=True)

    # Create binned data
    return (
        ht.group_by(
            rank_id=ht.rank_id,
            contig=ht.locus.contig,
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            bi_allelic=hl.is_defined(ht.biallelic_rank),
            singleton=ht.singleton,
            release_adj=ht.ac > 0,
            bin=ht.bin)._set_buffer_size(20000).aggregate(
                min_score=hl.agg.min(ht.score),
                max_score=hl.agg.max(ht.score),
                n=hl.agg.count(),
                n_ins=hl.agg.count_where(
                    hl.is_insertion(ht.alleles[0], ht.alleles[1])),
                n_del=hl.agg.count_where(
                    hl.is_deletion(ht.alleles[0], ht.alleles[1])),
                n_ti=hl.agg.count_where(
                    hl.is_transition(ht.alleles[0], ht.alleles[1])),
                n_tv=hl.agg.count_where(
                    hl.is_transversion(ht.alleles[0], ht.alleles[1])),
                n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
                n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
                n_clinvar=hl.agg.count_where(ht.clinvar),
                n_singleton=hl.agg.count_where(ht.singleton),
                n_validated_de_novos=hl.agg.count_where(ht.validated_denovo),
                n_high_confidence_de_novos=hl.agg.count_where(
                    ht.high_confidence_denovo),
                n_de_novo=hl.agg.filter(
                    ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                    hl.agg.sum(ht.family_stats.mendel.errors)),
                n_de_novo_no_lcr=hl.agg.filter(
                    ~ht.lcr &
                    (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                    hl.agg.sum(ht.family_stats.mendel.errors)),
                n_de_novo_sites=hl.agg.filter(
                    ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                    hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
                n_de_novo_sites_no_lcr=hl.agg.filter(
                    ~ht.lcr &
                    (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                    hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
                n_trans_singletons=hl.agg.filter(
                    (ht.info_ac < 3) &
                    (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                    hl.agg.sum(ht.family_stats.tdt.t)),
                n_untrans_singletons=hl.agg.filter(
                    (ht.info_ac < 3) &
                    (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                    hl.agg.sum(ht.family_stats.tdt.u)),
                n_train_trans_singletons=hl.agg.count_where(
                    (ht.family_stats.unrelated_qc_callstats.AC[1] == 1) &
                    (ht.family_stats.tdt.t == 1)),
                n_omni=hl.agg.count_where(ht.truth_data.omni),
                n_mills=hl.agg.count_where(ht.truth_data.mills),
                n_hapmap=hl.agg.count_where(ht.truth_data.hapmap),
                n_kgp_high_conf_snvs=hl.agg.count_where(
                    ht.truth_data.kgp_high_conf_snvs),
                fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
                n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
                n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)))
def test_import_locus_intervals_no_reference_specified(self):
    interval_file = resource('annotinterall.interval_list')
    t = hl.import_locus_intervals(interval_file, reference_genome=None)
    self.assertEqual(t.interval.dtype.point_type,
                     hl.tstruct(contig=hl.tstr, position=hl.tint32))
def generate_datasets(doctest_namespace):
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={'SCN1A': 0.999, 'SONIC': 0.014},
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint(f'output/example.mt', overwrite=True)
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'output/example_burden.vds', overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace['ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
logreg_pop_file = 'gs://ccdg-qc-multi/data/IBD/logreg_results_'
samples_file = 'gs://ccdg-qc-multi/data/IBD/samples_file.tsv'
plink_file = 'gs://ccdg-qc-multi/data/IBD/ibdvars'

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# read data
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

vds = hl.read_matrix_table(vds_splitmulti_file)
vds_immune = hl.read_matrix_table(vds_immune_file)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# filter variants
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

ibd_vars = hl.import_locus_intervals(ibdvars_file)
intervallist = [x.interval for x in ibd_vars.collect()]
manuelvars = hl.import_locus_intervals(manuelvars_file)
intervallist = intervallist + [x.interval for x in manuelvars.collect()]

vds = hl.filter_intervals(vds, intervallist, keep=True)
##vds = vds.filter_rows(vds.qcpass)

vds_immune = hl.filter_intervals(vds_immune, intervallist, keep=True)
vds_immune = hl.split_multi_hts(
    vds_immune.select_entries(vds_immune.GT, vds_immune.AD, vds_immune.DP,
                              vds_immune.GQ, vds_immune.PL))

vds_combined = vds_immune.union_cols(vds)
vds_combined = vds_combined.naive_coalesce(1000)
vds_combined.write(filtered_vds_combined_file, overwrite=True)
MT_HARDCALLS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/bipolar_wes_dalio_W1_W2/filterGT.hardcalls.mt'

# Read in the hard calls matrix table.
mt = hl.read_matrix_table(MT_HARDCALLS)

# Read in the target intervals
TARGET_INTERVALS = 'gs://dalio_bipolar_w1_w2_hail_02/whole_exome_illumina_coding_v1.Homo_sapiens_assembly19.targets.fixed.interval_list'
# Low complexity regions in the data.
LCRs = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/inputs/inputs_low_complexity_regions_b37.interval_list'

INITIAL_VARIANT_QC_FILE = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/02_prefilter_metrics_b37_callset.tsv'
INITIAL_VARIANT_LIST = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/02_prefilter_b37_callset.keep.variant_list'
INITIAL_VARIANT_AUTO_LIST = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/02_prefilter_b37_callset.keep.autosome.variant_list'

# Import the interval lists for the LCRs.
target_intervals = hl.import_locus_intervals(TARGET_INTERVALS)
LCR_intervals = hl.import_locus_intervals(LCRs)

# Annotate variants with flag indicating if they are in LCR or failed VQSR.
mt = mt.annotate_rows(fail_VQSR=hl.len(mt.filters) != 0)
mt = mt.annotate_rows(in_LCR=hl.is_defined(LCR_intervals[mt.locus]))
mt = mt.annotate_rows(
    not_in_target_intervals=~hl.is_defined(target_intervals[mt.locus]))

# Get information about the number of variants that were excluded.
fail_VQSR = mt.filter_rows(mt.fail_VQSR).count_rows()
in_LCR = mt.filter_rows(mt.in_LCR).count_rows()
not_in_target_intervals = mt.filter_rows(
    mt.not_in_target_intervals).count_rows()

print('n variants failing VQSR:')
vds_1kg_file = 'gs://ccdg-qc-multi/data/1000genomes/vds/hail2_ALL.GRCh38.genotypes.20170504.vds'
mhc_chr8inv_file = 'gs://ccdg-qc-multi/data/MHC_invchr8_longLDreg_liftover_to_GRCh38.txt'
rel_exclusion_file = 'gs://ccdg-qc-multi/out/king/' + chrom + '/ibd_greater_0884_' + chrom + '.txt'
samples_to_keep_file = 'gs://ccdg-qc-multi/qc_measures/' + chrom + '/01_sample_qc_keep.txt'

# output
pca_value_file = 'gs://ccdg-qc-multi/qc_measures/pca/' + chrom + '/pca_values.tsv'
pca_score_file = 'gs://ccdg-qc-multi/qc_measures/pca/' + chrom + '/pca_scores.tsv'

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# read data
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

## interval list
#mhc_chr8inv = hl.import_table(mhc_chr8inv_file, no_header=True).key_by('f0')
mhc_chr8inv = hl.import_locus_intervals(mhc_chr8inv_file)

##
rel_exclusion = hl.import_table(rel_exclusion_file, no_header=True).key_by('f0')

vds = hl.read_matrix_table(vds_ldpruned_common_file)
onekg = hl.read_matrix_table(vds_1kg_file)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# rename samples
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print('annotate 1KG...')
onekg = onekg.annotate_cols(s='1KG_' + onekg.s)
HIGH_LD_INTERVALS = 'gs://raw_data_bipolar_dalio_w1_w2/inputs/b38_high_ld.bed'
INITIAL_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/03_initial_qc.keep.sample_list'
INITIAL_VARIANT_LIST = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/02_prefilter.keep.variant_list'

ht_initial_samples = hl.import_table(INITIAL_SAMPLES, no_header=True, key='f0')
ht_initial_variants = hl.import_table(
    INITIAL_VARIANT_LIST,
    types={'locus': hl.tlocus(reference_genome='GRCh38'),
           'alleles': hl.tarray(hl.tstr)})
ht_initial_variants = ht_initial_variants.key_by(ht_initial_variants.locus,
                                                 ht_initial_variants.alleles)

high_LD_intervals = hl.import_locus_intervals(HIGH_LD_INTERVALS,
                                              reference_genome='GRCh38')

mt = hl.read_matrix_table(MT_HARDCALLS)
mt = mt.filter_cols(hl.is_defined(ht_initial_samples[mt.col_key]))
mt = mt.annotate_rows(in_high_LD=hl.is_defined(high_LD_intervals[mt.locus]))
mt = mt.filter_rows(
    hl.is_defined(ht_initial_variants[mt.row_key]) & (~mt.in_high_LD))
mt = mt.filter_rows(mt.locus.in_x_nonpar() | mt.locus.in_autosome_or_par())

mt = hl.variant_qc(mt, name='qc')
mt = mt.filter_rows((mt.qc.AF[0] > 0.01) & (mt.qc.AF[0] < 0.99) &
                    ((mt.qc.call_rate > 0.98) | mt.locus.in_x_nonpar() |
                     mt.locus.in_x_par())).persist()

mt.count()