def test_annotate_intervals(self):
    ds = get_dataset()

    bed1 = hl.import_bed(resource('example1.bed'), reference_genome='GRCh37')
    bed2 = hl.import_bed(resource('example2.bed'), reference_genome='GRCh37')
    bed3 = hl.import_bed(resource('example3.bed'), reference_genome='GRCh37')
    self.assertTrue(list(bed2.key.dtype) == ['interval'])
    self.assertTrue(list(bed2.row.dtype) == ['interval', 'target'])

    interval_list1 = hl.import_locus_intervals(resource('exampleAnnotation1.interval_list'))
    interval_list2 = hl.import_locus_intervals(resource('exampleAnnotation2.interval_list'))
    self.assertTrue(list(interval_list2.key.dtype) == ['interval'])
    self.assertTrue(list(interval_list2.row.dtype) == ['interval', 'target'])

    ann = ds.annotate_rows(in_interval=bed1[ds.locus]).rows()
    self.assertTrue(ann.all((ann.locus.position <= 14000000) |
                            (ann.locus.position >= 17000000) |
                            (hl.is_missing(ann.in_interval))))

    for bed in [bed2, bed3]:
        ann = ds.annotate_rows(target=bed[ds.locus].target).rows()
        expr = (hl.case()
                .when(ann.locus.position <= 14000000, ann.target == 'gene1')
                .when(ann.locus.position >= 17000000, ann.target == 'gene2')
                .default(ann.target == hl.null(hl.tstr)))
        self.assertTrue(ann.all(expr))

    self.assertTrue(ds.annotate_rows(in_interval=interval_list1[ds.locus]).rows()
                    ._same(ds.annotate_rows(in_interval=bed1[ds.locus]).rows()))
    self.assertTrue(ds.annotate_rows(target=interval_list2[ds.locus].target).rows()
                    ._same(ds.annotate_rows(target=bed2[ds.locus].target).rows()))
def test_import_bed_badly_defined_intervals(self):
    bed_file = resource('example4.bed')
    t = hl.import_bed(bed_file, reference_genome='GRCh37', skip_invalid_intervals=True)
    self.assertTrue(t.count() == 3)

    t = hl.import_bed(bed_file, reference_genome=None, skip_invalid_intervals=True)
    self.assertTrue(t.count() == 4)
def get_cnt_matrix(mnv_table, region="ALL", dist=1, minimum_cnt=0, PASS=True, part_size=1000, hom=False):
    # mnv_table = Hail table of MNVs
    # region = BED file defining the regions of interest (e.g. enhancer regions)
    # dist = distance between the two SNPs
    # PASS=True: restrict to pairs where both variants are PASS
    # indels are not considered here

    # filter by region, if a BED file path is given as region
    if region != "ALL":
        bed = hl.import_bed(region)
        mnv_table = mnv_table.filter(hl.is_defined(bed[mnv_table.locus]))

    if PASS == "NO":  # exclusively keep pairs with at least one non-PASS variant
        mnv_table = mnv_table.filter((mnv_table.filters.length() > 0) | (mnv_table.prev_filters.length() > 0))
    elif PASS == True:
        mnv_table = mnv_table.filter((mnv_table.filters.length() == 0) & (mnv_table.prev_filters.length() == 0))

    if hom:
        mnv_table = mnv_table.filter(mnv_table.n_homhom > 0)

    # count MNV occurrences -- restricting to SNPs at the requested distance
    mnv = mnv_table.filter((mnv_table.alleles[0].length() == 1) &
                           (mnv_table.alleles[1].length() == 1) &
                           (mnv_table.prev_alleles[0].length() == 1) &
                           (mnv_table.prev_alleles[1].length() == 1) &
                           ((mnv_table.locus.position - mnv_table.prev_locus.position) == dist))  # filter to that specific distance

    # repartition to a proper size
    mnv = mnv.repartition(part_size)
    mnv_cnt = mnv.group_by("alleles", "prev_alleles").aggregate(cnt=agg.count())  # count occurrences
    mnv_cnt = mnv_cnt.annotate(
        refs=mnv_cnt.prev_alleles[0] + "N" * (dist - 1) + mnv_cnt.alleles[0])  # annotate combined refs
    mnv_cnt = mnv_cnt.annotate(
        alts=mnv_cnt.prev_alleles[1] + "N" * (dist - 1) + mnv_cnt.alleles[1])  # annotate combined alts
    if minimum_cnt > 0:
        mnv_cnt = mnv_cnt.filter(mnv_cnt.cnt > minimum_cnt)  # remove trivial ones
    return mnv_cnt.select("refs", "alts", "cnt")
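A minimal usage sketch for the function above (not from the original source): the table and BED paths are hypothetical placeholders, and `agg` is assumed to be Hail's aggregator module, as the snippet itself implies.

import hail as hl

agg = hl.agg  # the snippet above uses a bare `agg`; alias Hail's aggregator module

mnv_ht = hl.read_table('gs://my-bucket/mnv_table.ht')  # hypothetical MNV table
cnt_matrix = get_cnt_matrix(mnv_ht, region='gs://my-bucket/enhancers.bed',  # hypothetical BED
                            dist=1, PASS=True)
cnt_matrix.show()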
def get_cnt_matrix_alldist(mnv_table, region="ALL", dist_min=1, dist_max=10, minimum_cnt=0, PASS=True, part_size=1000):
    # same as get_cnt_matrix, but over a distance range instead of a single distance
    if region != "ALL":
        bed = hl.import_bed(region, skip_invalid_intervals=True)
        mnv_table = mnv_table.filter(hl.is_defined(bed[mnv_table.locus]))
    if PASS:
        mnv_table = mnv_table.filter((mnv_table.filters.length() == 0) & (mnv_table.prev_filters.length() == 0))

    # count MNV occurrences -- restricting to SNPs
    mnv_table = mnv_table.filter((mnv_table.alleles[0].length() == 1) &
                                 (mnv_table.alleles[1].length() == 1) &
                                 (mnv_table.prev_alleles[0].length() == 1) &
                                 (mnv_table.prev_alleles[1].length() == 1))
    pdall = {}
    for dist in range(dist_min, dist_max + 1):
        mnv = mnv_table.filter((mnv_table.locus.position - mnv_table.prev_locus.position) == dist)  # filter to that specific distance
        # repartition to a proper size
        mnv = mnv.repartition(part_size)
        mnv_cnt = mnv.group_by("alleles", "prev_alleles").aggregate(cnt=agg.count())  # count occurrences
        mnv_cnt = mnv_cnt.annotate(
            refs=mnv_cnt.prev_alleles[0] + "N" * (dist - 1) + mnv_cnt.alleles[0])  # annotate combined refs
        mnv_cnt = mnv_cnt.annotate(
            alts=mnv_cnt.prev_alleles[1] + "N" * (dist - 1) + mnv_cnt.alleles[1])  # annotate combined alts
        if minimum_cnt > 0:
            mnv_cnt = mnv_cnt.filter(mnv_cnt.cnt > minimum_cnt)  # remove trivial ones
        pdall[dist] = ht_cnt_mat_to_pd(mnv_cnt.select("refs", "alts", "cnt"))  # save as pandas DataFrame, keyed by distance
        print("done d={0}".format(dist))
        print(tm.ctime())
    return pdall  # returning a dictionary of DataFrames
def overlap_with_file(mt: hl.MatrixTable, bed) -> hl.MatrixTable:
    '''
    :param mt: a MatrixTable
    :param bed: the baits BED file with the coordinates on which to filter the MatrixTable
    :return: a MatrixTable restricted to variants overlapping the baits file
    '''
    baits = hl.import_bed(bed, reference_genome='GRCh38')
    overlapping_mt = mt.filter_rows(hl.is_defined(baits[mt.locus]))
    return overlapping_mt
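A short usage sketch for overlap_with_file (assumed call; both paths below are hypothetical placeholders).

import hail as hl

mt = hl.read_matrix_table('gs://my-bucket/cohort.mt')                    # hypothetical matrix table
on_target_mt = overlap_with_file(mt, 'gs://my-bucket/baits_GRCh38.bed')  # hypothetical baits BED
print(on_target_mt.count_rows())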
def get_telomeres_and_centromeres_ht(overwrite: bool = False) -> hl.Table:
    tc_interval = hl.import_bed(
        f'{nfs_dir}/resources/grch38/hg38.telomeresAndMergedCentromeres.bed',
        skip_invalid_intervals=True,
        min_partitions=10,
        reference_genome='GRCh38')
    return tc_interval.checkpoint(
        f'{nfs_dir}/resources/grch38/hg38.telomeresAndMergedCentromeres.ht',
        overwrite=overwrite,
        _read_if_exists=not overwrite)
def get_segdups_ht(overwrite: bool = False) -> hl.Table:
    segdup_interval = hl.import_bed(
        f'{nfs_dir}/resources/grch38/GRCh38_segdups.bed',
        skip_invalid_intervals=True,
        min_partitions=50,
        reference_genome='GRCh38')
    return segdup_interval.checkpoint(
        f'{nfs_dir}/resources/grch38/GRCh38_segdups.ht',
        overwrite=overwrite,
        _read_if_exists=not overwrite)
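A sketch of how these two resource tables are typically used downstream (assumed usage, not shown in the original): drop variants that fall in telomeres/centromeres or segmental duplications. The matrix table path is hypothetical and `nfs_dir` must already be defined.

import hail as hl

mt = hl.read_matrix_table('gs://my-bucket/cohort.mt')  # hypothetical
tc_ht = get_telomeres_and_centromeres_ht()
segdup_ht = get_segdups_ht()
mt = mt.filter_rows(hl.is_defined(tc_ht[mt.locus]), keep=False)      # drop telomeric/centromeric variants
mt = mt.filter_rows(hl.is_defined(segdup_ht[mt.locus]), keep=False)  # drop segdup variants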
def import_intervals_from_bed(bed_path: str,
                              platform_label: str,
                              genome_ref: str) -> hl.Table:
    """
    Handle importing BED files as intervals. Recode contigs if necessary and
    annotate global meta-info.

    Note: `platform_label` and `genome_ref` are required, since this info is
    used for the global annotations.

    :param bed_path: Path to capture interval BED file
    :param platform_label: Unique capture interval identifier (e.g. 'ssv3')
    :param genome_ref: Either 'GRCh37' or 'GRCh38'
    :return: HailTable keyed by interval
    """
    # genome references
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')

    # dict for contig recoding from rg38 -> rg37
    # (autosomes and sex chromosomes only)
    CONTIG_RECODING_HG38_TO_HG37 = {
        contig: contig.replace('chr', '')
        for contig in rg38.contigs[:24]
    }

    # dict for contig recoding from rg37 -> rg38
    # (autosomes and sex chromosomes only)
    CONTIG_RECODING_HG37_TO_HG38 = {
        CONTIG_RECODING_HG38_TO_HG37.get(k): k
        for k in CONTIG_RECODING_HG38_TO_HG37.keys()
    }

    # Recode contigs if the chromosome field in the BED file mismatches the genome reference.
    if genome_ref == 'GRCh37':
        contig_recoding = CONTIG_RECODING_HG38_TO_HG37
    elif genome_ref == 'GRCh38':
        contig_recoding = CONTIG_RECODING_HG37_TO_HG38
    else:
        contig_recoding = None

    ht_intervals = hl.import_bed(bed_path,
                                 reference_genome=genome_ref,
                                 contig_recoding=contig_recoding)

    global_ann_expr = dict(
        zip(GLOBAL_ANNOTATION_FIELDS,
            (current_date(), bed_path, genome_ref, platform_label)))

    ht_intervals = (ht_intervals
                    .annotate_globals(**global_ann_expr)
                    .key_by('interval')
                    .repartition(100))

    return ht_intervals
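A minimal usage sketch (the BED path and platform label are hypothetical; `GLOBAL_ANNOTATION_FIELDS` and `current_date()` are assumed to be defined in the surrounding module, as above).

ssv5_intervals = import_intervals_from_bed(
    bed_path='gs://my-bucket/capture/ssv5_regions.bed',  # hypothetical
    platform_label='ssv5',
    genome_ref='GRCh38')
ssv5_intervals.describe()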
def test_import_bed(self):
    bed_file = resource('example1.bed')
    bed = hl.import_bed(bed_file, reference_genome='GRCh37')

    nbed = bed.count()
    i = 0
    with open(bed_file) as f:
        for line in f:
            if len(line.strip()) != 0:
                try:
                    int(line.split()[0])
                    i += 1
                except:
                    pass
    self.assertEqual(nbed, i)

    self.assertEqual(bed.interval.dtype.point_type, hl.tlocus('GRCh37'))

    bed_file = resource('example2.bed')
    t = hl.import_bed(bed_file, reference_genome='GRCh37')
    self.assertEqual(t.interval.dtype.point_type, hl.tlocus('GRCh37'))
    self.assertTrue(list(t.key.dtype) == ['interval'])
    self.assertTrue(list(t.row.dtype) == ['interval', 'target'])
def filter_low_conf_regions(
        mt: hl.MatrixTable,
        filter_lcr: bool = True,
        filter_decoy: bool = True,
        filter_segdup: bool = True,
        high_conf_regions: Optional[List[str]] = None) -> hl.MatrixTable:
    """
    Filters low-confidence regions

    :param MatrixTable mt: MT to filter
    :param bool filter_lcr: Whether to filter LCR regions
    :param bool filter_decoy: Whether to filter decoy regions
    :param bool filter_segdup: Whether to filter segdup regions
    :param list of str high_conf_regions: Paths to a set of high-confidence regions to restrict to (union of regions)
    :return: MT with low-confidence regions removed
    :rtype: MatrixTable
    """
    from gnomad_hail.resources import lcr_intervals_path, decoy_intervals_path, segdup_intervals_path

    if filter_lcr:
        lcr = hl.import_locus_intervals(lcr_intervals_path)
        mt = mt.filter_rows(hl.is_defined(lcr[mt.locus]), keep=False)

    if filter_decoy:
        decoy = hl.import_bed(decoy_intervals_path)
        mt = mt.filter_rows(hl.is_defined(decoy[mt.locus]), keep=False)

    if filter_segdup:
        segdup = hl.import_bed(segdup_intervals_path)
        mt = mt.filter_rows(hl.is_defined(segdup[mt.locus]), keep=False)

    if high_conf_regions is not None:
        for region in high_conf_regions:
            region = hl.import_locus_intervals(region)
            mt = mt.filter_rows(hl.is_defined(region[mt.locus]), keep=True)

    return mt
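A short usage sketch (assumed call; the matrix table and high-confidence interval paths are hypothetical, and the gnomad_hail resource paths must resolve in your environment).

import hail as hl

mt = hl.read_matrix_table('gs://my-bucket/cohort.mt')  # hypothetical
mt = filter_low_conf_regions(
    mt,
    filter_lcr=True,
    filter_decoy=True,
    filter_segdup=True,
    high_conf_regions=['gs://my-bucket/giab_high_conf.interval_list'])  # hypothetical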
if __name__ == "__main__":
    # need to create the Spark cluster first before initialising Hail
    sc = pyspark.SparkContext()

    # Define the Hail persistent storage directory
    tmp_dir = "hdfs://spark-master:9820/"
    temp_dir = os.path.join(os.environ["HAIL_HOME"], "tmp")
    hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38")

    # s3 credentials are required for the user to access the datasets in the farm flexible compute s3 environment
    # you may use your own here from your .s3fg file in your home directory
    hadoop_config = sc._jsc.hadoopConfiguration()
    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    bed_to_exclude_pca = hl.import_bed(
        f"{temp_dir}/1000g/price_high_ld.bed.txt", reference_genome='GRCh38')
    cohorts_pop = hl.import_table(
        "s3a://DDD-ELGH-UKBB-exomes/ancestry/sanger_cohort_known_populations_ukbb.tsv",
        delimiter="\t").key_by('s')

    mt = hl.read_matrix_table(
        f"{temp_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_pca_scores.mt")
    # mt = mt.annotate_cols(
    #     loadings=pca_loadings[mt_vqc_filtered.col_key].loadings)
    # mt = mt.annotate_cols(known_pop="unk")
    # pca_scores = pca_scores.annotate(known_pop="unk")

    pca_scores = hl.read_table(
        f"{temp_dir}/ddd-elgh-ukbb/pca_scores_known_pop.ht")
    pca_loadings = hl.read_table(f"{temp_dir}/ddd-elgh-ukbb/pca_loadings.ht")

    logger.info("assign population pcs")
    # population_assignment_table = assign_population_pcs(
variant_list_file = 'gs://rcstorage/qced/' + chrom + '/qced_' + chrom + '_variant_list.txt'

# define output files
sample_qc_info_postqc_file = 'gs://rcstorage/qced/' + chrom + '/sample_qc_info_postqc_revisegt.txt'

print("importing vds files...")
vds = hl.read_matrix_table(vds_splitmulti_file)
num0 = vds.count()
print(num0)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# II. Remove LCR
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("removing lcr...")
lcr = hl.import_bed(lcr_file, reference_genome='GRCh38')
vds = vds.filter_rows(hl.is_defined(lcr[vds.locus]), keep=False)
num1 = vds.count()
print(num1)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# III. Annotate variants with PASS or FAIL
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("annotating variants...")
variant_list = hl.import_table(variant_list_file)
variant_list_post = variant_list.add_index()
variant_list_post = variant_list_post.key_by('idx')
vds_post = vds.add_row_index()
def test_import_bed_no_reference_specified(self):
    bed_file = resource('example1.bed')
    t = hl.import_bed(bed_file, reference_genome=None)
    self.assertTrue(t.count() == 3)
    self.assertEqual(t.interval.dtype.point_type,
                     hl.tstruct(contig=hl.tstr, position=hl.tint32))
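For context, a small sketch of what the test above checks (the BED path is hypothetical): with reference_genome=None, import_bed keys the table by generic (contig, position) intervals rather than GRCh37/GRCh38 loci.

import hail as hl

t = hl.import_bed('data/regions.bed', reference_genome=None)  # hypothetical BED file
print(t.interval.dtype)  # interval over struct{contig: str, position: int32}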
    'locus').distinct_by_row().key_rows_by('locus', 'alleles')

mt_split = hl.split_multi_hts(mt_annotated, keep_star=False, left_aligned=False)

mt = mt_split.annotate_rows(Variant_Type=hl.cond(
    (hl.is_snp(mt_split.alleles[0], mt_split.alleles[1])), "SNP",
    hl.cond(
        hl.is_insertion(mt_split.alleles[0], mt_split.alleles[1]), "INDEL",
        hl.cond(hl.is_deletion(mt_split.alleles[0], mt_split.alleles[1]),
                "INDEL", "Other"))))

mt = mt.checkpoint(
    f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-split-multi_cohorts.mt",
    overwrite=True)
print("Finished splitting and writing mt.")

agilent_table = hl.import_bed(agilent, reference_genome='GRCh38')
mt_agilent = mt.filter_rows(hl.is_defined(agilent_table[mt.locus]))

mt_agilent = hl.sample_qc(mt_agilent, name='sample_QC_Hail')
pandadf1 = mt_agilent.cols().flatten()
print("Outputting table of sample qc")
pandadf1.export(
    f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}_agilent_sampleQC.tsv.bgz",
    header=True)

mt = mt.checkpoint(
    f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-sampleqc-unfiltered_annotated.mt",
    overwrite=True)
    types={
        "Locus": "locus<GRCh38>",
        "VQSLOD": hl.tfloat64
    })
VQSLOD_indels = hl.import_table(
    f'{lustre_dir}/intervalwgs-qc/VQSLOD_indels.bgz',
    types={
        "Locus": "locus<GRCh38>",
        "VQSLOD": hl.tfloat64
    })
sample_QC_nonHail = hl.import_table(
    f'{lustre_dir}/intervalwgs-qc/INTERVAL_WGS_Sample_QC_04-09-2019.txt',
    impute=True)
centromere_table = hl.import_bed(
    f'{lustre_dir}/intervalwgs-qc/Centromere_region_UCSC_GRCh38.bed',
    reference_genome='GRCh38',
    min_partitions=250)

#####################################################################
######################   INPUT DATA   ##############################
#####################################################################
# Give the chromosome as input to the program with the chr prefix, i.e. chr1, chr2, chr3, etc.
CHROMOSOME = "chr1"
print(f"Reading {CHROMOSOME} mt")
mt = hl.read_matrix_table(f'{lustre_dir}/chr1.mt')
# mt = hl.read_matrix_table(f"{temp_dir}/matrixtables/{CHROMOSOME}.mt")
print("Splitting mt and writing out split mt")
mt_split = hl.split_multi_hts(mt, keep_star=False, left_aligned=False)
def get_baselevel_expression_for_genes(
        mt,
        gtex,
        gene_list=None,
        get_proportions=None,
        gene_maximums_ht_path=gtex_v7_gene_maximums_ht_path):

    gtex_table = gtex.key_by("transcript_id")

    if gene_list:
        genes = hl.literal(gene_list)

        # Filter context_ht to genes of interest
        mt = mt.annotate_rows(in_gene_of_interest=genes.find(
            lambda x: mt.vep.transcript_consequences.any(
                lambda tc: tc.gene_symbol == x)))
        mt = mt.filter_rows(mt.in_gene_of_interest != "NA")

    # Need to modify process consequences to ignore splice variants, because these can occur in intronic regions
    all_coding_minus_splice = list(
        set(all_coding_csqs) - set([
            'splice_acceptor_variant', 'splice_donor_variant',
            'splice_region_variant'
        ]))

    def add_most_severe_consequence_to_consequence_minus_splice(
            tc: hl.expr.StructExpression) -> hl.expr.StructExpression:
        """
        Copied from gnomad_hail with a slight change
        """
        csqs = hl.literal(all_coding_minus_splice)
        return tc.annotate(most_severe_consequence=csqs.find(
            lambda c: tc.consequence_terms.contains(c)))

    # Add worst consequence within transcript consequences
    mt = (mt.annotate_rows(vep=mt.vep.annotate(
        transcript_consequences=mt.vep.transcript_consequences.map(
            add_most_severe_consequence_to_consequence_minus_splice))))

    # Explode on transcript consequences
    mt = mt.explode_rows(mt.vep.transcript_consequences)
    mt_kt = mt.rows()

    # Filter to positions in the CDS regions
    cds_intervals = hl.import_bed(
        "gs://gnomad-public/papers/2019-tx-annotation/data/other_data/gencode.v19.CDS.Hail.021519.bed")
    mt_kt = mt_kt.annotate(in_cds=hl.is_defined(cds_intervals[mt_kt.locus]))
    mt_kt = mt_kt.filter(mt_kt.in_cds)

    # Filter to protein-coding transcripts only
    mt_kt = mt_kt.filter(
        mt_kt.vep.transcript_consequences.biotype == "protein_coding")

    # Filter to coding variants to only evaluate those effects
    mt_kt = filter_table_to_csqs(mt_kt, all_coding_minus_splice)

    # To avoid double counting transcripts at a given base, key by transcript and position and dedup
    mt_kt = mt_kt.key_by(mt_kt.locus,
                         mt_kt.vep.transcript_consequences.transcript_id)
    mt_kt = mt_kt.distinct()

    # Annotate mt with the GTEx values (i.e. join them)
    mt_kt = mt_kt.annotate(
        tx_data=gtex_table[mt_kt.vep.transcript_consequences.transcript_id])

    # Group by gene, symbol and position
    ht_sum_of_bases = mt_kt.group_by(
        locus=mt_kt.locus,
        ensg=mt_kt.vep.transcript_consequences.gene_id,
        symbol=mt_kt.vep.transcript_consequences.gene_symbol).aggregate(
            sum_per_base=hl.agg.array_sum(mt_kt.tx_data.agg_expression))

    tissue_ids = sorted([
        y.tissue.replace("-", "_").replace(" ", "_").replace("(", "_").replace(")", "_")
        for y in gtex.values.take(1)[0]
    ])
    d = {tiss: i for i, tiss in enumerate(tissue_ids)}

    ht_sum_of_bases = ht_sum_of_bases.annotate(**{
        tissue: ht_sum_of_bases.sum_per_base[d[tissue]]
        for tissue in tissue_ids
    })

    if get_proportions:
        gene_maximums_ht = hl.read_table(gene_maximums_ht_path)
        ht_sum_of_bases = ht_sum_of_bases.key_by(ht_sum_of_bases.locus)
        ht_sum_of_bases = ht_sum_of_bases.annotate(alleles="filler")
        ht_sum_of_bases = get_expression_proportion(
            tx_table=ht_sum_of_bases,
            tissues_to_filter=["sum_per_base"],
            gene_maximum_ht=gene_maximums_ht)
        ht_sum_of_bases = ht_sum_of_bases.key_by(ht_sum_of_bases.locus)
        ht_sum_of_bases = ht_sum_of_bases.drop(ht_sum_of_bases.alleles)

    return ht_sum_of_bases
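A minimal usage sketch (assumptions throughout): the context matrix table and GTEx table paths are hypothetical, and the helpers referenced above (filter_table_to_csqs, get_expression_proportion) plus the constants (all_coding_csqs, gtex_v7_gene_maximums_ht_path) are assumed to be available from the surrounding tx-annotation codebase.

import hail as hl

context_mt = hl.read_matrix_table('gs://my-bucket/context_vep.mt')  # hypothetical VEP-annotated context MT
gtex_ht = hl.read_table('gs://my-bucket/gtex_v7_tx_summary.ht')     # hypothetical GTEx summary table
base_level_ht = get_baselevel_expression_for_genes(
    context_mt, gtex_ht, gene_list=['TTN'], get_proportions=True)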
mt = mt_split.annotate_rows(
    Variant_Type=hl.cond((hl.is_snp(mt_split.alleles[0], mt_split.alleles[1])), "SNP",
                         hl.cond(
                             hl.is_insertion(mt_split.alleles[0], mt_split.alleles[1]),
                             "INDEL",
                             hl.cond(hl.is_deletion(mt_split.alleles[0], mt_split.alleles[1]),
                                     "INDEL", "Other"))))
mt = mt.checkpoint(
    f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-split-multi_cohorts.mt",
    overwrite=True)
print("Finished splitting and writing mt.")

intersection_table = hl.import_bed(
    intersection_bed, reference_genome='GRCh38')
union_table = hl.import_bed(union_bed, reference_genome='GRCh38')
mt_intersection = mt.filter_rows(
    hl.is_defined(intersection_table[mt.locus]))
mt_union = mt.filter_rows(hl.is_defined(union_table[mt.locus]))

mt_intersection = hl.sample_qc(mt_intersection, name='sample_QC_Hail')
pandadf1 = mt_intersection.cols().flatten()
print("Outputting table of sample qc")
pandadf1.export(
    f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}_intersection_BED_sampleQC.tsv.bgz",
    header=True)

mt_intersection = mt_intersection.checkpoint(
    f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-intersection_BED.mt",
    overwrite=True)
    impute=True, types={
        'f0': hl.tstr
    }).key_by('f0'))
vds = vds.annotate_cols(**table[vds.s])

# import covar
# dic = {}
# for i in np.arange(1, 41):
#     dic['pc' + str(i)] = hl.tfloat
# pcas = hl.import_table('gs://ukb_testdata/data/covar.txt', delimiter=' ', types=dic).key_by('FID')
# pcas = pcas.drop('IID')
# vds = vds.annotate_cols(**pcas[vds.s])
# vds.select_cols('f1')

print("current time is: ", time.asctime(time.localtime(time.time())))
bed = hl.import_bed('gs://ukb_testdata/data/Berisa.EUR.hg19_modif.bed')
print("current time is: ", time.asctime(time.localtime(time.time())))
vds = vds.annotate_rows(LD_block=bed[vds.locus].target)

gts_as_rows = vds.annotate_rows(
    mean=hl.agg.mean(hl.float(vds.GT.n_alt_alleles())),
    genotypes=hl.agg.collect(hl.float(vds.GT.n_alt_alleles())),
    phenotypes=hl.agg.collect(hl.float(vds.f1))).rows()
groups = gts_as_rows.group_by(ld_block=gts_as_rows.LD_block).aggregate(
    genotypes=hl.agg.collect(gts_as_rows.genotypes),
    ys=hl.agg.collect(gts_as_rows.phenotypes))

df = groups.to_spark()
def test_import_bed_no_reference_specified(self):
    bed_file = resource('example1.bed')
    t = hl.import_bed(bed_file, reference_genome=None)
    self.assertEqual(t.interval.dtype.point_type,
                     hl.tstruct(contig=hl.tstr, position=hl.tint32))
def main(args):
    bed_to_exclude_pca = hl.import_bed(locations_exclude_from_pca,
                                       reference_genome='GRCh38')
    cohorts_pop = hl.import_table(cohorts_populations,
                                  delimiter="\t").key_by('s')
    # overlap AKT dataset
    overlap_1kg_AKT = hl.import_matrix_table(AKT_overlap)

    # drop cohorts
    # annotate with cohorts and populations from s3 table.
    # save matrixtable
    mt = hl.read_matrix_table(args.matrixtable)
    mt = mt.annotate_cols(cohort=cohorts_pop[mt.s].cohort)
    mt = mt.annotate_cols(original_pop=cohorts_pop[mt.s].known_population)
    mt = mt.annotate_cols(known_pop=cohorts_pop[mt.s].known_population_updated)
    # mt = mt.annotate_cols(superpopulation=cohorts_pop[mt.s].superpopulation)
    mt = mt.annotate_cols(gVCF=cohorts_pop[mt.s].gVCF_ID)
    mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_new_cohorts_split_multi_pops.mt",
        overwrite=True)

    # filter matrixtable
    logger.info("wrote mt ")
    # filter mt
    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_mnp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_indel(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_complex(mt.alleles[0], mt.alleles[1]))

    mt_vqc = hl.variant_qc(mt, name='variant_QC_Hail')
    # (mt_vqc.variant_QC_Hail.p_value_hwe >= 10 ** -6) & not to use this according to hcm.
    mt_vqc_filtered = mt_vqc.filter_rows(
        (mt_vqc.variant_QC_Hail.call_rate >= 0.99) &
        (mt_vqc.variant_QC_Hail.AF[1] >= 0.05) &
        (mt_vqc.variant_QC_Hail.AF[1] <= 0.95))
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(
        hl.is_defined(bed_to_exclude_pca[mt_vqc_filtered.locus]), keep=False)

    # overlap AKT dataset:
    # overlap_1kg_AKT
    # mt_1kg_chr1_chr20 = hl.read_matrix_table(
    #     f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ancestry_work/1000g_chr1_20_AKT_overlap.mt")
    overlap_1kg_AKT = overlap_1kg_AKT.key_rows_by("locus")
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(
        hl.is_defined(overlap_1kg_AKT.rows()[mt_vqc_filtered.locus]))

    logger.info("done filtering writing mt")
    # ld pruning
    pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.2, bp_window_size=500000)
    # pruned_ht = hl.ld_prune(mt.GT, r2=0.1)
    pruned_mt = mt_vqc_filtered.filter_rows(
        hl.is_defined(pruned_ht[mt_vqc_filtered.row_key]))
    # remove pruned areas that need to be removed
    # autosomes only:
    pruned_mt = pruned_mt.filter_rows(pruned_mt.locus.in_autosome())

    pruned_mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/chr1_chr20_ldpruned_updated.mt",
        overwrite=True)

    # pruned_mt = hl.read_matrix_table(
    #     f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt")
    # related_samples_to_drop = hl.read_table(
    #     f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_related_samples_to_remove.ht")

    logger.info("run_pca_with_relateds")
    # pca_evals, pca_scores, pca_loadings = run_pca_with_relateds(
    #     pruned_mt, related_samples_to_drop, autosomes_only=True)
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pruned_mt.GT, k=10, compute_loadings=True)
    pca_scores = pca_scores.annotate(
        known_pop=pruned_mt.cols()[pca_scores.s].known_pop)
    # mt = mt.annotate_cols(
    #     loadings=pca_loadings[mt_vqc_filtered.col_key].loadings)
    # mt = mt.annotate_cols(known_pop="unk")
    # pca_scores = pca_scores.annotate(known_pop="unk")

    pca_scores.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_scores_after_pruning.ht",
        overwrite=True)
    pca_loadings.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_loadings_after_pruning.ht",
        overwrite=True)

    with open(f"{args.output_dir}/ddd-elgh-ukbb/pca_evals_after_pruning.txt", 'w') as f:
        for val in pca_evals:
            f.write(str(val))

    logger.info("assign population pcs")
population pcs") pop_ht, pop_clf = assign_population_pcs(pca_scores, pca_scores.scores, known_col="known_pop", n_estimators=100, prop_train=0.8, min_prob=0.5) pop_ht.write(f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.ht", overwrite=True) pop_ht.export( f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.tsv.gz")