def test_liftover_strand(self):
    grch37 = hl.get_reference('GRCh37')
    grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')

    self.assertEqual(
        hl.eval(
            hl.liftover(hl.locus('20', 60001, 'GRCh37'), 'GRCh38', include_strand=True)),
        hl.eval(
            hl.struct(result=hl.locus('chr20', 79360, 'GRCh38'),
                      is_negative_strand=False)))

    self.assertEqual(
        hl.eval(
            hl.liftover(hl.locus_interval('20', 37007582, 37007586, True, True, 'GRCh37'),
                        'GRCh38', include_strand=True)),
        hl.eval(
            hl.struct(result=hl.locus_interval('chr12', 32563117, 32563121, True, True, 'GRCh38'),
                      is_negative_strand=True)))

    with self.assertRaises(FatalError):
        hl.eval(
            hl.liftover(
                hl.parse_locus_interval('1:10000-10000', reference_genome='GRCh37'),
                'GRCh38'))

    grch37.remove_liftover("GRCh38")
def test_reference_genome_liftover(self):
    grch37 = hl.get_reference('GRCh37')
    grch38 = hl.get_reference('GRCh38')

    self.assertTrue(not grch37.has_liftover('GRCh38') and not grch38.has_liftover('GRCh37'))
    grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
    grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')
    assert grch37.has_liftover('GRCh38')
    assert grch38.has_liftover('GRCh37')

    ds = hl.import_vcf(resource('sample.vcf'))
    t = ds.annotate_rows(liftover=hl.liftover(hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
    assert t.all(t.locus == t.liftover)

    null_locus = hl.null(hl.tlocus('GRCh38'))

    rows = [
        {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 60001, 'GRCh37'), 'l38': hl.locus('chr20', 79360, 'GRCh38')},
        {'l37': hl.locus('20', 278686, 'GRCh37'), 'l38': hl.locus('chr20', 298045, 'GRCh38')},
        {'l37': hl.locus('20', 278687, 'GRCh37'), 'l38': hl.locus('chr20', 298046, 'GRCh38')},
        {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278691, 'GRCh37'), 'l38': hl.locus('chr20', 298047, 'GRCh38')},
        {'l37': hl.locus('20', 37007586, 'GRCh37'), 'l38': hl.locus('chr12', 32563117, 'GRCh38')},
        {'l37': hl.locus('20', 62965520, 'GRCh37'), 'l38': hl.locus('chr20', 64334167, 'GRCh38')},
        {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus}
    ]
    schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
    t = hl.Table.parallelize(rows, schema)
    self.assertTrue(t.all(hl.cond(hl.is_defined(t.l38),
                                  hl.liftover(t.l37, 'GRCh38') == t.l38,
                                  hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

    t = t.filter(hl.is_defined(t.l38))
    self.assertTrue(t.count() == 6)
    t = t.key_by('l38')
    t.count()
    self.assertTrue(list(t.key) == ['l38'])

    null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
    rows = [
        {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'),
         'i38': null_locus_interval},
        {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
         'i38': hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')}
    ]
    schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)), i38=hl.tinterval(hl.tlocus(grch38)))
    t = hl.Table.parallelize(rows, schema)
    self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))

    grch37.remove_liftover("GRCh38")
    grch38.remove_liftover("GRCh37")
def _import_purcell_5k(path) -> hl.Table:
    p5k = hl.import_locus_intervals(path, reference_genome='GRCh37')
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    if not rg37.has_liftover('GRCh38'):
        rg37.add_liftover(
            'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)
    # Lift over both endpoints of each interval
    p5k = p5k.annotate(start=hl.liftover(p5k.interval.start, 'GRCh38'),
                       end=hl.liftover(p5k.interval.end, 'GRCh38'))
    p5k = p5k.filter((p5k.start.contig == 'chr' + p5k.interval.start.contig)
                     & (p5k.end.contig == 'chr' + p5k.interval.end.contig))
    p5k = p5k.key_by()
    p5k = p5k.select(locus=p5k.start, locus_b37=p5k.interval.start)
    return p5k.key_by('locus')
def _import_purcell_5k(path) -> hl.Table:
    p5k = hl.import_locus_intervals(path, reference_genome="GRCh37")
    rg37 = hl.get_reference("GRCh37")
    rg38 = hl.get_reference("GRCh38")
    if not rg37.has_liftover("GRCh38"):
        rg37.add_liftover("references/grch37_to_grch38.over.chain.gz", rg38)
    # Lift over both endpoints of each interval
    p5k = p5k.annotate(
        start=hl.liftover(p5k.interval.start, "GRCh38"),
        end=hl.liftover(p5k.interval.end, "GRCh38"),
    )
    p5k = p5k.filter((p5k.start.contig == "chr" + p5k.interval.start.contig)
                     & (p5k.end.contig == "chr" + p5k.interval.end.contig))
    p5k = p5k.key_by()
    p5k = p5k.select(locus=p5k.start, locus_b37=p5k.interval.start)
    return p5k.key_by("locus")
def get_liftover_v2_qc_mt(data_type: str,
                          ld_pruned: bool,
                          release_only: bool = False,
                          overwrite: bool = False) -> hl.MatrixTable:
    """
    Returns MatrixTable for sample QC purposes on build 38: can be exomes, genomes,
    or joint (joint dataset can also be ld_pruned=True).
    Criteria: callrate > 0.99, AF > 0.001, SNPs only, bi-allelics only.
    Note: sites where the locus changes chromosome are discarded.
    """
    path = qc_mt_path(data_type, ld_pruned, 'GRCh38')
    if not overwrite and hl.hadoop_exists(path):
        grch38_qc_mt = hl.read_matrix_table(path)
    else:
        grch38_qc_mt = hl.read_matrix_table(
            qc_mt_path(data_type, ld_pruned=ld_pruned))
        get_liftover_genome(grch38_qc_mt)
        grch38_qc_mt = grch38_qc_mt.key_rows_by()
        grch38_qc_mt = grch38_qc_mt.transmute_rows(
            locus=hl.liftover(grch38_qc_mt.locus, 'GRCh38'),
            locus37=grch38_qc_mt.locus)
        grch38_qc_mt = grch38_qc_mt.filter_rows(
            grch38_qc_mt.locus.contig == 'chr' + grch38_qc_mt.locus37.contig)
        grch38_qc_mt = grch38_qc_mt.key_rows_by(locus=grch38_qc_mt.locus,
                                                alleles=grch38_qc_mt.alleles)
        grch38_qc_mt = grch38_qc_mt.checkpoint(path, overwrite=overwrite)

    if release_only:
        meta = get_gnomad_meta(data_type)
        grch38_qc_mt = grch38_qc_mt.filter_cols(
            meta[grch38_qc_mt.col_key].release)

    return grch38_qc_mt
def intersect_target_ref(ref_mt_filt, snp_list, grch37_or_grch38, intersect_out,
                         overwrite: bool = False):
    mt = hl.read_matrix_table(ref_mt_filt)
    if grch37_or_grch38.lower() == 'grch38':
        snp_list = snp_list.key_by(locus=hl.locus(hl.str(snp_list.chr),
                                                  hl.int(snp_list.pos),
                                                  reference_genome='GRCh38'),
                                   alleles=[snp_list.ref, snp_list.alt])
        mt = mt.filter_rows(hl.is_defined(snp_list[mt.row_key]))
    elif grch37_or_grch38.lower() == 'grch37':
        snp_list = snp_list.key_by(locus=hl.locus(hl.str(snp_list.chr),
                                                  hl.int(snp_list.pos),
                                                  reference_genome='GRCh37'),
                                   alleles=[snp_list.ref, snp_list.alt])
        # liftover snp list to GRCh38, filter to SNPs in mt
        rg37, rg38 = load_liftover()

        snp_liftover = snp_list.annotate(
            new_locus=hl.liftover(snp_list.locus, 'GRCh38'))
        snp_liftover = snp_liftover.filter(
            hl.is_defined(snp_liftover.new_locus))
        snp_liftover = snp_liftover.key_by(locus=snp_liftover.new_locus,
                                           alleles=snp_liftover.alleles)
        mt = mt.filter_rows(hl.is_defined(snp_liftover[mt.row_key]))

    mt = mt.repartition(5000)
    mt = mt.checkpoint(intersect_out, overwrite=overwrite,
                       _read_if_exists=not overwrite)
def main(args):
    full_vcf = hl.read_matrix_table(args.allreads_prefix + '.mt')

    # liftover chains
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)

    chips = hl.hadoop_open(args.chip_loci)
    chip_dict = {}
    for chip in chips:
        chip = chip.strip().split()
        chip_pos = hl.import_table(chip[1],
                                   filter=r'\[Controls\]',
                                   skip_blank_lines=True)
        chip_pos = chip_pos.filter(
            hl.array(list(map(str, range(1, 23))) + ['X', 'Y']).contains(
                chip_pos.chr))
        chip_pos = chip_pos.key_by(
            locus=hl.locus(chip_pos.chr, hl.int(chip_pos.pos)))

        # liftover chip position info
        chip_pos = chip_pos.annotate(
            new_locus=hl.liftover(chip_pos.locus, 'GRCh38'))
        chip_pos = chip_pos.filter(hl.is_defined(chip_pos.new_locus))
        chip_pos = chip_pos.key_by(locus=chip_pos.new_locus)

        # filter full vcf to sites in genotype data
        geno_vcf = full_vcf.filter_rows(hl.is_defined(
            chip_pos[full_vcf.locus]))
        hl.export_vcf(
            geno_vcf,
            'gs://neurogap/high_coverage/NeuroGap_30x_' + chip[0] + '.vcf.bgz')
def get_gnomad_ld_pruned_mt(genome_version="GRCh38"):
    gnomad_ld_pruned_hg37_path = "gs://gnomad/sample_qc/mt/gnomad.joint.high_callrate_common_biallelic_snps.pruned.mt"
    gnomad_ld_pruned_hg38_path = "gs://seqr-bw/ref/GRCh38/gnomad_ld_pruned.mt"

    if genome_version == "GRCh37":
        return hl.read_matrix_table(gnomad_ld_pruned_hg37_path)
    elif genome_version == "GRCh38":
        if not file_exists(gnomad_ld_pruned_hg38_path):
            grch37 = hl.get_reference('GRCh37')
            try:
                # Hail doesn't like it when you try to add the chain file more than once
                grch37.add_liftover(
                    "gs://hail-common/references/grch37_to_grch38.over.chain.gz", 'GRCh38')
            except Exception:
                # in case the lift-over chain file was added previously
                pass

            gnomad_ld_pruned_mt = hl.read_matrix_table(gnomad_ld_pruned_hg37_path)
            gnomad_ld_pruned_mt = gnomad_ld_pruned_mt.annotate_rows(
                liftover_locus=hl.liftover(gnomad_ld_pruned_mt.locus, 'GRCh38'))
            gnomad_ld_pruned_mt = gnomad_ld_pruned_mt.filter_rows(
                hl.is_defined(gnomad_ld_pruned_mt.liftover_locus))
            gnomad_ld_pruned_mt = gnomad_ld_pruned_mt.rename(
                {'locus': 'locus_grch37'}).rename({'liftover_locus': 'locus'})
            gnomad_ld_pruned_mt = gnomad_ld_pruned_mt.key_rows_by('locus', 'alleles')
            gnomad_ld_pruned_mt.write(gnomad_ld_pruned_hg38_path)
        return hl.read_matrix_table(gnomad_ld_pruned_hg38_path)
    else:
        raise ValueError(f"Invalid genome version: {genome_version}")
def liftover_to_grch38(input_type: str = None,
                       dirname: str = None,
                       basename: str = None):
    lifted_over = f'{dirname}{basename}.liftover.grch38.mt'
    print('\nLifting over to GRCh38')

    mt = read_infile(input_type=input_type, dirname=dirname, basename=basename)

    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)

    mt = mt.annotate_rows(new_locus=hl.liftover(mt.locus, 'GRCh38', include_strand=True),
                          old_locus=mt.locus)
    mt = mt.filter_rows(
        hl.is_defined(mt.new_locus) & ~mt.new_locus.is_negative_strand)
    mt = mt.key_rows_by(locus=mt.new_locus.result, alleles=mt.alleles)

    print(f'\nWriting out data lifted-over to GRCh38 to: {lifted_over}')
    mt.write(lifted_over)

    return hl.read_matrix_table(lifted_over)
def liftover_expr(locus: hl.expr.LocusExpression,
                  alleles: hl.expr.ArrayExpression,
                  destination_ref: hl.ReferenceGenome) -> hl.expr.StructExpression:
    lifted_over_locus = hl.liftover(locus, destination_ref, include_strand=True)
    lifted_over_alleles = alleles.map(
        lambda a: hl.if_else(lifted_over_locus.is_negative_strand,
                             hl.reverse_complement(a), a))
    return hl.struct(locus=lifted_over_locus.result, alleles=lifted_over_alleles)
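# Usage sketch (not from the original source): applying liftover_expr above to a
# MatrixTable keyed by (locus, alleles) on GRCh37. The chain-file path and the
# input MatrixTable `mt` are assumptions for illustration only.
import hail as hl

rg37 = hl.get_reference('GRCh37')
rg38 = hl.get_reference('GRCh38')
if not rg37.has_liftover('GRCh38'):
    rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)

# Annotate rows with the lifted-over struct, drop failed sites, re-key on GRCh38.
mt = mt.annotate_rows(liftover=liftover_expr(mt.locus, mt.alleles, rg38))
mt = mt.filter_rows(hl.is_defined(mt.liftover.locus))
mt = mt.key_rows_by(locus=mt.liftover.locus, alleles=mt.liftover.alleles)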
def add_37_coordinates(mt):
    """Annotates the GRCh38 MT with GRCh37 coordinates using Hail's built-in liftover.

    :param mt: MatrixTable from VCF
    :return: MatrixTable annotated with GRCh37 coordinates
    """
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg38.add_liftover(
        'gs://hail-common/references/grch38_to_grch37.over.chain.gz', rg37)
    mt = mt.annotate_rows(rg37_locus=hl.liftover(mt.locus, 'GRCh37'))
    return mt
def main():
    # Parse args
    args = parse_args()

    # Prepare liftover
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(args.chainfile, rg38)

    # Create my own rg38 with altered names
    rg38_custom_contigs = [
        contig.replace('chr', '') for contig in rg38.contigs
    ]
    rg38_custom_lens = {}
    for contig in rg38.lengths:
        rg38_custom_lens[contig.replace('chr', '')] = rg38.lengths[contig]
    rg38_custom = hl.ReferenceGenome('rg38_custom', rg38_custom_contigs,
                                     rg38_custom_lens)

    # Load plink
    mt = hl.import_plink(bed=args.in_plink + '.bed',
                         bim=args.in_plink + '.bim',
                         fam=args.in_plink + '.fam',
                         reference_genome='GRCh37',
                         min_partitions=args.min_partitions)

    # # Re-call to remove phasing (required for plink output)
    # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False))

    # Liftover
    mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38'))

    # Strip chr from contig name (causes problems with GCTA)
    mt = mt.annotate_rows(
        contig_GRCh38=mt.locus_GRCh38.contig.replace('chr', ''))

    # Swap GRCh37 locus for GRCh38 (but have to use rg38_custom)
    mt = mt.key_rows_by()
    mt = mt.annotate_rows(locus=hl.locus(mt.contig_GRCh38,
                                         mt.locus_GRCh38.position,
                                         reference_genome=rg38_custom))
    mt = mt.key_rows_by(mt.locus, mt.alleles)

    # Remove rows with missing locus (after liftover)
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    # Write plink format
    hl.export_plink(dataset=mt, output=args.out_plink)

    return 0
def main(args):
    covs = ['0.5', '1', '2', '4', '6']

    # liftover gencove data
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)

    for cov in ['1']:
        all_vcf = hl.hadoop_open(
            'gs://neurogap/high_coverage/gencove/fastq_keep.txt')
        header = all_vcf.readline().split()
        header_index = dict(zip(header, list(range(len(header)))))
        comb = None

        if args.merge_vcf:
            for line in all_vcf:
                # read and merge
                line = line.split()
                if (line[header_index['depth']] == cov
                        and line[header_index['gencove_qc']] == 'PASS'):
                    vcf = hl.import_vcf(line[header_index['path']],
                                        force_bgz=True,
                                        reference_genome='GRCh37',
                                        min_partitions=100)
                    if comb is None:
                        comb = vcf
                    else:
                        comb = comb.union_cols(vcf)

            # write out mt
            # comb = comb.naive_coalesce(5000)
            comb.write('gs://neurogap/high_coverage/gencove/merge_' + cov + '_hg19.mt',
                       overwrite=args.overwrite)

        if args.liftover:
            comb = hl.read_matrix_table(
                'gs://neurogap/high_coverage/gencove/merge_' + cov + '_hg19.mt')
            comb = comb.annotate_rows(
                new_locus=hl.liftover(comb.locus, 'GRCh38'))
            comb = comb.filter_rows(hl.is_defined(comb.new_locus))
            comb = comb.key_rows_by(locus=comb.new_locus, alleles=comb.alleles)

            # write out mt
            comb.write('gs://neurogap/high_coverage/gencove/merge_' + cov + '_grch38.mt',
                       overwrite=args.overwrite)
def lift_data(
    t: Union[hl.MatrixTable, hl.Table],
    gnomad: bool,
    data_type: str,
    path: str,
    rg: hl.genetics.ReferenceGenome,
    overwrite: bool,
) -> Union[hl.MatrixTable, hl.Table]:
    """
    Lifts input Table or MatrixTable from one reference build to another

    :param t: Table or MatrixTable
    :param gnomad: Whether data is gnomAD data
    :param data_type: Data type (exomes or genomes for gnomAD; not used otherwise)
    :param path: Path to input Table/MatrixTable (if data is not gnomAD data)
    :param rg: Reference genome
    :param overwrite: Whether to overwrite data
    :return: Table or MatrixTable with liftover annotations
    """
    logger.info("Annotating input with liftover coordinates")
    liftover_expr = {
        "new_locus": hl.liftover(t.locus, rg, include_strand=True),
        "old_locus": t.locus,
    }
    t = (t.annotate(**liftover_expr)
         if isinstance(t, hl.Table)
         else t.annotate_rows(**liftover_expr))

    no_target_expr = hl.agg.count_where(hl.is_missing(t.new_locus))
    num_no_target = (t.aggregate(no_target_expr)
                     if isinstance(t, hl.Table)
                     else t.aggregate_rows(no_target_expr))

    logger.info(f"Filtering out {num_no_target} sites that failed to liftover")
    keep_expr = hl.is_defined(t.new_locus)
    t = t.filter(keep_expr) if isinstance(t, hl.Table) else t.filter_rows(keep_expr)

    row_key_expr = {"locus": t.new_locus.result, "alleles": t.alleles}
    t = (t.key_by(**row_key_expr)
         if isinstance(t, hl.Table)
         else t.key_rows_by(**row_key_expr))

    logger.info("Writing out lifted over data")
    t = t.checkpoint(
        get_checkpoint_path(gnomad, data_type, path, isinstance(t, hl.Table)),
        overwrite=overwrite,
    )

    return t
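# Usage sketch (illustrative, not from the original source): lifting a non-gnomAD
# Table with lift_data above. The input path is hypothetical, and the
# GRCh37 -> GRCh38 chain file must already be registered on the source build.
rg37 = hl.get_reference("GRCh37")
rg38 = hl.get_reference("GRCh38")
if not rg37.has_liftover("GRCh38"):
    rg37.add_liftover("gs://hail-common/references/grch37_to_grch38.over.chain.gz", rg38)

ht = hl.read_table("gs://my-bucket/my_sites.ht")  # hypothetical input
ht = lift_data(ht, gnomad=False, data_type=None,
               path="gs://my-bucket/my_sites.ht", rg=rg38, overwrite=True)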
def liftover_expr(
    locus: hl.expr.LocusExpression,
    alleles: hl.expr.ArrayExpression,
    destination_reference: hl.ReferenceGenome,
) -> hl.expr.StructExpression:
    """
    Generates struct liftover expression.

    Struct contains:
        - new_locus: Liftover coordinates
        - new_alleles: Liftover alleles
        - original_locus: Locus prior to liftover
        - original_alleles: Alleles prior to liftover
        - locus_fail_liftover: Whether the locus failed liftover
        - ref_allele_mismatch: Whether the allele at index 0 of new_alleles (lifted over
          reference allele) doesn't match the allele at that position in the
          destination reference

    :param locus: Input locus.
    :param alleles: Input alleles.
    :param destination_reference: Desired reference genome build for liftover.
    :return: Struct containing expressions for lifted over locus/alleles as well as original locus/alleles.
    """
    lifted_over_locus = hl.liftover(locus, destination_reference, include_strand=True)
    lifted_over_alleles = alleles.map(
        lambda a: hl.if_else(
            lifted_over_locus.is_negative_strand, hl.reverse_complement(a), a
        )
    )
    return hl.struct(
        new_locus=lifted_over_locus.result,
        new_alleles=lifted_over_alleles,
        original_locus=locus,
        original_alleles=alleles,
        locus_fail_liftover=hl.is_missing(lifted_over_locus),
        ref_allele_mismatch=lifted_over_locus.result.sequence_context()
        != lifted_over_alleles[0],
    )
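# Note (an assumption about Hail's API, not stated in the original source):
# ref_allele_mismatch above calls locus.sequence_context(), which requires the
# destination reference genome to have a FASTA sequence attached; without one,
# evaluating the expression fails. A sketch with assumed resource paths:
rg38 = hl.get_reference('GRCh38')
if not rg38.has_sequence():
    rg38.add_sequence(
        'gs://hail-common/references/Homo_sapiens_assembly38.fasta.gz',
        'gs://hail-common/references/Homo_sapiens_assembly38.fasta.fai')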
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    gnomad_loadings_path = f'{output}/gnomad_loadings_90k_liftover.ht'

    # liftover and get variants
    ht_gnomad_loadings = hl.read_table(GNOMAD_V2_LOADINGS)
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)
    ht_gnomad_loadings_liftover = ht_gnomad_loadings.annotate(
        liftover=hl.liftover(ht_gnomad_loadings.locus, 'GRCh38', include_strand=False),
        old_locus=ht_gnomad_loadings.locus,
    )
    ht_gnomad_loadings_liftover = ht_gnomad_loadings_liftover.key_by(
        locus=ht_gnomad_loadings_liftover.liftover)

    # save gnomad loadings
    ht_gnomad_loadings_liftover.write(gnomad_loadings_path, overwrite=True)
def main():
    # # Args (local)
    # chrom = 11
    # chain_file = '/Users/em21/Projects/ot_genetics/genetics-sumstats_data/extras/prepare_uk_biobank_gwas_catalog/sitelist/input_data/grch37_to_grch38.over.chain.gz'
    # in_bgen = 'example_data/ukb_imp_chr{chrom}_v3.example.bgen'
    # in_sample = 'output/ukb_10k_downsampled.sample'
    # to_keep_list = 'output/ukb_10k_downsampled.sample_list.tsv'
    # out_plink = 'output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    # cores = 1  # Use "*" for all
    # maf_threshold = 0.001

    # Args (server)
    chrom = sys.argv[1]
    chain_file = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/grch37_to_grch38.over.chain.gz'
    in_bgen = '/nfs/users/nfs_e/em21/otcoregen/uk_biobank_data/data/genetics/imputation/ukb_imp_chr{chrom}_v3.bgen'
    in_sample = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample'
    to_keep_list = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample_list.tsv'
    out_plink = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    cores = sys.argv[2]  # Use "*" for all
    maf_threshold = 0.001

    # Set the maximum number of cores
    hl.init(master="local[{}]".format(cores))

    # Prepare liftover
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(chain_file, rg38)

    # Create my own rg38 with altered names
    rg38_custom_contigs = [
        contig.replace('chr', '') for contig in rg38.contigs
    ]
    rg38_custom_lens = {}
    for contig in rg38.lengths:
        rg38_custom_lens[contig.replace('chr', '')] = rg38.lengths[contig]
    rg38_custom = hl.ReferenceGenome('rg38_custom', rg38_custom_contigs,
                                     rg38_custom_lens)

    print('Processing chromosome {0}'.format(chrom))

    # Index bgen if not existing
    if not hl.hadoop_exists(in_bgen.format(chrom=chrom) + '.idx2'):
        hl.index_bgen(in_bgen.format(chrom=chrom),
                      contig_recoding={
                          "01": "1", "02": "2", "03": "3",
                          "04": "4", "05": "5", "06": "6",
                          "07": "7", "08": "8", "09": "9"
                      },
                      reference_genome='GRCh37')

    # Load bgen
    mt = hl.import_bgen(in_bgen.format(chrom=chrom),
                        entry_fields=['GT'],
                        sample_file=in_sample)

    # Load list samples to keep
    samples_to_keep = hl.import_table(to_keep_list,
                                      no_header=True,
                                      impute=False,
                                      types={'f0': hl.tstr}).key_by('f0')

    # Downsample to required subset of samples
    mt = mt.filter_cols(hl.is_defined(samples_to_keep[mt.s]))

    # # Re-call to remove phasing (required for plink output)
    # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False))

    # Filter on MAF
    mt = hl.variant_qc(mt)
    mt = mt.annotate_rows(variant_qc=mt.variant_qc.annotate(
        MAF=hl.min(mt.variant_qc.AF)))
    mt = mt.filter_rows(mt.variant_qc.MAF >= maf_threshold)

    # Liftover
    mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38'))

    # Strip chr from contig name (causes problems with GCTA)
    mt = mt.annotate_rows(
        contig_GRCh38=mt.locus_GRCh38.contig.replace('chr', ''))

    # Swap GRCh37 locus for GRCh38 (but have to use rg38_custom)
    mt = mt.key_rows_by()
    mt = mt.annotate_rows(locus=hl.locus(mt.contig_GRCh38,
                                         mt.locus_GRCh38.position,
                                         reference_genome=rg38_custom))
    mt = mt.key_rows_by(mt.locus, mt.alleles)

    # Remove rows with missing locus (after liftover)
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    # Write plink format
    hl.export_plink(dataset=mt, output=out_plink.format(chrom=chrom))

    return 0
parser.add_argument('-d', required=True, choices=['scores', 'elements'],
                    help='GERP++ dataset to load.')
parser.add_argument('-b', required=True, choices=['GRCh37', 'GRCh38'],
                    help='Reference genome build to load.')
args = parser.parse_args()

hg19 = hl.ReferenceGenome.from_fasta_file(
    'hg19',
    'gs://hail-datasets-extracted-data/assemblies/ucsc.hg19.fasta.gz',
    'gs://hail-datasets-extracted-data/assemblies/ucsc.hg19.fasta.fai')

if args.d == 'scores':
    name = 'GERP_scores'
    ht = hl.import_table('gs://hail-datasets-extracted-data/GERP++/GERP++_scores.hg19.tsv.bgz',
                         types={'position': hl.tint, 'N': hl.tfloat, 'S': hl.tfloat},
                         min_partitions=300)
    ht = ht.annotate(locus=hl.locus('chr' + ht['chromosome'].replace('MT', 'M'),
                                    ht['position'], 'hg19'))
    if args.b == 'GRCh37':
        hg19.add_liftover('gs://hail-datasets-extracted-data/assemblies/hg19tob37.chain.gz',
                          'GRCh37')
        ht = ht.annotate(locus=hl.liftover(ht['locus'], 'GRCh37'))
    if args.b == 'GRCh38':
        hg19.add_liftover('gs://hail-datasets-extracted-data/assemblies/hg19ToHg38.over.chain.gz',
                          'GRCh38')
        ht = ht.annotate(locus=hl.liftover(ht['locus'], 'GRCh38'))
    ht = ht.filter(hl.is_defined(ht['locus']))
    ht = ht.select('locus', 'N', 'S')
    ht = ht.key_by('locus')

if args.d == 'elements':
    name = 'GERP_elements'
    ht = hl.import_table('gs://hail-datasets-extracted-data/GERP++/GERP++_elements.hg19.tsv.bgz',
                         types={'start': hl.tint, 'end': hl.tint,
                                'S': hl.tfloat, 'p_value': hl.tfloat})
    ht = ht.annotate(interval=hl.interval(hl.locus(ht['chromosome'], ht['start'], 'hg19'),
                                          hl.locus(ht['chromosome'], ht['end'], 'hg19')))
    if args.b == 'GRCh37':
        hg19.add_liftover('gs://hail-datasets-extracted-data/assemblies/hg19tob37.chain.gz',
                          'GRCh37')
import hail as hl

mt = hl.read_matrix_table(
    'gs://hail-datasets/hail-data/gtex_v7_exon_read_counts.GRCh37.mt')

b37 = hl.get_reference('GRCh37')
b37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz',
                 'GRCh38')

mt = mt.annotate_rows(interval=hl.liftover(mt.interval, 'GRCh38'))
mt.describe()
mt.write(
    'gs://hail-datasets/hail-data/gtex_v7_exon_read_counts.GRCh38.liftover.mt',
    overwrite=True)
                        'position': hl.tint,
                        'N': hl.tfloat,
                        'S': hl.tfloat
                    })

hg19 = hl.ReferenceGenome.from_fasta_file(
    'hg19',
    'gs://hail-datasets/raw-data/assemblies/ucsc.hg19.fasta.gz',
    'gs://hail-datasets/raw-data/assemblies/ucsc.hg19.fasta.fai')
hg19.add_liftover('gs://hail-datasets/raw-data/assemblies/hg19tob37.chain.gz',
                  'GRCh37')

ht = ht.annotate(locus=hl.locus(ht.contig, ht.position, 'hg19'))
ht.write('hdfs:///tmp/gerp_scores.hg19.ht', overwrite=True)

ht = hl.read_table('hdfs:///tmp/gerp_scores.hg19.ht')
ht = ht.annotate(locus=hl.liftover(ht.locus, 'GRCh37'))
ht = ht.filter(hl.is_defined(ht.locus), keep=True)
ht = ht.select(ht.locus, ht.N, ht.S)
ht = ht.key_by(ht.locus)

n_rows = ht.count()
n_partitions = ht.n_partitions()

ht = ht.annotate_globals(name=name,
                         version=version,
                         reference_genome=reference_genome,
                         n_rows=n_rows,
                         n_partitions=n_partitions)
ht.describe()
ht.write('gs://hail-datasets/hail-data/{n}.{rg}.ht'.format(
import hail as hl

mt = hl.read_matrix_table(
    'gs://hail-datasets/hail-data/gtex_v7_eqtl_significant_associations.GRCh37.mt')
mt.describe()

b37 = hl.get_reference('GRCh37')
b37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz',
                 'GRCh38')

mt = mt.annotate_rows(liftover_locus=hl.liftover(mt.locus, 'GRCh38'))
mt = mt.filter_rows(hl.is_defined(mt.liftover_locus), keep=True)
mt = mt.partition_rows_by(['liftover_locus'], 'liftover_locus', 'alleles', 'gene_id')
mt = mt.drop(mt.locus)
mt = mt.rename({'liftover_locus': 'locus'})
mt.describe()
mt.write(
    'gs://hail-datasets/hail-data/gtex_v7_eqtl_significant_associations.GRCh38.liftover.mt',
    overwrite=True)
build = args.b

ht = hl.import_table(f'{raw_data_root}/DANN_GRCh37.tsv.bgz',
                     types={'position': hl.tint, 'DANN_score': hl.tfloat})
ht = ht.annotate(locus=hl.locus(ht['chromosome'], ht['position'], 'GRCh37'),
                 alleles=hl.array([ht['ref'], ht['alt']]))

if build == 'GRCh38':
    b37 = hl.get_reference('GRCh37')
    b37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', 'GRCh38')
    ht = ht.annotate(locus=hl.liftover(ht['locus'], 'GRCh38'))
    ht = ht.filter(hl.is_defined(ht['locus']))

n_rows = ht.count()
n_partitions = ht.n_partitions()

ht = ht.key_by('locus', 'alleles')
ht = ht.rename({'DANN_score': 'score'})
ht = ht.select('score')
ht = ht.annotate_globals(metadata=hl.struct(name=name,
                                            version=hl.null(hl.tstr),
                                            reference_genome=build,
                                            n_rows=n_rows,
                                            n_partitions=n_partitions))
def main(args):
    ht_snp = hl.import_table(args.snp, impute=True)
    ht_snp = ht_snp.annotate(variant=hl.delimit(
        [ht_snp.chromosome, hl.str(ht_snp.position), ht_snp.allele1, ht_snp.allele2],
        delimiter=':'))
    ht_snp = ht_snp.annotate(
        **hl.parse_variant(ht_snp.variant, reference_genome='GRCh38'))
    ht_snp = ht_snp.key_by('locus', 'alleles')
    ht_snp = ht_snp.add_index('idx_snp')
    ht_snp = ht_snp.checkpoint(new_temp_file())

    # annotate vep
    gnomad = hl.read_table(
        'gs://gnomad-public-requester-pays/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht')
    ht_snp = ht_snp.join(gnomad.select('vep'), how='left')
    ht_snp = process_consequences(ht_snp)

    # extract most severe
    ht_snp = ht_snp.annotate(
        vep=(hl.case()
             .when(hl.is_defined(ht_snp.vep.worst_csq_for_variant_canonical),
                   ht_snp.vep.worst_csq_for_variant_canonical)
             .when(hl.is_defined(ht_snp.vep.worst_csq_for_variant),
                   ht_snp.vep.worst_csq_for_variant)
             .or_missing()),
        is_canonical_vep=hl.is_defined(ht_snp.vep.worst_csq_for_variant_canonical))
    ht_snp = ht_snp.annotate(
        most_severe=hl.if_else(hl.is_defined(ht_snp.vep),
                               ht_snp.vep.most_severe_consequence,
                               'intergenic_variant'),
        gene_most_severe=ht_snp.vep.gene_symbol)
    ht_snp = ht_snp.select_globals()
    ht_snp = ht_snp.drop('vep')
    ht_snp = ht_snp.annotate(
        **annotate_consequence_category(ht_snp.most_severe))
    ht_snp = ht_snp.checkpoint(new_temp_file())

    df = ht_snp.key_by().drop('locus', 'alleles', 'variant', 'idx_snp').to_pandas()

    # annotate LD
    for pop in POPS:
        ht = hl.read_table(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.adj.ld.variant_indices.ht')
        ht = ht.annotate(locus_hg38=hl.liftover(ht.locus, 'GRCh38'))
        ht = ht.filter(hl.is_defined(ht.locus_hg38))
        ht = ht.key_by('locus_hg38', 'alleles').drop('locus')

        ht = ht_snp.join(ht, 'inner')
        ht = ht.checkpoint(new_temp_file())

        lead_idx = ht.order_by(hl.desc(ht.prob)).head(1).idx.collect()
        idx = ht.idx.collect()
        bm = BlockMatrix.read(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.ld.bm')
        bm = bm.filter(idx, idx)

        # re-densify triangular matrix
        bm = bm + bm.T - get_diag_mat(bm.diagonal())
        bm = bm.filter_rows(
            np.where(np.array(idx) == lead_idx[0])[0].tolist()) ** 2

        idx_snp = ht.idx_snp.collect()
        r2 = bm.to_numpy()[0]
        df[f'gnomad_lead_r2_{pop}'] = np.nan
        df[f'gnomad_lead_r2_{pop}'].iloc[idx_snp] = r2

    if args.out.startswith('gs://'):
        fopen = hl.hadoop_open
    else:
        fopen = open

    with fopen(args.out, 'w') as f:
        df.to_csv(f, sep='\t', na_rep='NA', index=False)
def main(gnomad_file, chain_file, out_folder, test=None):

    # Output files:
    out_parquet = f'{out_folder}/variant-annotation.parquet'

    # Load data
    ht = hl.read_table(gnomad_file)

    # If process is being tested, take head:
    if test:
        ht = ht.head(test)

    # Assert that all alleles are biallelic:
    assert ht.all(ht.alleles.length() == 2), 'Mono- or multiallelic variants have been found.'

    # Extracting AF indices of populations:
    population_indices = ht.globals.freq_index_dict.collect()[0]
    population_indices = {pop: population_indices[f'{pop}-adj'] for pop in POPULATIONS}

    # Generate struct for alt. allele frequency in selected populations:
    ht = ht.annotate(af=hl.struct(
        **{pop: ht.freq[index].AF for pop, index in population_indices.items()}))

    # Add chain file
    grch37 = hl.get_reference('GRCh37')
    grch38 = hl.get_reference('GRCh38')
    grch38.add_liftover(chain_file, grch37)

    # Liftover
    ht = ht.annotate(
        locus_GRCh37=hl.liftover(ht.locus, 'GRCh37')
    )

    # Adding build-specific coordinates to the table:
    ht = ht.annotate(
        chrom_b38=ht.locus.contig.replace('chr', ''),
        pos_b38=ht.locus.position,
        chrom_b37=ht.locus_GRCh37.contig.replace('chr', ''),
        pos_b37=ht.locus_GRCh37.position,
        ref=ht.alleles[0],
        alt=ht.alleles[1],
        allele_type=ht.allele_info.allele_type
    )

    # Updating table:
    ht = ht.annotate(
        # Updating CADD column:
        cadd=ht.cadd.rename({'raw_score': 'raw'}).drop('has_duplicate'),
        # Adding locus as new column:
        locus_GRCh38=ht.locus
    )

    # Drop all global annotations:
    ht = ht.select_globals()

    # Drop unnecessary VEP fields
    ht = ht.annotate(
        vep=ht.vep.drop(
            'assembly_name', 'allele_string', 'ancestral', 'context', 'end',
            'id', 'input', 'intergenic_consequences', 'seq_region_name',
            'start', 'strand', 'variant_class'
        )
    )

    # Sort columns
    col_order = [
        'locus_GRCh38', 'chrom_b38', 'pos_b38',
        'chrom_b37', 'pos_b37',
        'ref', 'alt', 'allele_type', 'vep', 'rsid', 'af', 'cadd', 'filters'
    ]

    # Repartition and write parquet file
    (
        ht
        .select(*col_order)
        .to_spark(flatten=False)
        .coalesce(OUT_PARTITIONS)
        .write.mode('overwrite').parquet(out_parquet)
    )
import hail as hl

ht = hl.read_table(
    'gs://hail-datasets/hail-data/gerp_scores.GRCh37.liftover.ht')

b37 = hl.get_reference('GRCh37')
b37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz',
                 'GRCh38')

ht = ht.annotate(liftover_locus=hl.liftover(ht.locus, 'GRCh38'))
ht = ht.filter(hl.is_defined(ht.liftover_locus), keep=True)
ht = ht.key_by(ht.liftover_locus)
ht = ht.drop('locus')
ht = ht.rename({'liftover_locus': 'locus'})
ht.describe()
ht.write('gs://hail-datasets/hail-data/gerp_scores.GRCh38.liftover.ht',
         overwrite=True)
def main():

    # Args (global)
    chain_file = 'gs://hail-common/references/grch37_to_grch38.over.chain.gz'
    inf = 'gs://genetics-portal-raw/uk_biobank_sumstats/variant_sitelist/ukbiobank_neale_saige_sitelist.190321.tsv'
    in_ensembl = 'gs://genetics-portal-raw/ensembl_grch37_r95/homo_sapiens-chr*.vcf.*.gz'
    out_parquet = 'gs://genetics-portal-raw/uk_biobank_sumstats/variant_sitelist/ukbiobank_neale_saige_sitelist.190321.annotated.parquet'

    # # Args (local)
    # chain_file = 'input_data/grch37_to_grch38.over.chain.gz'
    # inf = 'ukbiobank_neale_saige_sitelist.head100k.tsv'
    # in_ensembl = 'input_data/homo_sapiens-chr1.head.vcf'
    # out_parquet = 'ukbiobank_neale_saige_sitelist.annotated.parquet'

    #
    # Load sitelist ------------------------------------------------------------
    #

    # Load data
    ht = hl.import_table(inf,
                         no_header=True,
                         min_partitions=128,
                         types={'f0': 'str', 'f1': 'int32', 'f2': 'str', 'f3': 'str'})

    # Rename columns
    ht = ht.rename({'f0': 'chrom_b37', 'f1': 'pos_b37', 'f2': 'ref', 'f3': 'alt'})

    # Create locus and allele
    ht = ht.annotate(locus=hl.locus(ht.chrom_b37, ht.pos_b37, 'GRCh37'),
                     alleles=hl.array([ht.ref, ht.alt])).key_by('locus', 'alleles')

    #
    # Do liftover --------------------------------------------------------------
    #

    # Add chain file
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(chain_file, rg38)

    # Liftover
    ht = ht.annotate(locus_GRCh38=hl.liftover(ht.locus, 'GRCh38'))

    # Convert to spark
    df = (ht.to_spark()
          .withColumnRenamed('locus_GRCh38.contig', 'chrom_b38')
          .withColumnRenamed('locus_GRCh38.position', 'pos_b38')
          .drop('locus.contig', 'locus.position', 'alleles'))

    #
    # Annotate with rsids ------------------------------------------------------
    #

    # Load ensembl
    ensembl = load_ensembl_vcf(in_ensembl)

    # Join
    df = df.join(ensembl, on=['chrom_b37', 'pos_b37', 'ref', 'alt'], how='left')

    #
    # Write output -------------------------------------------------------------
    #

    # Write
    (df.select('chrom_b37', 'pos_b37', 'chrom_b38', 'pos_b38', 'ref', 'alt', 'rsid')
       .write.parquet(out_parquet, mode='overwrite'))

    return 0
def flip_base(base: str) -> str:
    """
    Returns the complement of a base

    :param str base: Base to be flipped
    :return: Complement of input base
    :rtype: str
    """
    return (hl.switch(base)
            .when('A', 'T')
            .when('T', 'A')
            .when('G', 'C')
            .when('C', 'G')
            .default(base))


mpc_ht = hl.import_table(MPC_SCORE, impute=True)
mpc_ht = mpc_ht.annotate(locus=hl.locus(contig=mpc_ht.chrom, pos=mpc_ht.pos),
                         alleles=[mpc_ht.ref, mpc_ht.alt])
mpc_ht = mpc_ht.key_by(mpc_ht.locus, mpc_ht.alleles).select('MPC')

mpc_ht = mpc_ht.annotate(
    new_locus=hl.liftover(mpc_ht.locus, 'GRCh38', include_strand=True))
mpc_ht = mpc_ht.filter(hl.is_defined(mpc_ht.new_locus))
mpc_ht = mpc_ht.annotate(new_alleles=hl.cond(
    mpc_ht.new_locus.is_negative_strand,
    [flip_base(mpc_ht.alleles[0]), flip_base(mpc_ht.alleles[1])],
    mpc_ht.alleles))
mpc_ht = mpc_ht.key_by(locus=mpc_ht.new_locus.result,
                       alleles=mpc_ht.new_alleles)

# Write the result to file.
mpc_ht.write(
    'gs://raw_data_bipolar_dalio_w1_w2/inputs/fordist_constraint_official_mpc_values_v2_GRCh38.ht',
    overwrite=True)
def flip_base(base: str) -> str:
    """
    Returns the complement of a base

    :param str base: Base to be flipped
    :return: Complement of input base
    :rtype: str
    """
    return (hl.switch(base)
            .when('A', 'T')
            .when('T', 'A')
            .when('G', 'C')
            .when('C', 'G')
            .default(base))


ht_bsc = hl.import_table(BSC_COUNTS, impute=True)
ht_bsc = ht_bsc.annotate(locus=hl.locus(contig=ht_bsc.chrom, pos=ht_bsc.pos),
                         alleles=[ht_bsc.ref, ht_bsc.alt])
ht_bsc = ht_bsc.key_by(ht_bsc.locus, ht_bsc.alleles)

ht_bsc = ht_bsc.annotate(
    new_locus=hl.liftover(ht_bsc.locus, 'GRCh38', include_strand=True))
ht_bsc = ht_bsc.filter(hl.is_defined(ht_bsc.new_locus))
ht_bsc = ht_bsc.annotate(new_alleles=hl.cond(
    ht_bsc.new_locus.is_negative_strand,
    [flip_base(ht_bsc.alleles[0]), flip_base(ht_bsc.alleles[1])],
    ht_bsc.alleles))
ht_bsc = ht_bsc.key_by(locus=ht_bsc.new_locus.result,
                       alleles=ht_bsc.new_alleles)

# Write the result to file.
ht_bsc.write(
    'gs://raw_data_bipolar_dalio_w1_w2/inputs/BSC_MAC5_counts_GRCh38.ht',
    overwrite=True)
def liftover_intervals(t: hl.Table, keep_missing_interval: bool = False) -> hl.Table:
    """
    Liftover locus in intervals from one coordinate system (hg37) to another (hg38)

    # Example input table description
    #
    # ----------------------------------------
    # Global fields:
    #     None
    # ----------------------------------------
    # Row fields:
    #     'interval': interval<locus<GRCh37>>
    # ----------------------------------------
    # Key: ['interval']
    # ----------------------------------------

    :param t: Table of intervals on GRCh37
    :param keep_missing_interval: If True, keep missing (non-lifted) intervals in the output Table.
    :return: Table with intervals lifted over GRCh38 added.
    """
    rg37 = hl.get_reference("GRCh37")
    rg38 = hl.get_reference("GRCh38")

    if not rg37.has_liftover("GRCh38"):
        rg37.add_liftover(
            f'{nfs_dir}/resources/liftover/grch37_to_grch38.over.chain.gz', rg38)

    t = t.annotate(
        start=hl.liftover(t.interval.start, "GRCh38"),
        end=hl.liftover(t.interval.end, "GRCh38"),
    )

    t = t.filter((t.start.contig == "chr" + t.interval.start.contig)
                 & (t.end.contig == "chr" + t.interval.end.contig))

    t = t.key_by()

    t = (t.select(interval=hl.locus_interval(t.start.contig,
                                             t.start.position,
                                             t.end.position,
                                             reference_genome=rg38,
                                             invalid_missing=True),
                  interval_hg37=t.interval))

    # bad intervals
    missing = t.aggregate(hl.agg.counter(~hl.is_defined(t.interval)))
    logger.info(
        f"Number of missing intervals: {missing[True]} out of {t.count()}...")

    # update globals annotations
    global_ann_expr = {
        'date': current_date(),
        'reference_genome': 'GRCh38',
        'was_lifted': True
    }
    t = t.annotate_globals(**global_ann_expr)

    if not keep_missing_interval:
        logger.info(f"Filtering out {missing[True]} missing intervals...")
        t = t.filter(hl.is_defined(t.interval), keep=True)

    return t.key_by("interval")
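# Usage sketch (illustrative, not from the original source): running
# liftover_intervals above on an interval Table imported from a BED file.
# The input path is hypothetical.
t = hl.import_locus_intervals('my_intervals.b37.bed', reference_genome='GRCh37')
lifted = liftover_intervals(t, keep_missing_interval=False)
lifted.describe()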
                                         'Name': 'gene_id',
                                         'x': 'TPM'
                                     })
    mt = mt_counts.annotate_entries(
        TPM=mt_tpm[mt_counts.gene_id, mt_counts.sample_id]['TPM'])
    mt = mt.annotate_rows(**ht_genes[mt.gene_id])
    mt = mt.annotate_cols(**ht_sample_attributes[mt.sample_id])

    if reference_genome == 'GRCh38':
        b37 = hl.get_reference('GRCh37')
        b37.add_liftover(
            'gs://hail-common/references/grch37_to_grch38.over.chain.gz',
            'GRCh38')
        mt = mt.annotate_rows(
            gene_interval=hl.liftover(mt['gene_interval'], 'GRCh38'))
        mt = mt.filter_rows(hl.is_defined(mt['gene_interval']))

    mt = mt.repartition(20)

elif dataset == 'transcripts':
    name = 'GTEx_transcript_expression'
    ht_transcripts = import_gtf(path=EXTRACT_BUCKET + 'GTEx/v7/GTEx_transcripts.v7.GRCh37.gtf.bgz',
                                reference_genome='GRCh37')
    ht_transcripts = ht_transcripts.filter(
        ht_transcripts['feature'] == 'transcript')
    ht_transcripts = ht_transcripts.select(
        'transcript_id', 'strand', 'transcript_name', 'transcript_type',