def test_reference_genome_liftover(self):
    """Lift loci and intervals between GRCh37 and GRCh38 with the chr20 chains.

    The liftovers are registered on the genome objects obtained from
    ``hl.get_reference`` and this test begins by asserting that none is
    registered. Removal is therefore scheduled with ``addCleanup`` right
    after each ``add_liftover`` so that a failing assertion cannot leave
    a liftover behind.
    """
    grch37 = hl.get_reference('GRCh37')
    grch38 = hl.get_reference('GRCh38')

    self.assertTrue(not grch37.has_liftover('GRCh38') and not grch38.has_liftover('GRCh37'))

    grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
    # Registered only after a successful add, so cleanup never removes a
    # liftover that was not installed.
    self.addCleanup(grch37.remove_liftover, 'GRCh38')
    grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')
    self.addCleanup(grch38.remove_liftover, 'GRCh37')

    self.assertTrue(grch37.has_liftover('GRCh38') and grch38.has_liftover('GRCh37'))

    # Lifting 37 -> 38 -> 37 must reproduce the original locus.
    ds = hl.import_vcf(resource('sample.vcf'))
    t = ds.annotate_rows(liftover=hl.liftover(hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
    self.assertTrue(t.all(t.locus == t.liftover))

    # Expected GRCh38 locus for each GRCh37 locus; missing where no mapping is expected.
    null_locus = hl.null(hl.tlocus('GRCh38'))

    rows = [
        {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 60001, 'GRCh37'), 'l38': hl.locus('chr20', 79360, 'GRCh38')},
        {'l37': hl.locus('20', 278686, 'GRCh37'), 'l38': hl.locus('chr20', 298045, 'GRCh38')},
        {'l37': hl.locus('20', 278687, 'GRCh37'), 'l38': hl.locus('chr20', 298046, 'GRCh38')},
        {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278691, 'GRCh37'), 'l38': hl.locus('chr20', 298047, 'GRCh38')},
        {'l37': hl.locus('20', 37007586, 'GRCh37'), 'l38': hl.locus('chr12', 32563117, 'GRCh38')},
        {'l37': hl.locus('20', 62965520, 'GRCh37'), 'l38': hl.locus('chr20', 64334167, 'GRCh38')},
        {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus}
    ]
    schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
    t = hl.Table.parallelize(rows, schema)
    self.assertTrue(t.all(hl.cond(hl.is_defined(t.l38),
                                  hl.liftover(t.l37, 'GRCh38') == t.l38,
                                  hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

    t = t.filter(hl.is_defined(t.l38))
    self.assertTrue(t.count() == 6)

    t = t.key_by('l38')
    t.count()
    self.assertTrue(list(t.key) == ['l38'])

    # Same check for intervals.
    null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
    rows = [
        {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'), 'i38': null_locus_interval},
        {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
         'i38': hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')}
    ]
    schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)), i38=hl.tinterval(hl.tlocus(grch38)))
    t = hl.Table.parallelize(rows, schema)
    self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))
def test_reference_genome(self):
    """Check ``ReferenceGenome`` accessors on GRCh37 and on a hand-built genome.

    The hand-built genome is written to ``/tmp/my_gr.json`` as the last step.
    """
    grch37 = hl.get_reference('GRCh37')
    self.assertEqual(grch37.name, "GRCh37")
    self.assertEqual(grch37.contigs[0], "1")
    self.assertListEqual(grch37.x_contigs, ["X"])
    self.assertListEqual(grch37.y_contigs, ["Y"])
    self.assertListEqual(grch37.mt_contigs, ["MT"])
    first_par = grch37.par[0]
    self.assertEqual(first_par, hl.eval(hl.parse_locus_interval("X:60001-2699521")))
    self.assertEqual(grch37.contig_length("1"), 249250621)

    # Constructor inputs for a small custom genome.
    custom_name = "test"
    custom_contigs = ["1", "X", "Y", "MT"]
    custom_lengths = {"1": 10000, "X": 2000, "Y": 4000, "MT": 1000}
    custom_x = ["X"]
    custom_y = ["Y"]
    custom_mt = ["MT"]
    custom_par = [("X", 5, 1000)]

    custom = ReferenceGenome(custom_name, custom_contigs, custom_lengths,
                             custom_x, custom_y, custom_mt, custom_par)

    # Every accessor must hand back what was passed to the constructor.
    self.assertEqual(custom.name, custom_name)
    self.assertListEqual(custom.contigs, custom_contigs)
    self.assertListEqual(custom.x_contigs, custom_x)
    self.assertListEqual(custom.y_contigs, custom_y)
    self.assertListEqual(custom.mt_contigs, custom_mt)
    self.assertEqual(custom.par, [hl.eval(hl.parse_locus_interval("X:5-1000", custom))])
    self.assertEqual(custom.contig_length("1"), 10000)
    self.assertDictEqual(custom.lengths, custom_lengths)
    custom.write("/tmp/my_gr.json")
def test_import_vcf(self):
    """Import a VCF against GRCh38 with contig ``22`` recoded to ``chr22``.

    Checks that every row lands on the recoded contig and that the locus
    field is typed with the reference genome requested at import time.
    """
    vcf = hl.split_multi_hts(
        hl.import_vcf(resource('sample2.vcf'),
                      reference_genome=hl.get_reference('GRCh38'),
                      contig_recoding={"22": "chr22"}))

    vcf_table = vcf.rows()
    self.assertTrue(vcf_table.all(vcf_table.locus.contig == "chr22"))
    # assertEqual rather than assertTrue: assertTrue(a, b) uses b as the
    # failure message and passes for any truthy a, so the types were never
    # compared. The import above requests GRCh38, hence the expected type.
    self.assertEqual(vcf.locus.dtype, hl.tlocus('GRCh38'))
def test_liftover_strand(self):
    """Check ``hl.liftover(..., include_strand=True)`` for a locus and an interval.

    The GRCh37 -> GRCh38 liftover is removed through ``addCleanup`` so that
    it is taken off the reference genome even when an assertion fails.
    """
    grch37 = hl.get_reference('GRCh37')
    grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
    self.addCleanup(grch37.remove_liftover, 'GRCh38')

    # Locus expected on the positive strand.
    self.assertEqual(
        hl.eval(hl.liftover(hl.locus('20', 60001, 'GRCh37'), 'GRCh38', include_strand=True)),
        hl.eval(hl.struct(result=hl.locus('chr20', 79360, 'GRCh38'),
                          is_negative_strand=False)))

    # Interval expected on the negative strand.
    self.assertEqual(
        hl.eval(hl.liftover(hl.locus_interval('20', 37007582, 37007586, True, True, 'GRCh37'),
                            'GRCh38', include_strand=True)),
        hl.eval(hl.struct(result=hl.locus_interval('chr12', 32563117, 32563121, True, True, 'GRCh38'),
                          is_negative_strand=True)))
def matrix_irs(self):
    """Return a list of matrix IR nodes built over the test resources.

    The BGEN resource is indexed first because one of the nodes reads it
    through ``ir.MatrixBGENReader``.
    """
    bgen_path = resource('example.8bits.bgen')
    hl.index_bgen(bgen_path,
                  reference_genome=hl.get_reference('GRCh37'),
                  contig_recoding={'01': '1'})

    def x20():
        # A fresh ``MakeStruct`` with a single int64 field ``x`` per use.
        return ir.MakeStruct([('x', ir.I64(20))])

    collect_agg = ir.MakeStruct([('x', ir.ApplyAggOp('Collect', [], None, [ir.I32(0)]))])

    native_mt = ir.MatrixRead(
        ir.MatrixNativeReader(resource('backward_compatability/1.0.0/matrix_table/0.hmt')),
        False, False)
    native_ht = ir.TableRead(
        ir.TableNativeReader(resource('backward_compatability/1.0.0/table/0.ht')),
        False)
    range_mt = ir.MatrixRead(ir.MatrixRangeReader(1, 1, 10))

    return [
        ir.MatrixRepartition(range_mt, 100, ir.RepartitionStrategy.SHUFFLE),
        ir.MatrixUnionRows(range_mt, range_mt),
        ir.MatrixDistinctByRow(range_mt),
        ir.MatrixRowsHead(native_mt, 5),
        ir.CastTableToMatrix(
            ir.CastMatrixToTable(native_mt, '__entries', '__cols'),
            '__entries',
            '__cols',
            []),
        ir.MatrixAggregateRowsByKey(native_mt, collect_agg, collect_agg),
        ir.MatrixAggregateColsByKey(native_mt, collect_agg, collect_agg),
        native_mt,
        range_mt,
        ir.MatrixRead(ir.MatrixVCFReader(
            resource('sample.vcf'), ['GT'], hl.tfloat64,
            None, None, None, None,
            False, True, False, True,
            None, None, None)),
        ir.MatrixRead(ir.MatrixBGENReader(bgen_path, None, {}, 10, 1, None)),
        ir.MatrixFilterRows(native_mt, ir.FalseIR()),
        ir.MatrixFilterCols(native_mt, ir.FalseIR()),
        ir.MatrixFilterEntries(native_mt, ir.FalseIR()),
        ir.MatrixChooseCols(native_mt, [1, 0]),
        ir.MatrixMapCols(native_mt, x20(), ['x']),
        ir.MatrixKeyRowsBy(native_mt, ['row_i64'], False),
        ir.MatrixMapRows(ir.MatrixKeyRowsBy(native_mt, []), x20()),
        ir.MatrixMapEntries(native_mt, x20()),
        ir.MatrixMapGlobals(native_mt, x20()),
        ir.MatrixCollectColsByKey(native_mt),
        ir.MatrixExplodeRows(native_mt, ['row_aset']),
        ir.MatrixExplodeCols(native_mt, ['col_aset']),
        ir.MatrixAnnotateRowsTable(native_mt, native_ht, '__foo'),
        ir.MatrixAnnotateColsTable(native_mt, native_ht, '__foo'),
        ir.MatrixToMatrixApply(
            native_mt,
            {'name': 'MatrixFilterPartitions', 'parts': [0], 'keep': True}),
    ]
def test_import_plink_contig_recoding_w_reference(self):
    """Export a GRCh38 dataset to PLINK and re-import it against GRCh37.

    ``chr22`` is recoded back to ``22`` on import; the test checks the
    contig names, the row count and the reference genome of the locus type.
    """
    vcf = hl.split_multi_hts(
        hl.import_vcf(resource('sample2.vcf'),
                      reference_genome=hl.get_reference('GRCh38'),
                      contig_recoding={"22": "chr22"}))

    hl.export_plink(vcf, '/tmp/sample_plink')

    bfile = '/tmp/sample_plink'
    plink = hl.import_plink(
        bfile + '.bed', bfile + '.bim', bfile + '.fam',
        a2_reference=True,
        contig_recoding={'chr22': '22'},
        reference_genome='GRCh37').rows()

    self.assertTrue(plink.all(plink.locus.contig == "22"))
    self.assertEqual(vcf.count_rows(), plink.count())
    # assertEqual rather than assertTrue: assertTrue(a, b) uses b as the
    # failure message and passes for any truthy a, so nothing was compared.
    self.assertEqual(plink.locus.dtype, hl.tlocus('GRCh37'))
def test_classes(self):
    """Exercise the ``Locus``, ``Call`` and ``ReferenceGenome`` Python classes."""

    def expect(flag, value):
        # Keep the truthiness semantics of assertTrue / assertFalse.
        if flag:
            self.assertTrue(value)
        else:
            self.assertFalse(value)

    genotype_predicates = ('is_hom_ref', 'is_het', 'is_hom_var',
                           'is_non_ref', 'is_het_non_ref', 'is_het_ref')

    def check_call(call, alleles, ploidy, phased, n_alt, one_hot, true_predicates):
        # Assertions run in one fixed order for every call under test.
        self.assertEqual(call.alleles, alleles)
        self.assertEqual(call.ploidy, ploidy)
        expect(phased, call.phased)
        expect(ploidy == 1, call.is_haploid())
        expect(ploidy == 2, call.is_diploid())
        self.assertEqual(call.n_alt_alleles(), n_alt)
        self.assertTrue(call.one_hot_alleles(len(one_hot)) == one_hot)
        for predicate in genotype_predicates:
            expect(predicate in true_predicates, getattr(call, predicate)())

    # --- Locus ---
    l = Locus.parse('1:100')
    for expected_locus in (Locus('1', 100), Locus(1, 100)):
        self.assertEqual(l, expected_locus)
    self.assertEqual(l.reference_genome, hl.default_reference())

    # --- Call ---
    c_hom_ref = Call([0, 0])
    check_call(c_hom_ref, [0, 0], 2, False, 0, [2, 0], {'is_hom_ref'})
    self.assertTrue(c_hom_ref.unphased_diploid_gt_index() == 0)

    c_het_phased = Call([1, 0], phased=True)
    check_call(c_het_phased, [1, 0], 2, True, 1, [1, 1],
               {'is_het', 'is_non_ref', 'is_het_ref'})

    c_hom_var = Call([1, 1])
    check_call(c_hom_var, [1, 1], 2, False, 2, [0, 2],
               {'is_hom_var', 'is_non_ref'})
    self.assertTrue(c_hom_var.unphased_diploid_gt_index() == 2)

    c_haploid = Call([2], phased=True)
    check_call(c_haploid, [2], 1, True, 1, [0, 0, 1],
               {'is_hom_var', 'is_non_ref'})

    c_zeroploid = Call([])
    check_call(c_zeroploid, [], 0, False, 0, [0, 0, 0], set())

    self.assertRaisesRegex(
        NotImplementedError,
        "Calls with greater than 2 alleles are not supported.",
        Call,
        [1, 1, 1, 1])

    # --- Built-in reference genome ---
    grch37 = hl.get_reference('GRCh37')
    self.assertEqual(grch37.name, "GRCh37")
    self.assertEqual(grch37.contigs[0], "1")
    self.assertListEqual(grch37.x_contigs, ["X"])
    self.assertListEqual(grch37.y_contigs, ["Y"])
    self.assertListEqual(grch37.mt_contigs, ["MT"])
    self.assertEqual(grch37.par[0], hl.parse_locus_interval("X:60001-2699521").value)
    self.assertEqual(grch37.contig_length("1"), 249250621)

    # --- Hand-built reference genome ---
    custom_name = "test"
    custom_contigs = ["1", "X", "Y", "MT"]
    custom_lengths = {"1": 10000, "X": 2000, "Y": 4000, "MT": 1000}
    custom_x = ["X"]
    custom_y = ["Y"]
    custom_mt = ["MT"]
    custom_par = [("X", 5, 1000)]

    custom = ReferenceGenome(custom_name, custom_contigs, custom_lengths,
                             custom_x, custom_y, custom_mt, custom_par)
    self.assertEqual(custom.name, custom_name)
    self.assertListEqual(custom.contigs, custom_contigs)
    self.assertListEqual(custom.x_contigs, custom_x)
    self.assertListEqual(custom.y_contigs, custom_y)
    self.assertListEqual(custom.mt_contigs, custom_mt)
    self.assertEqual(custom.par, [hl.parse_locus_interval("X:5-1000", custom).value])
    self.assertEqual(custom.contig_length("1"), 10000)
    self.assertDictEqual(custom.lengths, custom_lengths)
    custom.write("/tmp/my_gr.json")

    # --- Reference genome read from JSON (no sequence attached) ---
    from_json = ReferenceGenome.read(resource("fake_ref_genome.json"))
    self.assertEqual(from_json.name, "my_reference_genome")
    self.assertFalse(from_json.has_sequence())

    # --- Reference genome built from a FASTA file ---
    from_fasta = ReferenceGenome.from_fasta_file(
        "test_rg",
        resource("fake_reference.fasta"),
        resource("fake_reference.fasta.fai"),
        mt_contigs=["b", "c"],
        x_contigs=["a"])
    self.assertTrue(from_fasta.has_sequence())
    self.assertTrue(from_fasta.x_contigs == ["a"])

    t = hl.import_table(resource("fake_reference.tsv"), impute=True)
    self.assertTrue(t.all(hl.get_sequence(from_fasta, t.contig, t.pos) == t.base))

    l = hl.locus("a", 7, from_fasta)
    context = l.sequence_context(before=3, after=3).value
    self.assertTrue(context == "TTTCGAA")
def test_matrix_ir_parses(self):
    """Render each matrix IR node with ``str`` and feed it to the JVM IR parser.

    A failure is re-raised as ``ValueError`` carrying the rendered IR, so
    the offending node is visible in the test output.
    """
    bgen_path = resource('example.8bits.bgen')
    hl.index_bgen(bgen_path,
                  reference_genome=hl.get_reference('GRCh37'),
                  contig_recoding={'01': '1'})

    def x20():
        # A fresh ``MakeStruct`` with a single int64 field ``x`` per use.
        return ir.MakeStruct([('x', ir.I64(20))])

    collect_agg = ir.MakeStruct([('x', ir.ApplyAggOp('Collect', [], None, [ir.I32(0)]))])

    native_mt = ir.MatrixRead(
        ir.MatrixNativeReader(resource('backward_compatability/1.0.0/matrix_table/0.hmt')),
        False, False)
    native_ht = ir.TableRead(
        ir.TableNativeReader(resource('backward_compatability/1.0.0/table/0.ht')),
        False)
    range_mt = ir.MatrixRead(ir.MatrixRangeReader(1, 1, 10))

    nodes = [
        ir.MatrixRepartition(range_mt, 100, ir.RepartitionStrategy.SHUFFLE),
        ir.MatrixUnionRows(range_mt, range_mt),
        ir.MatrixDistinctByRow(range_mt),
        ir.CastTableToMatrix(
            ir.CastMatrixToTable(native_mt, '__entries', '__cols'),
            '__entries',
            '__cols',
            []),
        ir.MatrixAggregateRowsByKey(native_mt, collect_agg, collect_agg),
        ir.MatrixAggregateColsByKey(native_mt, collect_agg, collect_agg),
        native_mt,
        range_mt,
        ir.MatrixRead(ir.MatrixVCFReader(
            resource('sample.vcf'), ['GT'], hl.tfloat64,
            None, None, None, None,
            False, True, False, True,
            None)),
        ir.MatrixRead(ir.MatrixBGENReader(bgen_path, None, {}, 10, 1, None)),
        ir.MatrixFilterRows(native_mt, ir.FalseIR()),
        ir.MatrixFilterCols(native_mt, ir.FalseIR()),
        ir.MatrixFilterEntries(native_mt, ir.FalseIR()),
        ir.MatrixChooseCols(native_mt, [1, 0]),
        ir.MatrixMapCols(native_mt, x20(), ['x']),
        ir.MatrixKeyRowsBy(native_mt, ['row_i64'], False),
        ir.MatrixMapRows(ir.MatrixKeyRowsBy(native_mt, []), x20()),
        ir.MatrixMapEntries(native_mt, x20()),
        ir.MatrixMapGlobals(native_mt, x20()),
        ir.TableToMatrixTable(native_ht, ['f32', 'i64'], ['m', 'astruct'], ['aset'], ['mset'], 100),
        ir.MatrixCollectColsByKey(native_mt),
        ir.MatrixExplodeRows(native_mt, ['row_aset']),
        ir.MatrixExplodeCols(native_mt, ['col_aset']),
        ir.MatrixAnnotateRowsTable(native_mt, native_ht, '__foo'),
        ir.MatrixAnnotateColsTable(native_mt, native_ht, '__foo'),
        ir.MatrixToMatrixApply(
            native_mt,
            {'name': 'MatrixFilterPartitions', 'parts': [0], 'keep': True}),
    ]

    for node in nodes:
        try:
            Env.hail().expr.ir.IRParser.parse_matrix_ir(str(node))
        except Exception as err:
            # Surface the rendered IR that could not be parsed.
            raise ValueError(str(node)) from err
import json import re from hail.typecheck import typecheck_method, sequenceof, dictof, oneof, \ sized_tupleof, nullable, transformed, lazy from hail.utils.misc import wrap_to_list from hail.utils.java import Env import hail as hl rg_type = lazy() reference_genome_type = oneof( transformed((str, lambda x: hl.get_reference(x))), rg_type) class ReferenceGenome(object): """An object that represents a `reference genome <https://en.wikipedia.org/wiki/Reference_genome>`__. Examples -------- >>> contigs = ["1", "X", "Y", "MT"] >>> lengths = {"1": 249250621, "X": 155270560, "Y": 59373566, "MT": 16569} >>> par = [("X", 60001, 2699521)] >>> my_ref = hl.ReferenceGenome("my_ref", contigs, lengths, "X", "Y", "MT", par) Notes ----- Hail comes with predefined reference genomes (case sensitive!): - GRCh37, Genome Reference Consortium Human Build 37 - GRCh38, Genome Reference Consortium Human Build 38 - GRCm38, Genome Reference Consortium Mouse Build 38
print("arguments", sys.argv)

# Positional arguments passed in by the cloudspan lambda.
bucket_name = sys.argv[1]
pipeline_run_vcf = sys.argv[2]
cohort_prefix = sys.argv[3]
reference_build = sys.argv[4]
giab_bucket = sys.argv[5]

# Any of these spellings selects GRCh37; everything else falls back to GRCh38.
build_37 = ["GRCh37", "37", "hg19"]
reference = hl.get_reference('GRCh37' if reference_build in build_37 else 'GRCh38')

# Load the GiAB VCF and split multi-allelic sites.
# NOTE(review): the file name below is the GRCh38 truth set regardless of
# `reference_build` -- confirm this is intended for GRCh37 runs.
giab_gs_path = '{}/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.bgz'.format(
    giab_bucket)
giab_ds = split_multi(hl.import_vcf(path=giab_gs_path, reference_genome=reference))
giab_ds.describe()

# Load the pipeline-run VCF and split multi-allelic sites.
pipeline_gs_path = '{}{}'.format(bucket_name, pipeline_run_vcf)
pipeline_ds = split_multi(hl.import_vcf(path=pipeline_gs_path, reference_genome=reference))
import hail as hl

# Input table keyed on GRCh37 loci, the chain file used to lift it over,
# and the destination of the GRCh38-keyed result.
GRCH37_TABLE = 'gs://hail-datasets/hail-data/gerp_scores.GRCh37.liftover.ht'
CHAIN_FILE = 'gs://hail-common/references/grch37_to_grch38.over.chain.gz'
GRCH38_TABLE = 'gs://hail-datasets/hail-data/gerp_scores.GRCh38.liftover.ht'

ht = hl.read_table(GRCH37_TABLE)

b37 = hl.get_reference('GRCh37')
b37.add_liftover(CHAIN_FILE, 'GRCh38')

# Annotate each row with its lifted-over locus and keep only rows where
# that locus is defined.
ht = ht.annotate(liftover_locus=hl.liftover(ht.locus, 'GRCh38'))
ht = ht.filter(hl.is_defined(ht.liftover_locus), keep=True)

# Re-key on the lifted locus, then let it take over the `locus` field name.
ht = ht.key_by(ht.liftover_locus)
ht = ht.drop('locus').rename({'liftover_locus': 'locus'})

ht.describe()
ht.write(GRCH38_TABLE, overwrite=True)
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False, min_partitions=None) -> hl.Table:
    """Import a GTF file.

    The GTF file format is identical to the GFF version 2 file format,
    and so this function can be used to import GFF version 2 files as
    well.

    See https://www.ensembl.org/info/website/upload/gff.html for more
    details on the GTF/GFF2 file format.

    The :class:`.Table` returned by this function will be keyed by the
    ``interval`` row field and will include the following row fields:

    .. code-block:: text

        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'interval': interval<>

    There will also be corresponding fields for every tag found in the
    attribute field of the GTF file.

    Note
    ----

    This function will return an ``interval`` field of type :class:`.tinterval`
    constructed from the ``seqname``, ``start``, and ``end`` fields in the
    GTF file. This interval is inclusive of both the start and end positions
    in the GTF file.

    If the ``reference_genome`` parameter is specified, the start and end
    points of the ``interval`` field will be of type :class:`.tlocus`.
    Otherwise, the start and end points of the ``interval`` field will be of
    type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
    ``position`` (type :class:`.tint32`).

    Furthermore, if the ``reference_genome`` parameter is specified and
    ``skip_invalid_contigs`` is ``True``, this import function will skip
    lines in the GTF where ``seqname`` is not consistent with the reference
    genome specified.

    Example
    -------

    >>> ht = hl.experimental.import_gtf('data/test.gtf',
    ...                                 reference_genome='GRCh37',
    ...                                 skip_invalid_contigs=True)

    >>> ht.describe()  # doctest: +NOTEST
    ----------------------------------------
    Global fields:
    None
    ----------------------------------------
    Row fields:
        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'gene_type': str
        'exon_id': str
        'havana_transcript': str
        'level': str
        'transcript_name': str
        'gene_status': str
        'gene_id': str
        'transcript_type': str
        'tag': str
        'transcript_status': str
        'gene_name': str
        'transcript_id': str
        'exon_number': str
        'havana_gene': str
        'interval': interval<locus<GRCh37>>
    ----------------------------------------
    Key: ['interval']
    ----------------------------------------

    Parameters
    ----------

    path : :obj:`str`
        File to import.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use. A name is resolved with
        :func:`.get_reference` before the contig naming convention is chosen.
    skip_invalid_contigs : :obj:`bool`
        If ``True`` and `reference_genome` is not ``None``, skip lines where
        ``seqname`` is not consistent with the reference genome.
    min_partitions : :obj:`int` or :obj:`None`
        Minimum number of partitions (passed to import_table).

    Returns
    -------
    :class:`.Table`
    """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint,
                                'f4': hl.tint,
                                'f5': hl.tfloat,
                                'f7': hl.tint},
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({'f0': 'seqname',
                    'f1': 'source',
                    'f2': 'feature',
                    'f3': 'start',
                    'f4': 'end',
                    'f5': 'score',
                    'f6': 'strand',
                    'f7': 'frame',
                    'f8': 'attribute'})

    # Turn the raw attribute column into a dict of tag -> value.
    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    # Collect every tag seen in the file so each one can become a row field.
    attributes = ht.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x), ht['attribute'].keys()))

    ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x),
                                          ht['attribute'][x])
                         for x in attributes if x})

    if reference_genome:
        # Resolve a name to a ReferenceGenome once. Comparing the raw
        # argument with 'GRCh37' is false for a ReferenceGenome object, which
        # sent GRCh37 inputs down the "add a chr prefix" branch below.
        if isinstance(reference_genome, str):
            reference_genome = hl.get_reference(reference_genome)

        if reference_genome.name == 'GRCh37':
            # GRCh37 contigs carry no 'chr' prefix.
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            # Otherwise add the 'chr' prefix where missing, leaving HLA
            # contigs without it.
            ht = ht.annotate(seqname=hl.case()
                                       .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                                       .when(ht['seqname'].startswith('chrHLA'), ht['seqname'].replace('^chr', ''))
                                       .when(ht['seqname'].startswith('chr'), ht['seqname'])
                                       .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(reference_genome.contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(interval=hl.locus_interval(ht['seqname'],
                                                     ht['start'],
                                                     ht['end'],
                                                     includes_start=True,
                                                     includes_end=True,
                                                     reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(hl.struct(seqname=ht['seqname'], position=ht['start']),
                                               hl.struct(seqname=ht['seqname'], position=ht['end']),
                                               includes_start=True,
                                               includes_end=True))

    ht = ht.key_by('interval')

    return ht
def new_combiner(
    *,
    output_path: str,
    temp_path: str,
    save_path: Optional[str] = None,
    gvcf_paths: Optional[List[str]] = None,
    vds_paths: Optional[List[str]] = None,
    vds_sample_counts: Optional[List[int]] = None,
    intervals: Optional[List[Interval]] = None,
    import_interval_size: Optional[int] = None,
    use_genome_default_intervals: bool = False,
    use_exome_default_intervals: bool = False,
    gvcf_external_header: Optional[str] = None,
    gvcf_sample_names: Optional[List[str]] = None,
    gvcf_info_to_keep: Optional[Collection[str]] = None,
    gvcf_reference_entry_fields_to_keep: Optional[Collection[str]] = None,
    branch_factor: int = VariantDatasetCombiner.default_branch_factor,
    target_records: int = VariantDatasetCombiner.default_target_records,
    batch_size: int = VariantDatasetCombiner.default_gvcf_batch_size,
    reference_genome: Union[str, hl.ReferenceGenome] = 'default',
    contig_recoding: Optional[Dict[str, str]] = None,
    force: bool = False,
) -> VariantDatasetCombiner:
    """Create a :class:`VariantDatasetCombiner`, or resume one from a saved plan.

    The arguments are validated first. Then, unless ``force`` is set, a valid
    plan already present at ``save_path`` is loaded and returned with its
    ``branch_factor``, ``target_records`` and ``gvcf_batch_size`` replaced by
    the values passed here. When ``save_path`` is ``None`` a path under
    ``temp_path/combiner-plans`` is derived from a SHA-256 digest of the
    arguments, and the same resume check is applied to it.

    Exactly one source of GVCF import intervals is used, in this precedence:
    ``intervals``, ``import_interval_size``, ``use_genome_default_intervals``,
    ``use_exome_default_intervals``. Supplying more than one only emits a
    warning.

    Raises
    ------
    ValueError
        If both ``gvcf_paths`` and ``vds_paths`` are empty or ``None``; if
        ``vds_sample_counts`` is given with a length different from
        ``vds_paths``; if only one of ``gvcf_sample_names`` and
        ``gvcf_external_header`` is set; if ``gvcf_sample_names`` is given
        with a length different from ``gvcf_paths``; or if none of the four
        partitioning arguments is supplied.
    """
    if not (gvcf_paths or vds_paths):
        raise ValueError(
            "at least one of 'gvcf_paths' or 'vds_paths' must be nonempty")
    if gvcf_paths is None:
        gvcf_paths = []
    if vds_paths is None:
        vds_paths = []
    if vds_sample_counts is not None and len(vds_paths) != len(
            vds_sample_counts):
        raise ValueError(
            "'vds_paths' and 'vds_sample_counts' (if present) must have the same length "
            f'{len(vds_paths)} != {len(vds_sample_counts)}')
    # The two GVCF header arguments must be set together or not at all.
    if (gvcf_sample_names is None) != (gvcf_external_header is None):
        raise ValueError(
            "both 'gvcf_sample_names' and 'gvcf_external_header' must be set or unset"
        )
    if gvcf_sample_names is not None and len(gvcf_sample_names) != len(
            gvcf_paths):
        raise ValueError(
            "'gvcf_sample_names' and 'gvcf_paths' must have the same length "
            f'{len(gvcf_sample_names)} != {len(gvcf_paths)}')

    # Count how many of the mutually colliding partitioning arguments were given.
    n_partition_args = (int(intervals is not None) +
                        int(import_interval_size is not None) +
                        int(use_genome_default_intervals) +
                        int(use_exome_default_intervals))

    if n_partition_args == 0:
        raise ValueError(
            "'new_combiner': require one argument from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals' to choose GVCF partitioning"
        )

    def maybe_load_from_saved_path(
            save_path: str) -> Optional[VariantDatasetCombiner]:
        """Return the combiner stored at ``save_path``, or ``None``.

        ``None`` is returned when ``force`` is set, when nothing exists at
        ``save_path``, or when the file there cannot be loaded as a plan.
        """
        if force:
            return None
        fs = hl.current_backend().fs
        if fs.exists(save_path):
            try:
                combiner = load_combiner(save_path)
                warning(
                    f'found existing combiner plan at {save_path}, using it')
                # we overwrite these values as they are serialized, but not part of the
                # hash for an autogenerated name and we want users to be able to overwrite
                # these when resuming a combine (a common reason to need to resume a combine
                # is a failure due to branch factor being too large)
                combiner.branch_factor = branch_factor
                combiner.target_records = target_records
                combiner.gvcf_batch_size = batch_size
                return combiner
            except (ValueError, TypeError, OSError, KeyError):
                warning(
                    f'file exists at {save_path}, but it is not a valid combiner plan, overwriting'
                )
        return None

    # We do the first save_path check now after validating the arguments
    if save_path is not None:
        saved_combiner = maybe_load_from_saved_path(save_path)
        if saved_combiner is not None:
            return saved_combiner

    # NOTE(review): this warning is prefixed 'run_combiner' while the error
    # above uses 'new_combiner' -- confirm whether the difference is intended.
    if n_partition_args > 1:
        warning(
            "'run_combiner': multiple colliding arguments found from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals'."
            "\n The argument found first in the list in this warning will be used, and others ignored."
        )

    # Resolve the import intervals; the branch order is the precedence order.
    if intervals is not None:
        pass
    elif import_interval_size is not None:
        intervals = calculate_even_genome_partitioning(reference_genome,
                                                       import_interval_size)
    elif use_genome_default_intervals:
        size = VariantDatasetCombiner.default_genome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
    elif use_exome_default_intervals:
        size = VariantDatasetCombiner.default_exome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
    assert intervals is not None

    if isinstance(reference_genome, str):
        reference_genome = hl.get_reference(reference_genome)

    # When the caller did not choose the reference entry fields, take them
    # from the first VDS if there is one, otherwise from the first GVCF.
    if gvcf_reference_entry_fields_to_keep is None and vds_paths:
        vds = hl.vds.read_vds(vds_paths[0])
        gvcf_reference_entry_fields_to_keep = set(
            vds.reference_data.entry) - {'END'}
    elif gvcf_reference_entry_fields_to_keep is None and gvcf_paths:
        mt = hl.import_vcf(gvcf_paths[0],
                           force_bgz=True,
                           reference_genome=reference_genome)
        mt = mt.filter_rows(hl.is_defined(mt.info.END))
        gvcf_reference_entry_fields_to_keep = defined_entry_fields(
            mt, 100_000) - {'GT', 'PGT', 'PL'}

    if save_path is None:
        # Derive a plan file name from the arguments. The order of the
        # updates below determines the digest, so it must not be changed.
        sha = hashlib.sha256()
        sha.update(output_path.encode())
        sha.update(temp_path.encode())
        sha.update(str(reference_genome).encode())
        for path in vds_paths:
            sha.update(path.encode())
        for path in gvcf_paths:
            sha.update(path.encode())
        if gvcf_external_header is not None:
            sha.update(gvcf_external_header.encode())
        if gvcf_sample_names is not None:
            for name in gvcf_sample_names:
                sha.update(name.encode())
        if gvcf_info_to_keep is not None:
            for kept_info in sorted(gvcf_info_to_keep):
                sha.update(kept_info.encode())
        if gvcf_reference_entry_fields_to_keep is not None:
            for field in sorted(gvcf_reference_entry_fields_to_keep):
                sha.update(field.encode())
        if contig_recoding is not None:
            for key, value in sorted(contig_recoding.items()):
                sha.update(key.encode())
                sha.update(value.encode())
        for interval in intervals:
            sha.update(str(interval).encode())
        digest = sha.hexdigest()
        name = f'vds-combiner-plan_{digest}_{hl.__pip_version__}.json'
        save_path = os.path.join(temp_path, 'combiner-plans', name)
        # Second resume check, this time against the generated path.
        saved_combiner = maybe_load_from_saved_path(save_path)
        if saved_combiner is not None:
            return saved_combiner
        else:
            warning(f'generated combiner save path of {save_path}')

    # Pair every VDS path with its sample count, reading the VDS only when
    # the caller did not supply the counts.
    if vds_sample_counts:
        vdses = [
            VDSMetadata(path, n_samples)
            for path, n_samples in zip(vds_paths, vds_sample_counts)
        ]
    else:
        vdses = []
        for path in vds_paths:
            vds = hl.vds.read_vds(path)
            n_samples = vds.n_samples()
            vdses.append(VDSMetadata(path, n_samples))

    # Largest VDS first.
    vdses.sort(key=lambda x: x.n_samples, reverse=True)

    return VariantDatasetCombiner(
        save_path=save_path,
        output_path=output_path,
        temp_path=temp_path,
        reference_genome=reference_genome,
        branch_factor=branch_factor,
        target_records=target_records,
        gvcf_batch_size=batch_size,
        contig_recoding=contig_recoding,
        vdses=vdses,
        gvcfs=gvcf_paths,
        gvcf_import_intervals=intervals,
        gvcf_external_header=gvcf_external_header,
        gvcf_sample_names=gvcf_sample_names,
        gvcf_info_to_keep=gvcf_info_to_keep,
        gvcf_reference_entry_fields_to_keep=gvcf_reference_entry_fields_to_keep
    )
def test_reference_genome_liftover(self):
    """Lift loci and intervals between GRCh37 and GRCh38 with the chr20 chains.

    The liftovers are registered on the genome objects obtained from
    ``hl.get_reference`` and this test begins by asserting that none is
    registered. Removal is therefore scheduled with ``addCleanup`` right
    after each ``add_liftover`` so that a failing assertion cannot leave
    a liftover behind.
    """
    grch37 = hl.get_reference('GRCh37')
    grch38 = hl.get_reference('GRCh38')

    self.assertTrue(not grch37.has_liftover('GRCh38') and not grch38.has_liftover('GRCh37'))

    grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
    # Registered only after a successful add, so cleanup never removes a
    # liftover that was not installed.
    self.addCleanup(grch37.remove_liftover, 'GRCh38')
    grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')
    self.addCleanup(grch38.remove_liftover, 'GRCh37')

    assert grch37.has_liftover('GRCh38')
    assert grch38.has_liftover('GRCh37')
    # assertEqual: the assertEquals alias is deprecated and was removed in
    # Python 3.12.
    self.assertEqual(
        grch37._liftovers,
        {'GRCh38': resource('grch37_to_grch38_chr20.over.chain.gz')})
    self.assertEqual(
        grch38._liftovers,
        {'GRCh37': resource('grch38_to_grch37_chr20.over.chain.gz')})

    # Lifting 37 -> 38 -> 37 must reproduce the original locus.
    ds = hl.import_vcf(resource('sample.vcf'))
    t = ds.annotate_rows(liftover=hl.liftover(
        hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
    assert t.all(t.locus == t.liftover)

    # Expected GRCh38 locus for each GRCh37 locus; missing where no mapping is expected.
    null_locus = hl.missing(hl.tlocus('GRCh38'))

    rows = [{
        'l37': hl.locus('20', 1, 'GRCh37'),
        'l38': null_locus
    }, {
        'l37': hl.locus('20', 60000, 'GRCh37'),
        'l38': null_locus
    }, {
        'l37': hl.locus('20', 60001, 'GRCh37'),
        'l38': hl.locus('chr20', 79360, 'GRCh38')
    }, {
        'l37': hl.locus('20', 278686, 'GRCh37'),
        'l38': hl.locus('chr20', 298045, 'GRCh38')
    }, {
        'l37': hl.locus('20', 278687, 'GRCh37'),
        'l38': hl.locus('chr20', 298046, 'GRCh38')
    }, {
        'l37': hl.locus('20', 278688, 'GRCh37'),
        'l38': null_locus
    }, {
        'l37': hl.locus('20', 278689, 'GRCh37'),
        'l38': null_locus
    }, {
        'l37': hl.locus('20', 278690, 'GRCh37'),
        'l38': null_locus
    }, {
        'l37': hl.locus('20', 278691, 'GRCh37'),
        'l38': hl.locus('chr20', 298047, 'GRCh38')
    }, {
        'l37': hl.locus('20', 37007586, 'GRCh37'),
        'l38': hl.locus('chr12', 32563117, 'GRCh38')
    }, {
        'l37': hl.locus('20', 62965520, 'GRCh37'),
        'l38': hl.locus('chr20', 64334167, 'GRCh38')
    }, {
        'l37': hl.locus('20', 62965521, 'GRCh37'),
        'l38': null_locus
    }]
    schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
    t = hl.Table.parallelize(rows, schema)
    self.assertTrue(
        t.all(
            hl.if_else(hl.is_defined(t.l38),
                       hl.liftover(t.l37, 'GRCh38') == t.l38,
                       hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

    t = t.filter(hl.is_defined(t.l38))
    self.assertTrue(t.count() == 6)

    t = t.key_by('l38')
    t.count()
    self.assertTrue(list(t.key) == ['l38'])

    # Same check for intervals.
    null_locus_interval = hl.missing(hl.tinterval(hl.tlocus('GRCh38')))
    rows = [{
        'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'),
        'i38': null_locus_interval
    }, {
        'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
        'i38': hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')
    }]
    schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)),
                        i38=hl.tinterval(hl.tlocus(grch38)))
    t = hl.Table.parallelize(rows, schema)
    self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))
from hail.typecheck import * from hail.utils import wrap_to_list from hail.utils.java import jiterable_to_list, Env, joption from hail.typecheck import oneof, transformed import hail as hl rg_type = lazy() reference_genome_type = oneof(transformed((str, lambda x: hl.get_reference(x))), rg_type) class ReferenceGenome(object): """An object that represents a `reference genome <https://en.wikipedia.org/wiki/Reference_genome>`__. Examples -------- >>> contigs = ["1", "X", "Y", "MT"] >>> lengths = {"1": 249250621, "X": 155270560, "Y": 59373566, "MT": 16569} >>> par = [("X", 60001, 2699521)] >>> my_ref = hl.ReferenceGenome("my_ref", contigs, lengths, "X", "Y", "MT", par) Notes ----- Hail comes with predefined reference genomes (case sensitive!): - GRCh37 - GRCh38 - GRCm38 You can access these reference genome objects using :func:`.get_reference`:
def matrix_irs(self):
    """Return a list of matrix IR nodes built from the test resources.

    The BGEN resource is indexed first, then a native matrix table reader,
    a native table reader and a small range reader are combined into one
    instance of each matrix IR node listed below.
    """
    hl.index_bgen(resource('example.8bits.bgen'),
                  reference_genome=hl.get_reference('GRCh37'),
                  contig_recoding={'01': '1'})

    collect_agg = ir.MakeStruct(
        [('x', ir.ApplyAggOp('Collect', [], None, [ir.I32(0)]))])

    mt_read = ir.MatrixRead(
        ir.MatrixNativeReader(
            resource('backward_compatability/1.0.0/matrix_table/0.hmt')),
        False,
        False)
    ht_read = ir.TableRead(
        ir.TableNativeReader(
            resource('backward_compatability/1.0.0/table/0.ht')),
        False)
    mt_range = ir.MatrixRead(ir.MatrixRangeReader(1, 1, 10))

    def const_x_struct():
        # A fresh struct node per use, exactly as if written inline.
        return ir.MakeStruct([('x', ir.I64(20))])

    def vcf_read():
        reader = ir.MatrixVCFReader(
            resource('sample.vcf'), ['GT'], hl.tfloat64,
            None, None, None, None,
            False, True, False, True,
            None, None, None)
        return ir.MatrixRead(reader)

    def bgen_read():
        reader = ir.MatrixBGENReader(
            resource('example.8bits.bgen'), None, {}, 10, 1, None)
        return ir.MatrixRead(reader)

    return [
        ir.MatrixRepartition(mt_range, 100, ir.RepartitionStrategy.SHUFFLE),
        ir.MatrixUnionRows(mt_range, mt_range),
        ir.MatrixDistinctByRow(mt_range),
        ir.MatrixRowsHead(mt_read, 5),
        ir.MatrixColsHead(mt_read, 5),
        ir.CastTableToMatrix(
            ir.CastMatrixToTable(mt_read, '__entries', '__cols'),
            '__entries',
            '__cols',
            []),
        ir.MatrixAggregateRowsByKey(mt_read, collect_agg, collect_agg),
        ir.MatrixAggregateColsByKey(mt_read, collect_agg, collect_agg),
        mt_read,
        mt_range,
        vcf_read(),
        bgen_read(),
        ir.MatrixFilterRows(mt_read, ir.FalseIR()),
        ir.MatrixFilterCols(mt_read, ir.FalseIR()),
        ir.MatrixFilterEntries(mt_read, ir.FalseIR()),
        ir.MatrixChooseCols(mt_read, [1, 0]),
        ir.MatrixMapCols(mt_read, const_x_struct(), ['x']),
        ir.MatrixKeyRowsBy(mt_read, ['row_i64'], False),
        ir.MatrixMapRows(ir.MatrixKeyRowsBy(mt_read, []), const_x_struct()),
        ir.MatrixMapEntries(mt_read, const_x_struct()),
        ir.MatrixMapGlobals(mt_read, const_x_struct()),
        ir.MatrixCollectColsByKey(mt_read),
        ir.MatrixExplodeRows(mt_read, ['row_aset']),
        ir.MatrixExplodeCols(mt_read, ['col_aset']),
        ir.MatrixAnnotateRowsTable(mt_read, ht_read, '__foo'),
        ir.MatrixAnnotateColsTable(mt_read, ht_read, '__foo'),
        ir.MatrixToMatrixApply(
            mt_read,
            {'name': 'MatrixFilterPartitions', 'parts': [0], 'keep': True}),
        ir.MatrixRename(
            mt_read,
            {'global_f32': 'global_foo'},
            {'col_f32': 'col_foo'},
            {'row_aset': 'row_aset2'},
            {'entry_f32': 'entry_foo'}),
        ir.MatrixFilterIntervals(
            mt_read,
            [hl.utils.Interval(hl.utils.Struct(row_idx=0),
                               hl.utils.Struct(row_idx=10))],
            hl.tstruct(row_idx=hl.tint32),
            keep=False),
    ]
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False, min_partitions=None,
               force_bgz=False, force=False) -> hl.Table:
    """Import a GTF file.

    The GTF file format is identical to the GFF version 2 file format,
    and so this function can be used to import GFF version 2 files as
    well.

    See https://www.ensembl.org/info/website/upload/gff.html for more
    details on the GTF/GFF2 file format.

    The :class:`.Table` returned by this function will be keyed by the
    ``interval`` row field and will include the following row fields:

    .. code-block:: text

        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'interval': interval<>

    There will also be corresponding fields for every tag found in the
    attribute field of the GTF file.

    Note
    ----

    This function will return an ``interval`` field of type :class:`.tinterval`
    constructed from the ``seqname``, ``start``, and ``end`` fields in the
    GTF file. This interval is inclusive of both the start and end positions
    in the GTF file.

    If the ``reference_genome`` parameter is specified, the start and end
    points of the ``interval`` field will be of type :class:`.tlocus`.
    Otherwise, the start and end points of the ``interval`` field will be of
    type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
    ``position`` (type :class:`.tint32`).

    Furthermore, if the ``reference_genome`` parameter is specified and
    ``skip_invalid_contigs`` is ``True``, this import function will skip
    lines in the GTF where ``seqname`` is not consistent with the reference
    genome specified.

    When ``reference_genome`` is GRCh37, a leading ``chr`` is stripped from
    ``seqname``. For any other reference genome, ``seqname`` is given a
    ``chr`` prefix unless it already has one or starts with ``HLA``
    (``chrHLA...`` has its ``chr`` removed).

    Example
    -------

    >>> ht = hl.experimental.import_gtf('data/test.gtf',
    ...                                 reference_genome='GRCh37',
    ...                                 skip_invalid_contigs=True)

    >>> ht.describe()  # doctest: +SKIP_OUTPUT_CHECK
    ----------------------------------------
    Global fields:
        None
    ----------------------------------------
    Row fields:
        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'gene_type': str
        'exon_id': str
        'havana_transcript': str
        'level': str
        'transcript_name': str
        'gene_status': str
        'gene_id': str
        'transcript_type': str
        'tag': str
        'transcript_status': str
        'gene_name': str
        'transcript_id': str
        'exon_number': str
        'havana_gene': str
        'interval': interval<locus<GRCh37>>
    ----------------------------------------
    Key: ['interval']
    ----------------------------------------

    Parameters
    ----------

    path : :obj:`str`
        File to import.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.
    skip_invalid_contigs : :obj:`bool`
        If ``True`` and `reference_genome` is not ``None``, skip lines where
        ``seqname`` is not consistent with the reference genome.
    min_partitions : :obj:`int` or :obj:`None`
        Minimum number of partitions (passed to import_table).
    force_bgz : :obj:`bool`
        If ``True``, load files as blocked gzip files, assuming
        that they were actually compressed using the BGZ codec. This option is
        useful when the file extension is not ``'.bgz'``, but the file is
        blocked gzip, so that the file can be read in parallel and not on a
        single node.
    force : :obj:`bool`
        If ``True``, load gzipped files serially on one core. This should
        be used only when absolutely necessary, as processing time will be
        increased due to lack of parallelism.

    Returns
    -------
    :class:`.Table`
    """

    # The file has no header, so import_table names the columns f0..f8.
    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint,
                                'f4': hl.tint,
                                'f5': hl.tfloat,
                                'f7': hl.tint},
                         missing='.',
                         delimiter='\t',
                         force_bgz=force_bgz,
                         force=force)

    ht = ht.rename({'f0': 'seqname',
                    'f1': 'source',
                    'f2': 'feature',
                    'f3': 'start',
                    'f4': 'end',
                    'f5': 'score',
                    'f6': 'strand',
                    'f7': 'frame',
                    'f8': 'attribute'})

    # Turn the 'key "value"; key "value";' attribute column into a dict,
    # dropping the quotes and the trailing semicolon from each value.
    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    # One eager pass over the data to learn which attribute tags exist, so
    # that each of them can become its own row field.
    attributes = ht.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x), ht['attribute'].keys()))

    ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x), ht['attribute'][x])
                         for x in attributes if x})

    if reference_genome:
        # `reference_genome` is documented as a name or a ReferenceGenome
        # object. Work with the name so the GRCh37 test and the
        # get_reference lookup below behave the same for both forms.
        rg_name = reference_genome if isinstance(reference_genome, str) else reference_genome.name

        if rg_name == 'GRCh37':
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            ht = ht.annotate(seqname=hl.case()
                             .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                             .when(ht['seqname'].startswith('chrHLA'), ht['seqname'].replace('^chr', ''))
                             .when(ht['seqname'].startswith('chr'), ht['seqname'])
                             .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(hl.get_reference(rg_name).contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(interval=hl.locus_interval(ht['seqname'],
                                                     ht['start'],
                                                     ht['end'],
                                                     includes_start=True,
                                                     includes_end=True,
                                                     reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(hl.struct(seqname=ht['seqname'], position=ht['start']),
                                               hl.struct(seqname=ht['seqname'], position=ht['end']),
                                               includes_start=True,
                                               includes_end=True))

    ht = ht.key_by('interval')

    return ht
import json import re from hail.typecheck import * from hail.utils import wrap_to_list from hail.utils.java import jiterable_to_list, Env, joption from hail.typecheck import oneof, transformed import hail as hl rg_type = lazy() reference_genome_type = oneof(transformed((str, lambda x: hl.get_reference(x))), rg_type) class ReferenceGenome(object): """An object that represents a `reference genome <https://en.wikipedia.org/wiki/Reference_genome>`__. Examples -------- >>> contigs = ["1", "X", "Y", "MT"] >>> lengths = {"1": 249250621, "X": 155270560, "Y": 59373566, "MT": 16569} >>> par = [("X", 60001, 2699521)] >>> my_ref = hl.ReferenceGenome("my_ref", contigs, lengths, "X", "Y", "MT", par) Notes ----- Hail comes with predefined reference genomes (case sensitive!): - GRCh37 - GRCh38 - GRCm38
def test_impute_sex_chromosome_ploidy():
    """Check ``hl.vds.impute_sex_chromosome_ploidy`` on a hand-built dataset.

    Two samples get reference blocks on contig 22 and on X (and, for
    ``sample_xy``, on Y) at positions straddling ``x_par_end`` and
    ``y_par_end``. The variant data is empty. Ploidy is imputed over three
    calling intervals with contig 22 as the normalization contig and the
    collected rows are compared with the expected values.
    """
    x_par_end = 2699521
    y_par_end = 2649521
    rg = hl.get_reference('GRCh37')

    def ref_block(sample, contig, start, end, gq, dp):
        # Every reference block in this test shares the same ref allele.
        return hl.Struct(s=sample,
                         ref_allele='A',
                         locus=hl.Locus(contig, start, rg),
                         END=end,
                         GQ=gq,
                         DP=dp)

    ref_blocks = [
        ref_block('sample_xx', '22', 1000000, 2000000, 15, 5),
        ref_block('sample_xx', 'X', x_par_end - 10, x_par_end + 9, 18, 6),
        ref_block('sample_xx', 'X', x_par_end + 10, x_par_end + 29, 15, 5),
        ref_block('sample_xy', '22', 1000000, 2000000, 15, 5),
        ref_block('sample_xy', 'X', x_par_end - 10, x_par_end + 9, 9, 3),
        ref_block('sample_xy', 'X', x_par_end + 10, x_par_end + 29, 6, 2),
        ref_block('sample_xy', 'Y', y_par_end - 10, y_par_end + 9, 12, 4),
        ref_block('sample_xy', 'Y', y_par_end + 10, y_par_end + 29, 9, 3),
    ]

    ref_schema = hl.dtype('struct{s:str,locus:locus<GRCh37>,ref_allele:str,END:int32,GQ:int32,DP:int32}')
    ref_table = hl.Table.parallelize(ref_blocks, schema=ref_schema)
    ref_mt = ref_table.to_matrix_table(row_key=['locus'], row_fields=['ref_allele'], col_key=['s'])

    var_schema = hl.dtype('struct{locus:locus<GRCh37>,alleles:array<str>,s:str,LA:array<int32>,LGT:call,GQ:int32}')
    var_table = hl.Table.parallelize([], schema=var_schema)
    var_mt = var_table.to_matrix_table(row_key=['locus', 'alleles'], col_key=['s'])

    vds = hl.vds.VariantDataset(ref_mt, var_mt)

    interval_strings = (
        '22:1000010-1000020',
        f'X:{x_par_end}-{x_par_end+20}',
        f'Y:{y_par_end}-{y_par_end+20}',
    )
    calling_intervals = [
        hl.parse_locus_interval(interval_string, reference_genome='GRCh37')
        for interval_string in interval_strings
    ]

    r = hl.vds.impute_sex_chromosome_ploidy(vds, calling_intervals, normalization_contig='22')

    expected = [
        hl.Struct(s='sample_xx',
                  autosomal_mean_dp=5.0,
                  x_mean_dp=5.5,
                  x_ploidy=2.2,
                  y_mean_dp=0.0,
                  y_ploidy=0.0),
        hl.Struct(s='sample_xy',
                  autosomal_mean_dp=5.0,
                  x_mean_dp=2.5,
                  x_ploidy=1.0,
                  y_mean_dp=3.5,
                  y_ploidy=1.4),
    ]
    assert r.collect() == expected