예제 #1
0
def generate_5_sample_vds():
    """Combine five 1kg chr22 gVCFs into a VDS and write it as a test resource.

    The gVCFs are imported with a single partition interval on chr22, each
    is transformed with ``transform_gvcf``, the results are combined, and the
    dataset is written (overwriting) to ``1kg_chr22_5_samples.vds`` under the
    ``vds`` resource directory.
    """
    sample_files = [
        'HG00187.hg38.g.vcf.gz',
        'HG00190.hg38.g.vcf.gz',
        'HG00308.hg38.g.vcf.gz',
        'HG00313.hg38.g.vcf.gz',
        'HG00320.hg38.g.vcf.gz',
    ]
    paths = [
        os.path.join(resource('gvcfs'), '1kg_chr22', file_name)
        for file_name in sample_files
    ]

    # One inclusive interval from position 1 to (contig length - 1) of chr22.
    first_locus = hl.Locus('chr22', 1, reference_genome='GRCh38')
    last_position = hl.get_reference('GRCh38').contig_length('chr22') - 1
    last_locus = hl.Locus('chr22', last_position, reference_genome='GRCh38')
    whole_contig = hl.Interval(start=hl.Struct(locus=first_locus),
                               end=hl.Struct(locus=last_locus),
                               includes_end=True)
    parts = [whole_contig]

    vcfs = hl.import_gvcfs(paths,
                           parts,
                           reference_genome='GRCh38',
                           array_elements_required=False)

    # Entry fields to keep are determined from the rows of the first gVCF
    # that have info.END defined.
    first_mt = vcfs[0]
    rows_with_end = first_mt.filter_rows(hl.is_defined(first_mt.info.END))
    to_keep = defined_entry_fields(rows_with_end, 100_000)

    transformed = [
        hl.vds.combiner.transform_gvcf(mt, to_keep) for mt in vcfs
    ]
    vds = hl.vds.combiner.combine_variant_datasets(transformed)
    vds.variant_data = vds.variant_data._key_rows_by_assert_sorted(
        'locus', 'alleles')

    output_path = os.path.join(resource('vds'), '1kg_chr22_5_samples.vds')
    vds.write(output_path, overwrite=True)
예제 #2
0
    def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[], _builtin=False):
        """Build a reference genome and register it.

        Parameters
        ----------
        name
            Name of the reference genome; used as the key in
            ``ReferenceGenome._references`` and in the backend config.
        contigs
            Contig name(s); passed through ``wrap_to_list``.
        lengths
            Mapping of contig name to length (iterated with ``.items()``).
        x_contigs, y_contigs, mt_contigs
            Contig name(s) stored in the config under ``xContigs``,
            ``yContigs`` and ``mtContigs``; passed through ``wrap_to_list``.
        par
            Iterable of ``(contig, start, end)`` triples describing the
            intervals stored under ``par``.
        _builtin
            When false, the config is also sent to the backend via
            ``Env.backend().add_reference``.
        """
        # NOTE(review): the list defaults above are shared between calls;
        # `par` in particular is stored on the instance as `_par_tuple`.
        # Nothing in this method mutates them, but confirm no caller does.
        super(ReferenceGenome, self).__init__()

        contigs = wrap_to_list(contigs)
        x_contigs = wrap_to_list(x_contigs)
        y_contigs = wrap_to_list(y_contigs)
        mt_contigs = wrap_to_list(mt_contigs)

        # Config dictionary handed to the backend. Note that the 'contigs'
        # entry follows the iteration order of `lengths`, not of `contigs`.
        self._config = {
            'name': name,
            'contigs': [{'name': c, 'length': l} for c, l in lengths.items()],
            'xContigs': x_contigs,
            'yContigs': y_contigs,
            'mtContigs': mt_contigs,
            'par': [{'start': {'contig': c, 'position': s}, 'end': {'contig': c, 'position': e}} for (c, s, e) in par]
        }

        # Contigs and lengths are stored before the Locus objects below are
        # built with `self` as their reference genome.
        self._contigs = contigs
        self._lengths = lengths
        self._par_tuple = par
        self._par = [hl.Interval(hl.Locus(c, s, self), hl.Locus(c, e, self)) for (c, s, e) in par]
        self._global_positions = None

        # Record this instance in the class-level registry under its name.
        ReferenceGenome._references[name] = self

        if not _builtin:
            Env.backend().add_reference(self._config)

        hl.ir.register_reference_genome_functions(name)

        # Initial state: no sequence attached and no liftovers recorded.
        self._has_sequence = False
        self._liftovers = set()
예제 #3
0
    def test_filter_intervals_compound_partition_key(self):
        """filter_intervals accepts intervals whose endpoints are (locus, alleles) structs."""
        vcf = hl.import_vcf(resource('sample.vcf'), min_partitions=20)
        variant = hl.struct(locus=vcf.locus, alleles=vcf.alleles)
        ds = vcf.annotate_rows(variant=variant).key_rows_by('locus', 'alleles')

        # Both endpoints carry the full compound key.
        lower = hl.Struct(locus=hl.Locus('20', 10639222), alleles=['A', 'T'])
        upper = hl.Struct(locus=hl.Locus('20', 10644700), alleles=['A', 'T'])
        intervals = [hl.Interval(lower, upper)]

        filtered = hl.filter_intervals(ds, intervals)
        self.assertEqual(filtered.count_rows(), 3)
예제 #4
0
 def locus_interval(start, end):
     """Return the end-inclusive interval from position `start` to position `end`.

     `contig` and `reference_genome` are not parameters; they are read from
     the enclosing scope.
     """
     first = hl.Locus(contig=contig,
                      position=start,
                      reference_genome=reference_genome)
     last = hl.Locus(contig=contig,
                     position=end,
                     reference_genome=reference_genome)
     return hl.Interval(start=first, end=last, includes_end=True)
예제 #5
0
 def assert_rg_loaded_correctly(name):
     """Assert that the reference genome called `name` has the expected layout.

     `self` (the test case providing ``assertEqual``) comes from the
     enclosing scope.
     """
     rg = hl.get_reference(name)

     # Attributes are checked one at a time, in this order.
     expected_by_attribute = (
         ('contigs', ["1", "X", "Y", "MT"]),
         ('lengths', {"1": 5, "X": 4, "Y": 3, "MT": 2}),
         ('x_contigs', ["X"]),
         ('y_contigs', ["Y"]),
         ('mt_contigs', ["MT"]),
     )
     for attribute, expected in expected_by_attribute:
         self.assertEqual(getattr(rg, attribute), expected)

     par_interval = hl.Interval(start=hl.Locus("X", 2, name),
                                end=hl.Locus("X", 4, name))
     self.assertEqual(rg.par, [par_interval])
예제 #6
0
def test_gvcfs(spark, tmp_path):
    """Import a gVCF with Hail and run the lossless-adapter check on it."""
    # GVCF MatrixTables are not keyed by locus and alleles, just by locus
    input_vcf = 'test-data/tabix-test-vcf/combined.chr20_18210071_18210093.g.vcf.gz'

    # A single end-inclusive partition interval on chr20.
    interval_start = hl.Locus("chr20", 1, reference_genome='GRCh38')
    interval_end = hl.Locus("chr20", 20000000, reference_genome='GRCh38')
    partitions = [hl.Interval(interval_start, interval_end, includes_end=True)]

    matrix_tables = hl.import_gvcfs([input_vcf],
                                    partitions,
                                    force_bgz=True,
                                    reference_genome='GRCh38')
    hail_df = functions.from_matrix_table(matrix_tables[0])

    _assert_lossless_adapter(spark, tmp_path, hail_df, input_vcf, 'vcf',
                             'bigvcf')
예제 #7
0
def default_exome_intervals(reference_genome) -> List[hl.utils.Interval]:
    """Create a list of locus intervals suitable for importing and merging exome gVCFs.

    As exomes are small, one partition per chromosome works well here.

    Parameters
    ----------
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`
        Reference genome to use. A string is resolved with
        :func:`hl.get_reference`. NOTE: only GRCh37 and GRCh38 references
        are supported.

    Returns
    -------
    :obj:`List[Interval]`
        One end-inclusive interval per contig, from position 1 to the
        locus parsed from ``'<contig>:END'``.

    Raises
    ------
    ValueError
        If the reference genome is neither GRCh37 nor GRCh38.
    """
    # The documented contract allows a name; the code below needs the object
    # (it reads `.name`), so resolve names up front.
    if isinstance(reference_genome, str):
        reference_genome = hl.get_reference(reference_genome)

    if reference_genome.name == 'GRCh37':
        contigs = [f'{i}' for i in range(1, 23)] + ['X', 'Y', 'MT']
    elif reference_genome.name == 'GRCh38':
        contigs = [f'chr{i}' for i in range(1, 23)] + ['chrX', 'chrY', 'chrM']
    else:
        raise ValueError(
            f"Invalid reference genome '{reference_genome.name}', only 'GRCh37' and 'GRCh38' are supported"
        )
    return [
        hl.Interval(start=hl.Locus(contig=contig,
                                   position=1,
                                   reference_genome=reference_genome),
                    end=hl.Locus.parse(f'{contig}:END',
                                       reference_genome=reference_genome),
                    includes_end=True) for contig in contigs
    ]
예제 #8
0
def test_vcf_vds_combiner_equivalence():
    """The sparse MT from the VCF combiner matches the one derived from the VDS combiner."""
    import hail.experimental.vcf_combiner.vcf_combiner as vcf_combiner
    import hail.vds.combiner as vds_combiner

    def chr20_partition(first, last):
        # End-inclusive interval on chr20 whose endpoints are locus structs.
        return hl.Interval(
            start=hl.Struct(locus=hl.Locus('chr20', first, reference_genome='GRCh38')),
            end=hl.Struct(locus=hl.Locus('chr20', last, reference_genome='GRCh38')),
            includes_end=True)

    _paths = ['gvcfs/HG00096.g.vcf.gz', 'gvcfs/HG00268.g.vcf.gz']
    paths = [resource(p) for p in _paths]
    partition_bounds = [
        (17821257, 18708366),
        (18708367, 19776611),
        (19776612, 21144633),
    ]
    parts = [chr20_partition(first, last) for first, last in partition_bounds]

    imported = hl.import_gvcfs(paths, parts, reference_genome='GRCh38',
                               array_elements_required=False)
    # Add three missing int32 info fields to every imported gVCF.
    vcfs = [
        mt.annotate_rows(info=mt.info.annotate(
            MQ_DP=hl.missing(hl.tint32),
            VarDP=hl.missing(hl.tint32),
            QUALapprox=hl.missing(hl.tint32)))
        for mt in imported
    ]

    rows_with_end = vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END))
    entry_to_keep = defined_entry_fields(rows_with_end, 100_000) - {'GT', 'PGT', 'PL'}

    transformed_for_vds = [
        vds_combiner.transform_gvcf(mt, reference_entry_fields_to_keep=entry_to_keep)
        for mt in vcfs
    ]
    combined_vds = vds_combiner.combine_variant_datasets(transformed_for_vds)

    transformed_for_vcf = [vcf_combiner.transform_gvcf(mt) for mt in vcfs]
    smt = vcf_combiner.combine_gvcfs(transformed_for_vcf)

    smt_from_vds = hl.vds.to_merged_sparse_mt(combined_vds).drop('RGQ')
    smt = smt.select_entries(*smt_from_vds.entry)  # harmonize fields and order
    smt = smt.key_rows_by('locus', 'alleles')
    assert smt._same(smt_from_vds)
예제 #9
0
    def test_multiple_files_variant_filtering(self):
        """import_bgen with `variants=` over several files matches filtering the full import."""
        bgen_file = [
            resource(file_name)
            for file_name in ('random-b.bgen', 'random-c.bgen', 'random-a.bgen')
        ]
        hl.index_bgen(bgen_file)

        alleles = ['A', 'G']
        # Positions are deliberately listed out of order.
        positions = (11, 13, 29, 28, 1, 12)
        desired_variants = [
            hl.Struct(locus=hl.Locus('20', position), alleles=alleles)
            for position in positions
        ]

        actual = hl.import_bgen(bgen_file, ['GT'],
                                n_partitions=10,
                                variants=desired_variants)
        self.assertEqual(actual.count_rows(), 6)

        everything = hl.import_bgen(bgen_file, ['GT'])
        self.assertEqual(everything.count(), (30, 10))

        is_desired = hl.set(desired_variants).contains(everything.row_key)
        expected = everything.filter_rows(is_desired)

        self.assertTrue(expected._same(actual))
예제 #10
0
def test_combiner_works():
    """Combine two gVCFs and check the partition count of the combined variant data."""

    def chr20_partition(first, last):
        # End-inclusive interval on chr20 whose endpoints are locus structs.
        return hl.Interval(
            start=hl.Struct(locus=hl.Locus('chr20', first, reference_genome='GRCh38')),
            end=hl.Struct(locus=hl.Locus('chr20', last, reference_genome='GRCh38')),
            includes_end=True)

    _paths = ['gvcfs/HG00096.g.vcf.gz', 'gvcfs/HG00268.g.vcf.gz']
    paths = [resource(p) for p in _paths]
    partition_bounds = [
        (17821257, 18708366),
        (18708367, 19776611),
        (19776612, 21144633),
    ]
    parts = [chr20_partition(first, last) for first, last in partition_bounds]

    vcfs = hl.import_gvcfs(paths, parts, reference_genome='GRCh38', array_elements_required=False)
    rows_with_end = vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END))
    entry_to_keep = defined_entry_fields(rows_with_end, 100_000) - {'GT', 'PGT', 'PL'}

    # Add three missing int32 info fields, then transform each gVCF.
    transformed = []
    for mt in vcfs:
        annotated = mt.annotate_rows(info=mt.info.annotate(
            MQ_DP=hl.missing(hl.tint32),
            VarDP=hl.missing(hl.tint32),
            QUALapprox=hl.missing(hl.tint32)))
        transformed.append(
            transform_gvcf(annotated, reference_entry_fields_to_keep=entry_to_keep))

    comb = combine_variant_datasets(transformed)
    assert len(parts) == comb.variant_data.n_partitions()
    comb.variant_data._force_count_rows()
    comb.reference_data._force_count_rows()
예제 #11
0
 def test_summarize_variants(self):
     """summarize_variants reports counts, contigs, allele types and allele counts."""
     mt = hl.utils.range_matrix_table(3, 3)

     # One variant per row index of the range matrix table.
     variant_by_row_idx = {
         0: hl.Struct(locus=hl.Locus('1', 1), alleles=['A', 'T', 'C']),
         1: hl.Struct(locus=hl.Locus('2', 1), alleles=['A', 'AT', '@']),
         2: hl.Struct(locus=hl.Locus('2', 1), alleles=['AC', 'GT']),
     }
     variants = hl.literal(variant_by_row_idx)
     mt = mt.annotate_rows(**variants[mt.row_idx])
     mt = mt.key_rows_by('locus', 'alleles')

     summary = hl.summarize_variants(mt, show=False)

     self.assertEqual(summary.n_variants, 3)
     self.assertEqual(summary.contigs, {'1': 1, '2': 2})
     expected_allele_types = {'SNP': 2, 'MNP': 1, 'Unknown': 1, 'Insertion': 1}
     self.assertEqual(summary.allele_types, expected_allele_types)
     self.assertEqual(summary.allele_counts, {2: 1, 3: 2})
예제 #12
0
    def test_import_vcf_missing_format_field_elements(self):
        """With array_elements_required=False, missing FORMAT array elements import as None."""
        mt = hl.import_vcf(resource('missingFormatArray.vcf'),
                           reference_genome='GRCh37',
                           array_elements_required=False)
        mt = mt.select_rows().select_entries('AD', 'PL')

        def entry(position, alleles, sample, ad, pl):
            # One expected entry row on contig X.
            return {
                'locus': hl.Locus('X', position),
                'alleles': alleles,
                's': sample,
                'AD': ad,
                'PL': pl,
            }

        expected_rows = [
            entry(16050036, ['A', 'C'], 'C1046::HG02024',
                  [None, None], [0, None, 180]),
            entry(16050036, ['A', 'C'], 'C1046::HG02025',
                  [None, 6], [70, None]),
            entry(16061250, ['T', 'A', 'C'], 'C1046::HG02024',
                  [0, 0, None], [396, None, None, 33, None, 0]),
            entry(16061250, ['T', 'A', 'C'], 'C1046::HG02025',
                  [0, 0, 9], [None, None, None]),
        ]
        entry_schema = hl.tstruct(locus=hl.tlocus('GRCh37'),
                                  alleles=hl.tarray(hl.tstr),
                                  s=hl.tstr,
                                  AD=hl.tarray(hl.tint),
                                  PL=hl.tarray(hl.tint))
        expected = hl.Table.parallelize(expected_rows,
                                        entry_schema,
                                        key=['locus', 'alleles', 's'])

        self.assertTrue(mt.entries()._same(expected))
예제 #13
0
 def test_import_vcf_missing_info_field_elements(self):
     """With array_elements_required=False, missing INFO array elements import as None."""
     mt = hl.import_vcf(resource('missingInfoArray.vcf'),
                        reference_genome='GRCh37',
                        array_elements_required=False)
     mt = mt.select_rows(FOO=mt.info.FOO, BAR=mt.info.BAR)

     first_row = {
         'locus': hl.Locus('X', 16050036),
         'alleles': ['A', 'C'],
         'FOO': [1, None],
         'BAR': [2, None, None],
     }
     second_row = {
         'locus': hl.Locus('X', 16061250),
         'alleles': ['T', 'A', 'C'],
         'FOO': [None, 2, None],
         'BAR': [None, 1.0, None],
     }
     row_schema = hl.tstruct(locus=hl.tlocus('GRCh37'),
                             alleles=hl.tarray(hl.tstr),
                             FOO=hl.tarray(hl.tint),
                             BAR=hl.tarray(hl.tfloat64))
     expected = hl.Table.parallelize([first_row, second_row],
                                     row_schema,
                                     key=['locus', 'alleles'])

     self.assertTrue(mt.rows()._same(expected))
예제 #14
0
def calculate_new_intervals(ht, n, reference_genome):
    """Compute a list of locus intervals suitable for repartitioning a
    combiner matrix table.

    Parameters
    ----------
    ht : :class:`.Table`
        Table / Rows Table to compute new intervals for. It must be keyed by
        exactly ``['locus']`` and its locus type must use `reference_genome`
        (both are asserted).
    n : :obj:`int`
        Number of rows each partition should have, (last partition may be smaller)
    reference_genome : :class:`.ReferenceGenome`
        Reference genome to use. This argument is required, and the body
        reads its ``contigs`` and ``lengths`` attributes directly.

    Returns
    -------
    :obj:`List[Interval]`
        End-inclusive intervals collected from the table, followed by one
        extra interval running from just after the last collected end to the
        end of the last contig.

    Raises
    ------
    ValueError
        If `ht` has no rows.
    """
    # NOTE(review): these asserts validate caller input and are stripped
    # under `python -O`.
    assert list(ht.key) == ['locus']
    assert ht.locus.dtype == hl.tlocus(reference_genome=reference_genome)
    # Locus at the full length of the last contig of the reference genome.
    end = hl.Locus(reference_genome.contigs[-1],
                   reference_genome.lengths[reference_genome.contigs[-1]],
                   reference_genome=reference_genome)

    n_rows = ht.count()

    if n_rows == 0:
        raise ValueError('empty table!')

    # Drop everything but the key.
    ht = ht.select()
    # x is presumably the number of preceding rows (a 0-based row index);
    # confirm against the hl.scan.count() documentation.
    ht = ht.annotate(x=hl.scan.count())
    ht = ht.annotate(y=ht.x + 1)
    # Keep rows where x // n and (x + 1) // n differ, plus the row with
    # x == n_rows - 1. The surviving rows become interval end points.
    ht = ht.filter((ht.x // n != ht.y // n) | (ht.x == (n_rows - 1)))
    ht = ht.select()
    # Interval start: the locus one global position past the previous kept
    # row (via the scan), or global position 0 when that value is missing.
    ht = ht.annotate(start=hl.or_else(
        hl.scan._prev_nonnull(
            hl.locus_from_global_position(ht.locus.global_position() + 1,
                                          reference_genome=reference_genome)),
        hl.locus_from_global_position(0, reference_genome=reference_genome)))
    ht = ht.key_by()
    ht = ht.select(
        interval=hl.interval(start=ht.start, end=ht.locus, includes_end=True))

    intervals = ht.aggregate(hl.agg.collect(ht.interval))

    # Append a final interval covering everything after the last collected
    # end, up to `end`.
    # NOTE(review): if the last collected end is already the final position
    # of the genome, `global_position() + 1` may be out of range — confirm.
    last_st = hl.eval(
        hl.locus_from_global_position(
            hl.literal(intervals[-1].end).global_position() + 1,
            reference_genome=reference_genome))
    interval = hl.Interval(start=last_st, end=end, includes_end=True)
    intervals.append(interval)
    return intervals
예제 #15
0
 def values(self):
     """Return a list of (Hail type, Python value) pairs, one per type listed."""
     return [
         (hl.tbool, True),
         (hl.tint32, 0),
         (hl.tint64, 0),
         (hl.tfloat32, 0.5),
         (hl.tfloat64, 0.5),
         (hl.tstr, "foo"),
         (hl.tstruct(x=hl.tint32), hl.Struct(x=0)),
         (hl.tarray(hl.tint32), [0, 1, 4]),
         (hl.tset(hl.tint32), {0, 1, 4}),
         (hl.tdict(hl.tstr, hl.tint32), {"a": 0, "b": 1, "c": 4}),
         (hl.tinterval(hl.tint32), hl.Interval(0, 1, True, False)),
         (hl.tlocus(hl.default_reference()), hl.Locus("1", 1)),
         (hl.tcall, hl.Call([0, 1])),
     ]
예제 #16
0
    def test_json_encoder(self):
        """hl.utils.JSONEncoder serializes frozendict, Struct, Interval and Locus."""

        def encode(value):
            return json.dumps(value, cls=hl.utils.JSONEncoder)

        self.assertEqual(encode(frozendict({"foo": "bar"})), '{"foo": "bar"}')

        self.assertEqual(encode(Struct(foo="bar")), '{"foo": "bar"}')

        interval_json = encode(Interval(start=1, end=10))
        self.assertEqual(
            interval_json,
            '{"start": 1, "end": 10, "includes_start": true, "includes_end": false}'
        )

        locus_json = encode(hl.Locus(1, 100, "GRCh38"))
        self.assertEqual(
            locus_json,
            '{"contig": "1", "position": 100, "reference_genome": "GRCh38"}')
예제 #17
0
def test_impute_sex_chromosome_ploidy():
    """impute_sex_chromosome_ploidy on a hand-built VDS with one XX and one XY sample."""
    x_par_end = 2699521
    y_par_end = 2649521
    rg = hl.get_reference('GRCh37')

    def ref_block(sample, contig, start, end, gq, dp):
        # One reference block; every block uses ref_allele 'A'.
        return hl.Struct(s=sample,
                         ref_allele='A',
                         locus=hl.Locus(contig, start, rg),
                         END=end,
                         GQ=gq,
                         DP=dp)

    ref_blocks = [
        ref_block('sample_xx', '22', 1000000, 2000000, 15, 5),
        ref_block('sample_xx', 'X', x_par_end - 10, x_par_end + 9, 18, 6),
        ref_block('sample_xx', 'X', x_par_end + 10, x_par_end + 29, 15, 5),
        ref_block('sample_xy', '22', 1000000, 2000000, 15, 5),
        ref_block('sample_xy', 'X', x_par_end - 10, x_par_end + 9, 9, 3),
        ref_block('sample_xy', 'X', x_par_end + 10, x_par_end + 29, 6, 2),
        ref_block('sample_xy', 'Y', y_par_end - 10, y_par_end + 9, 12, 4),
        ref_block('sample_xy', 'Y', y_par_end + 10, y_par_end + 29, 9, 3),
    ]

    ref_schema = hl.dtype('struct{s:str,locus:locus<GRCh37>,ref_allele:str,END:int32,GQ:int32,DP:int32}')
    ref_table = hl.Table.parallelize(ref_blocks, schema=ref_schema)
    ref_mt = ref_table.to_matrix_table(row_key=['locus'],
                                       row_fields=['ref_allele'],
                                       col_key=['s'])

    # The variant data is an empty matrix table with the given schema.
    var_schema = hl.dtype('struct{locus:locus<GRCh37>,alleles:array<str>,s:str,LA:array<int32>,LGT:call,GQ:int32}')
    var_table = hl.Table.parallelize([], schema=var_schema)
    var_mt = var_table.to_matrix_table(row_key=['locus', 'alleles'],
                                       col_key=['s'])

    vds = hl.vds.VariantDataset(ref_mt, var_mt)

    interval_strings = [
        '22:1000010-1000020',
        f'X:{x_par_end}-{x_par_end+20}',
        f'Y:{y_par_end}-{y_par_end+20}',
    ]
    calling_intervals = [
        hl.parse_locus_interval(text, reference_genome='GRCh37')
        for text in interval_strings
    ]

    r = hl.vds.impute_sex_chromosome_ploidy(vds,
                                            calling_intervals,
                                            normalization_contig='22')

    expected_xx = hl.Struct(s='sample_xx',
                            autosomal_mean_dp=5.0,
                            x_mean_dp=5.5,
                            x_ploidy=2.2,
                            y_mean_dp=0.0,
                            y_ploidy=0.0)
    expected_xy = hl.Struct(s='sample_xy',
                            autosomal_mean_dp=5.0,
                            x_mean_dp=2.5,
                            x_ploidy=1.0,
                            y_mean_dp=3.5,
                            y_ploidy=1.4)
    assert r.collect() == [expected_xx, expected_xy]
예제 #18
0
    def test_import_bgen_variant_filtering_from_literals(self):
        """import_bgen with literal `variants=` returns the requested rows, including a duplicate."""
        bgen_file = resource('example.8bits.bgen')

        hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

        alleles = ['A', 'G']

        def variant(position):
            return hl.Struct(locus=hl.Locus('1', position), alleles=alleles)

        desired_variants = [
            variant(position) for position in (2000, 2001, 4000, 10000, 100001)
        ]

        # Position 10000 is expected twice: duplicated variant.
        expected_result = [
            variant(position)
            for position in (2000, 2001, 4000, 10000, 10000, 100001)
        ]

        def collected_keys(mt):
            return mt.rows().key_by('locus', 'alleles').select().collect()

        part_1 = hl.import_bgen(
            bgen_file,
            ['GT'],
            n_partitions=1,  # forcing seek to be called
            variants=desired_variants)
        self.assertTrue(collected_keys(part_1) == expected_result)

        # forcing each variant to be its own partition for testing duplicates work properly
        part_199 = hl.import_bgen(bgen_file,
                                  ['GT'],
                                  n_partitions=199,
                                  variants=desired_variants)
        self.assertTrue(collected_keys(part_199) == expected_result)

        everything = hl.import_bgen(bgen_file, ['GT'])
        self.assertEqual(everything.count(), (199, 500))

        is_desired = hl.set(desired_variants).contains(everything.row_key)
        expected = everything.filter_rows(is_desired)

        self.assertTrue(expected._same(part_1))
예제 #19
0
    def test_mendel_errors(self):
        """Check schemas, row counts and selected values of the four mendel_errors tables."""
        mt = hl.import_vcf(resource('mendel.vcf'))
        ped = hl.Pedigree.read(resource('mendel.fam'))

        men, fam, ind, var = hl.mendel_errors(mt['GT'], ped)

        Struct = hl.utils.Struct
        locus_type = mt.locus.dtype
        alleles_type = hl.tarray(hl.tstr)

        # Key and row schemas of each result table.
        self.assertEqual(
            men.key.dtype,
            hl.tstruct(locus=locus_type, alleles=alleles_type, s=hl.tstr))
        self.assertEqual(
            men.row.dtype,
            hl.tstruct(locus=locus_type,
                       alleles=alleles_type,
                       s=hl.tstr,
                       fam_id=hl.tstr,
                       mendel_code=hl.tint))
        self.assertEqual(
            fam.key.dtype,
            hl.tstruct(pat_id=hl.tstr, mat_id=hl.tstr))
        self.assertEqual(
            fam.row.dtype,
            hl.tstruct(pat_id=hl.tstr,
                       mat_id=hl.tstr,
                       fam_id=hl.tstr,
                       children=hl.tint,
                       errors=hl.tint64,
                       snp_errors=hl.tint64))
        self.assertEqual(ind.key.dtype, hl.tstruct(s=hl.tstr))
        self.assertEqual(
            ind.row.dtype,
            hl.tstruct(s=hl.tstr,
                       fam_id=hl.tstr,
                       errors=hl.tint64,
                       snp_errors=hl.tint64))
        self.assertEqual(
            var.key.dtype,
            hl.tstruct(locus=locus_type, alleles=alleles_type))
        self.assertEqual(
            var.row.dtype,
            hl.tstruct(locus=locus_type,
                       alleles=alleles_type,
                       errors=hl.tint64))

        # Row counts.
        self.assertEqual(men.count(), 41)
        self.assertEqual(fam.count(), 2)
        self.assertEqual(ind.count(), 7)
        self.assertEqual(var.count(), mt.count_rows())

        # Per-family error counts.
        family_errors = set(fam.select('errors', 'snp_errors').collect())
        self.assertEqual(family_errors, {
            Struct(pat_id='Dad1', mat_id='Mom1', errors=41, snp_errors=39),
            Struct(pat_id='Dad2', mat_id='Mom2', errors=0, snp_errors=0),
        })

        # Per-individual error counts.
        individual_errors = set(ind.select('errors', 'snp_errors').collect())
        self.assertEqual(individual_errors, {
            Struct(s='Son1', errors=23, snp_errors=22),
            Struct(s='Dtr1', errors=18, snp_errors=17),
            Struct(s='Dad1', errors=19, snp_errors=18),
            Struct(s='Mom1', errors=22, snp_errors=21),
            Struct(s='Dad2', errors=0, snp_errors=0),
            Struct(s='Mom2', errors=0, snp_errors=0),
            Struct(s='Son2', errors=0, snp_errors=0),
        })

        # Per-variant error counts for a handful of variants:
        # (contig, position, alleles, expected errors).
        checked_variants = [
            ("1", 1, ['C', 'CT'], 2),
            ("1", 2, ['C', 'T'], 1),
            ("X", 1, ['C', 'T'], 2),
            ("X", 3, ['C', 'T'], 1),
            ("Y", 1, ['C', 'T'], 1),
            ("Y", 3, ['C', 'T'], 1),
        ]
        to_keep = hl.set([
            (hl.Locus(contig, position), alleles)
            for contig, position, alleles, _ in checked_variants
        ])
        kept = var.filter(to_keep.contains((var.locus, var.alleles)))
        variant_errors = (kept.order_by('locus')
                          .select('locus', 'alleles', 'errors')
                          .collect())
        self.assertEqual(variant_errors, [
            Struct(locus=hl.Locus(contig, position), alleles=alleles, errors=errors)
            for contig, position, alleles, errors in checked_variants
        ])

        # A pedigree with missing sex gives the same rows for sample 'Dtr1'.
        ped2 = hl.Pedigree.read(resource('mendelWithMissingSex.fam'))
        men2, _, _, _ = hl.mendel_errors(mt['GT'], ped2)

        dtr1_with_sex = men.filter(men.s == 'Dtr1')
        dtr1_missing_sex = men2.filter(men2.s == 'Dtr1')
        self.assertTrue(dtr1_missing_sex._same(dtr1_with_sex))
예제 #20
0
파일: test_qc.py 프로젝트: tuyanglin/hail
    def test_concordance_n_discordant(self):
        """Check n_discordant and the concordance matrices returned by hl.concordance."""
        dataset = get_dataset()
        _, cols_conc, rows_conc = hl.concordance(dataset, dataset)
        # A dataset compared with itself has no discordant columns.
        n_cols_discordant = cols_conc.aggregate(
            hl.agg.count_where(cols_conc.n_discordant != 0))
        assert n_cols_discordant == 0

        def entry(position, sample, genotype):
            # One (variant, sample) record on contig '1' with alleles A/T.
            return hl.Struct(
                **{
                    'locus': hl.Locus('1', position),
                    'alleles': ['A', 'T'],
                    's': sample,
                    'GT': genotype
                })

        rows1 = [
            entry(100, '1', hl.Call([0, 0])),
            entry(100, '2', hl.Call([0, 0])),
            entry(100, '3', hl.Call([1, 1])),
            entry(100, '4', hl.Call([1, 1])),
            entry(101, '1', hl.Call([1, 1])),
        ]
        rows2 = [
            entry(100, '1', None),
            entry(100, '2', hl.Call([0, 1])),
            entry(100, '3', hl.Call([0, 1])),
            entry(100, '4', hl.Call([1, 1])),
        ]

        def make_mt(rows):
            ht = hl.Table.parallelize(
                rows,
                schema=
                'struct{locus:locus<GRCh37>,alleles:array<str>,s:str,GT:call}')
            return ht.to_matrix_table(row_key=['locus', 'alleles'],
                                      col_key=['s'])

        global_conc_2, cols_conc_2, rows_conc_2 = hl.concordance(
            make_mt(rows1), make_mt(rows2))

        expected_cols = [
            hl.Struct(s='1',
                      concordance=[
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 1, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [1, 0, 0, 0, 0],
                      ],
                      n_discordant=0),
            hl.Struct(s='2',
                      concordance=[
                          [1, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 1, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                      ],
                      n_discordant=1),
            hl.Struct(s='3',
                      concordance=[
                          [1, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 1, 0],
                      ],
                      n_discordant=1),
            hl.Struct(s='4',
                      concordance=[
                          [1, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 1],
                      ],
                      n_discordant=0),
        ]
        assert cols_conc_2.collect() == expected_cols

        expected_global = [
            [3, 0, 0, 0, 0],
            [0, 0, 0, 0, 0],
            [0, 1, 0, 1, 0],
            [0, 0, 0, 0, 0],
            [1, 0, 0, 1, 1],
        ]
        assert global_conc_2 == expected_global

        expected_rows = [
            hl.Struct(locus=hl.Locus('1', 100),
                      alleles=['A', 'T'],
                      concordance=[
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 1, 0, 1, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 1, 1],
                      ],
                      n_discordant=2),
            hl.Struct(locus=hl.Locus('1', 101),
                      alleles=['A', 'T'],
                      concordance=[
                          [3, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [1, 0, 0, 0, 0],
                      ],
                      n_discordant=0),
        ]
        assert rows_conc_2.collect() == expected_rows
예제 #21
0
    def test_locus_windows(self):
        """Check ``hl.linalg.utils.locus_windows`` results and error reporting.

        The first half compares the returned ``(starts, stops)`` index arrays
        against hand-computed values, both with the locus alone and with an
        explicit ``coord_expr``. The second half checks the exception type and
        message fragment raised for each kind of invalid input.
        """
        def assert_eq(a, b):
            # Include both arrays in the failure message; a bare assertTrue
            # would only report "False is not true".
            self.assertTrue(np.array_equal(a, np.array(b)),
                            msg=f'expected {b}, got {a}')

        centimorgans = hl.literal([0.1, 1.0, 1.0, 1.5, 1.9])

        # Five-row matrix table with a per-row `cm` coordinate looked up by
        # row index.
        mt = hl.balding_nichols_model(1, 5, 5).add_row_index()
        mt = mt.annotate_rows(cm=centimorgans[hl.int32(mt.row_idx)]).cache()

        # No coord_expr: windows are derived from the locus itself.
        starts, stops = hl.linalg.utils.locus_windows(mt.locus, 2)
        assert_eq(starts, [0, 0, 0, 1, 2])
        assert_eq(stops, [3, 4, 5, 5, 5])

        # Explicit coordinate taken from a row field.
        starts, stops = hl.linalg.utils.locus_windows(mt.locus,
                                                      0.5,
                                                      coord_expr=mt.cm)
        assert_eq(starts, [0, 1, 1, 1, 3])
        assert_eq(stops, [1, 4, 4, 5, 5])

        # Doubling both the coordinate and the radius must give the same
        # windows as the previous call.
        starts, stops = hl.linalg.utils.locus_windows(
            mt.locus, 1.0, coord_expr=2 * centimorgans[hl.int32(mt.row_idx)])
        assert_eq(starts, [0, 1, 1, 1, 3])
        assert_eq(stops, [1, 4, 4, 5, 5])

        # Table spanning three contigs, including a duplicated locus ('2', 1).
        rows = [
            {'locus': hl.Locus('1', 1), 'cm': 1.0},
            {'locus': hl.Locus('1', 2), 'cm': 3.0},
            {'locus': hl.Locus('1', 4), 'cm': 4.0},
            {'locus': hl.Locus('2', 1), 'cm': 2.0},
            {'locus': hl.Locus('2', 1), 'cm': 2.0},
            {'locus': hl.Locus('3', 3), 'cm': 5.0},
        ]

        ht = hl.Table.parallelize(rows,
                                  hl.tstruct(locus=hl.tlocus('GRCh37'),
                                             cm=hl.tfloat64),
                                  key=['locus'])

        # The expected indices below never reach across a contig boundary,
        # even where coordinates on different contigs are within the radius.
        starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1)
        assert_eq(starts, [0, 0, 2, 3, 3, 5])
        assert_eq(stops, [2, 2, 3, 5, 5, 6])

        starts, stops = hl.linalg.utils.locus_windows(ht.locus,
                                                      1.0,
                                                      coord_expr=ht.cm)
        assert_eq(starts, [0, 1, 1, 3, 3, 5])
        assert_eq(stops, [1, 3, 3, 5, 5, 6])

        # --- invalid inputs -------------------------------------------------
        # `ctx` (not `cm`) is used for the assertRaises context so it cannot be
        # confused with the centimorgan field `cm`.

        # Rows reordered by `cm`, so loci are no longer ascending.
        with self.assertRaises(ValueError) as ctx:
            hl.linalg.utils.locus_windows(ht.order_by(ht.cm).locus, 1.0)
        self.assertIn('ascending order', str(ctx.exception))

        # coord_expr comes from a different table than locus_expr.
        with self.assertRaises(ExpressionException) as ctx:
            hl.linalg.utils.locus_windows(
                ht.locus, 1.0, coord_expr=hl.utils.range_table(1).idx)
        self.assertIn('different source', str(ctx.exception))

        # Expressions that are not fields of any table.
        with self.assertRaises(ExpressionException) as ctx:
            hl.linalg.utils.locus_windows(hl.locus('1', 1), 1.0)
        self.assertIn("no source", str(ctx.exception))

        with self.assertRaises(ExpressionException) as ctx:
            hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=0.0)
        self.assertIn("no source", str(ctx.exception))

        # Global (not row) fields passed as locus_expr and as coord_expr.
        ht = ht.annotate_globals(x=hl.locus('1', 1), y=1.0)
        with self.assertRaises(ExpressionException) as ctx:
            hl.linalg.utils.locus_windows(ht.x, 1.0)
        self.assertIn("row-indexed", str(ctx.exception))
        with self.assertRaises(ExpressionException) as ctx:
            hl.linalg.utils.locus_windows(ht.locus, 1.0, ht.y)
        self.assertIn("row-indexed", str(ctx.exception))

        # Missing locus: reported as a missing 'locus_expr' with or without
        # a coord_expr.
        ht = hl.Table.parallelize([{
            'locus': hl.null(hl.tlocus()),
            'cm': 1.0
        }],
                                  hl.tstruct(locus=hl.tlocus('GRCh37'),
                                             cm=hl.tfloat64),
                                  key=['locus'])
        with self.assertRaises(ValueError) as ctx:
            hl.linalg.utils.locus_windows(ht.locus, 1.0)
        self.assertIn("missing value for 'locus_expr'", str(ctx.exception))
        with self.assertRaises(ValueError) as ctx:
            hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
        self.assertIn("missing value for 'locus_expr'", str(ctx.exception))

        # Missing coordinate value.
        ht = hl.Table.parallelize([{
            'locus': hl.Locus('1', 1),
            'cm': hl.null(hl.tfloat64)
        }],
                                  hl.tstruct(locus=hl.tlocus('GRCh37'),
                                             cm=hl.tfloat64),
                                  key=['locus'])
        with self.assertRaises(ValueError) as ctx:
            hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
        self.assertIn("missing value for 'coord_expr'", str(ctx.exception))