Example #1
    def test_liftover_strand(self):
        grch37 = hl.get_reference('GRCh37')
        grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'),
                            'GRCh38')

        self.assertEqual(
            hl.eval(
                hl.liftover(hl.locus('20', 60001, 'GRCh37'),
                            'GRCh38',
                            include_strand=True)),
            hl.eval(
                hl.struct(result=hl.locus('chr20', 79360, 'GRCh38'),
                          is_negative_strand=False)))

        self.assertEqual(
            hl.eval(
                hl.liftover(hl.locus_interval('20', 37007582, 37007586, True,
                                              True, 'GRCh37'),
                            'GRCh38',
                            include_strand=True)),
            hl.eval(
                hl.struct(result=hl.locus_interval('chr12', 32563117, 32563121,
                                                   True, True, 'GRCh38'),
                          is_negative_strand=True)))

        with self.assertRaises(FatalError):
            hl.eval(
                hl.liftover(
                    hl.parse_locus_interval('1:10000-10000',
                                            reference_genome='GRCh37'),
                    'GRCh38'))

        grch37.remove_liftover("GRCh38")
Example #2
    def test_reference_genome_liftover(self):
        grch37 = hl.get_reference('GRCh37')
        grch38 = hl.get_reference('GRCh38')

        self.assertTrue(not grch37.has_liftover('GRCh38') and not grch38.has_liftover('GRCh37'))
        grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
        grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')
        assert grch37.has_liftover('GRCh38')
        assert grch38.has_liftover('GRCh37')

        ds = hl.import_vcf(resource('sample.vcf'))
        t = ds.annotate_rows(liftover=hl.liftover(hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
        assert t.all(t.locus == t.liftover)

        null_locus = hl.null(hl.tlocus('GRCh38'))

        rows = [
            {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 60001, 'GRCh37'), 'l38': hl.locus('chr20', 79360, 'GRCh38')},
            {'l37': hl.locus('20', 278686, 'GRCh37'), 'l38': hl.locus('chr20', 298045, 'GRCh38')},
            {'l37': hl.locus('20', 278687, 'GRCh37'), 'l38': hl.locus('chr20', 298046, 'GRCh38')},
            {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 278691, 'GRCh37'), 'l38': hl.locus('chr20', 298047, 'GRCh38')},
            {'l37': hl.locus('20', 37007586, 'GRCh37'), 'l38': hl.locus('chr12', 32563117, 'GRCh38')},
            {'l37': hl.locus('20', 62965520, 'GRCh37'), 'l38': hl.locus('chr20', 64334167, 'GRCh38')},
            {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus}
        ]
        schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
        t = hl.Table.parallelize(rows, schema)
        self.assertTrue(t.all(hl.cond(hl.is_defined(t.l38),
                                      hl.liftover(t.l37, 'GRCh38') == t.l38,
                                      hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

        t = t.filter(hl.is_defined(t.l38))
        self.assertTrue(t.count() == 6)

        t = t.key_by('l38')
        t.count()
        self.assertTrue(list(t.key) == ['l38'])

        null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
        rows = [
            {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'), 'i38': null_locus_interval},
            {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
             'i38': hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')}
        ]
        schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)), i38=hl.tinterval(hl.tlocus(grch38)))
        t = hl.Table.parallelize(rows, schema)
        self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))

        grch37.remove_liftover("GRCh38")
        grch38.remove_liftover("GRCh37")
Example #3
    def test_reference_genome_liftover(self):
        grch37 = hl.get_reference('GRCh37')
        grch38 = hl.get_reference('GRCh38')

        self.assertTrue(not grch37.has_liftover('GRCh38') and not grch38.has_liftover('GRCh37'))
        grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
        grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')
        self.assertTrue(grch37.has_liftover('GRCh38') and grch38.has_liftover('GRCh37'))

        ds = hl.import_vcf(resource('sample.vcf'))
        t = ds.annotate_rows(liftover=hl.liftover(hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
        self.assertTrue(t.all(t.locus == t.liftover))

        null_locus = hl.null(hl.tlocus('GRCh38'))

        rows = [
            {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 60001, 'GRCh37'), 'l38': hl.locus('chr20', 79360, 'GRCh38')},
            {'l37': hl.locus('20', 278686, 'GRCh37'), 'l38': hl.locus('chr20', 298045, 'GRCh38')},
            {'l37': hl.locus('20', 278687, 'GRCh37'), 'l38': hl.locus('chr20', 298046, 'GRCh38')},
            {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
            {'l37': hl.locus('20', 278691, 'GRCh37'), 'l38': hl.locus('chr20', 298047, 'GRCh38')},
            {'l37': hl.locus('20', 37007586, 'GRCh37'), 'l38': hl.locus('chr12', 32563117, 'GRCh38')},
            {'l37': hl.locus('20', 62965520, 'GRCh37'), 'l38': hl.locus('chr20', 64334167, 'GRCh38')},
            {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus}
        ]
        schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
        t = hl.Table.parallelize(rows, schema)
        self.assertTrue(t.all(hl.cond(hl.is_defined(t.l38),
                                      hl.liftover(t.l37, 'GRCh38') == t.l38,
                                      hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

        t = t.filter(hl.is_defined(t.l38))
        self.assertTrue(t.count() == 6)

        t = t.key_by('l38')
        t.count()
        self.assertTrue(list(t.key) == ['l38'])

        null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
        rows = [
            {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'), 'i38': null_locus_interval},
            {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
             'i38': hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')}
        ]
        schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)), i38=hl.tinterval(hl.tlocus(grch38)))
        t = hl.Table.parallelize(rows, schema)
        self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))

        grch37.remove_liftover("GRCh38")
        grch38.remove_liftover("GRCh37")
Example #4
    def test_liftover_strand(self):
        grch37 = hl.get_reference('GRCh37')
        grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')

        self.assertEqual(hl.eval(hl.liftover(hl.locus('20', 60001, 'GRCh37'), 'GRCh38', include_strand=True)),
                         hl.eval(hl.struct(result=hl.locus('chr20', 79360, 'GRCh38'), is_negative_strand=False)))

        self.assertEqual(hl.eval(hl.liftover(hl.locus_interval('20', 37007582, 37007586, True, True, 'GRCh37'),
                                             'GRCh38', include_strand=True)),
                         hl.eval(hl.struct(result=hl.locus_interval('chr12', 32563117, 32563121, True, True, 'GRCh38'),
                                           is_negative_strand=True)))

        grch37.remove_liftover("GRCh38")
Example #5
def densify_sites(
    mt: hl.MatrixTable,
    sites_ht: hl.Table,
    last_END_positions_ht: hl.Table,
    semi_join_rows: bool = True,
) -> hl.MatrixTable:
    """
    Creates a dense version of the input sparse MT at the sites in `sites_ht`, reading the minimal amount of data required.

    Note that only rows that appear both in `mt` and `sites_ht` are returned.

    :param mt: Input sparse MT
    :param sites_ht: Desired sites to densify
    :param last_END_positions_ht: Table storing positions of the furthest ref block (END tag)
    :param semi_join_rows: Whether to filter the MT rows based on semi-join (default, better if sites_ht is large) or based on filter_intervals (better if sites_ht only contains a few sites)
    :return: Dense MT filtered to the sites in `sites_ht`
    """
    logger.info("Computing intervals to densify from sites Table.")
    sites_ht = sites_ht.key_by("locus")
    sites_ht = sites_ht.annotate(
        interval=hl.locus_interval(
            sites_ht.locus.contig,
            last_END_positions_ht[sites_ht.key].last_END_position,
            end=sites_ht.locus.position,
            includes_end=True,
            reference_genome=sites_ht.locus.dtype.reference_genome,
        )
    )
    sites_ht = sites_ht.filter(hl.is_defined(sites_ht.interval))

    if semi_join_rows:
        mt = mt.filter_rows(hl.is_defined(sites_ht.key_by("interval")[mt.locus]))
    else:
        logger.info("Collecting intervals to densify.")
        intervals = sites_ht.interval.collect()

        print(
            "Found {0} intervals, totalling {1} bp in the dense Matrix.".format(
                len(intervals),
                sum(
                    [
                        interval_length(interval)
                        for interval in union_intervals(intervals)
                    ]
                ),
            )
        )

        mt = hl.filter_intervals(mt, intervals)

    mt = hl.experimental.densify(mt)

    return mt.filter_rows(hl.is_defined(sites_ht[mt.locus]))
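A minimal usage sketch follows; all paths below are hypothetical, and it assumes a Table of last END positions keyed like `sites_ht` has already been computed.

# Hypothetical usage sketch for densify_sites(); the paths are illustrative only.
import hail as hl

mt = hl.read_matrix_table('gs://my-bucket/sparse.mt')                 # sparse (VCF-combiner style) MT
sites_ht = hl.read_table('gs://my-bucket/sites.ht')                   # sites to densify, keyed by locus
last_end_ht = hl.read_table('gs://my-bucket/last_END_positions.ht')   # precomputed last END positions
dense_mt = densify_sites(mt, sites_ht, last_end_ht, semi_join_rows=True)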
Example #6
    def test_import_locus_intervals(self):
        interval_file = resource('annotinterall.interval_list')
        t = hl.import_locus_intervals(interval_file, reference_genome='GRCh37')
        nint = t.count()

        i = 0
        with open(interval_file) as f:
            for line in f:
                if len(line.strip()) != 0:
                    i += 1
        self.assertEqual(nint, i)
        self.assertEqual(t.interval.dtype.point_type, hl.tlocus('GRCh37'))

        tmp_file = new_temp_file(prefix="test", suffix="interval_list")
        start = t.interval.start
        end = t.interval.end
        (t
         .key_by(interval=hl.locus_interval(start.contig, start.position, end.position, True, True))
         .select()
         .export(tmp_file, header=False))

        t2 = hl.import_locus_intervals(tmp_file)

        self.assertTrue(t.select()._same(t2))
Example #7
    def test_import_locus_intervals(self):
        interval_file = resource('annotinterall.interval_list')
        t = hl.import_locus_intervals(interval_file, reference_genome='GRCh37')
        nint = t.count()

        i = 0
        with open(interval_file) as f:
            for line in f:
                if len(line.strip()) != 0:
                    i += 1
        self.assertEqual(nint, i)
        self.assertEqual(t.interval.dtype.point_type, hl.tlocus('GRCh37'))

        tmp_file = new_temp_file(prefix="test", suffix="interval_list")
        start = t.interval.start
        end = t.interval.end
        (t
         .key_by(interval=hl.locus_interval(start.contig, start.position, end.position, True, True))
         .select()
         .export(tmp_file, header=False))

        t2 = hl.import_locus_intervals(tmp_file)

        self.assertTrue(t.select()._same(t2))
Example #8
def test_segment_intervals():
    vds = hl.vds.read_vds(
        os.path.join(resource('vds'), '1kg_chr22_5_samples.vds'))

    contig_len = vds.reference_data.locus.dtype.reference_genome.lengths[
        'chr22']
    breakpoints = hl.literal([*range(1, contig_len, 5_000_000), contig_len])
    intervals = hl.range(hl.len(breakpoints) - 1) \
        .map(lambda i: hl.struct(
        interval=hl.locus_interval('chr22', breakpoints[i], breakpoints[i + 1], reference_genome='GRCh38')))
    intervals_ht = hl.Table.parallelize(intervals, key='interval')

    path = new_temp_file()
    r = hl.vds.segment_reference_blocks(vds.reference_data, intervals_ht)
    r.write(path)
    after = hl.read_matrix_table(path)

    es = after.entries()
    es = es.filter((es.END < es.locus.position)
                   | (es.END >= es.interval.end.position))
    if es.count() > 0:
        es.show(width=1000)
        assert False, "found entries with END < position or END >= interval end"

    before = vds.reference_data

    sum_per_sample_before = before.select_cols(
        ref_block_bases=hl.agg.sum(before.END + 1 -
                                   before.locus.position)).cols()
    sum_per_sample_after = after.select_cols(
        ref_block_bases=hl.agg.sum(after.END + 1 -
                                   after.locus.position)).cols()

    before_coverage = sum_per_sample_before.collect()
    after_coverage = sum_per_sample_after.collect()
    assert before_coverage == after_coverage
Example #9
def liftover_intervals(t: hl.Table,
                       keep_missing_interval: bool = False) -> hl.Table:
    """
    Lift over locus intervals from one coordinate system (GRCh37) to another (GRCh38).

    # Example input table description
    #
    # ----------------------------------------
    # Global fields:
    #     None
    # ----------------------------------------
    # Row fields:
    #     'interval': interval<locus<GRCh37>>
    # ----------------------------------------
    # Key: ['interval']
    # ----------------------------------------


    :param t: Table of intervals on GRCh37
    :param keep_missing_interval: If True, keep missing (non-lifted) intervals in the output Table.
    :return: Table with intervals lifted over to GRCh38; the original GRCh37 interval is kept as `interval_hg37`.
    """

    rg37 = hl.get_reference("GRCh37")
    rg38 = hl.get_reference("GRCh38")

    if not rg37.has_liftover("GRCh38"):
        rg37.add_liftover(
            f'{nfs_dir}/resources/liftover/grch37_to_grch38.over.chain.gz',
            rg38)

    t = t.annotate(
        start=hl.liftover(t.interval.start, "GRCh38"),
        end=hl.liftover(t.interval.end, "GRCh38"),
    )

    t = t.filter((t.start.contig == "chr" + t.interval.start.contig)
                 & (t.end.contig == "chr" + t.interval.end.contig))

    t = t.key_by()

    t = (t.select(interval=hl.locus_interval(t.start.contig,
                                             t.start.position,
                                             t.end.position,
                                             reference_genome=rg38,
                                             invalid_missing=True),
                  interval_hg37=t.interval))

    # bad intervals
    missing = t.aggregate(hl.agg.counter(~hl.is_defined(t.interval)))
    logger.info(
        f"Number of missing intervals: {missing.get(True, 0)} out of {t.count()}...")

    # update globals annotations
    global_ann_expr = {
        'date': current_date(),
        'reference_genome': 'GRCh38',
        'was_lifted': True
    }
    t = t.annotate_globals(**global_ann_expr)

    if not keep_missing_interval:
        logger.info(f"Filtering out {missing[True]} missing intervals...")
        t = t.filter(hl.is_defined(t.interval), keep=True)

    return t.key_by("interval")
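A hedged usage sketch: the input path is illustrative, and the module-level `nfs_dir`, `logger`, and `current_date` names used above are assumed to be defined elsewhere.

# Hypothetical usage sketch for liftover_intervals(); the input file is illustrative only.
ht = hl.import_locus_intervals('data/regions.interval_list', reference_genome='GRCh37')
lifted = liftover_intervals(ht, keep_missing_interval=False)
# Result is keyed by the lifted 'interval' (GRCh38); the original interval is kept as 'interval_hg37'.
lifted.describe()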
Example #10
def import_gtf(path,
               reference_genome=None,
               skip_invalid_contigs=False,
               min_partitions=None) -> hl.Table:
    """Import a GTF file.

       The GTF file format is identical to the GFF version 2 file format,
       and so this function can be used to import GFF version 2 files as
       well.

       See https://www.ensembl.org/info/website/upload/gff.html for more
       details on the GTF/GFF2 file format.

       The :class:`.Table` returned by this function will be keyed by the
       ``interval`` row field and will include the following row fields:

       .. code-block:: text

           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'interval': interval<>

       There will also be corresponding fields for every tag found in the
       attribute field of the GTF file.

       Note
       ----

       This function will return an ``interval`` field of type :class:`.tinterval`
       constructed from the ``seqname``, ``start``, and ``end`` fields in the
       GTF file. This interval is inclusive of both the start and end positions
       in the GTF file. 

       If the ``reference_genome`` parameter is specified, the start and end
       points of the ``interval`` field will be of type :class:`.tlocus`.
       Otherwise, the start and end points of the ``interval`` field will be of
       type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
       ``position`` (type :class:`.tint32`).

       Furthermore, if the ``reference_genome`` parameter is specified and
       ``skip_invalid_contigs`` is ``True``, this import function will skip
       lines in the GTF where ``seqname`` is not consistent with the reference
       genome specified.

       Example
       -------

       >>> ht = hl.experimental.import_gtf('data/test.gtf', 
       ...                                 reference_genome='GRCh37',
       ...                                 skip_invalid_contigs=True)

       >>> ht.describe()  # doctest: +SKIP_OUTPUT_CHECK
       ----------------------------------------
       Global fields:
       None
       ----------------------------------------
       Row fields:
           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'gene_type': str
           'exon_id': str
           'havana_transcript': str
           'level': str
           'transcript_name': str
           'gene_status': str
           'gene_id': str
           'transcript_type': str
           'tag': str
           'transcript_status': str
           'gene_name': str
           'transcript_id': str
           'exon_number': str
           'havana_gene': str
           'interval': interval<locus<GRCh37>>
       ----------------------------------------
       Key: ['interval']
       ----------------------------------------

       Parameters
       ----------

       path : :obj:`str`
           File to import.
       reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
           Reference genome to use.
       skip_invalid_contigs : :obj:`bool`
           If ``True`` and `reference_genome` is not ``None``, skip lines where
           ``seqname`` is not consistent with the reference genome.
       min_partitions : :obj:`int` or :obj:`None`
           Minimum number of partitions (passed to import_table).

       Returns
       -------
       :class:`.Table`
       """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={
                             'f3': hl.tint,
                             'f4': hl.tint,
                             'f5': hl.tfloat,
                             'f7': hl.tint
                         },
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({
        'f0': 'seqname',
        'f1': 'source',
        'f2': 'feature',
        'f3': 'start',
        'f4': 'end',
        'f5': 'score',
        'f6': 'strand',
        'f7': 'frame',
        'f8': 'attribute'
    })

    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    attributes = ht.aggregate(
        hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                       ht['attribute'].keys()))

    ht = ht.transmute(
        **{
            x: hl.or_missing(ht['attribute'].contains(x), ht['attribute'][x])
            for x in attributes if x
        })

    if reference_genome:
        if reference_genome == 'GRCh37':
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            ht = ht.annotate(seqname=hl.case()
                                       .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                                       .when(ht['seqname'].startswith('chrHLA'), ht['seqname'].replace('^chr', ''))
                                       .when(ht['seqname'].startswith('chr'), ht['seqname'])
                                       .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(
                set(hl.get_reference(reference_genome).contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(
            interval=hl.locus_interval(ht['seqname'],
                                       ht['start'],
                                       ht['end'],
                                       includes_start=True,
                                       includes_end=True,
                                       reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(
            hl.struct(seqname=ht['seqname'], position=ht['start']),
            hl.struct(seqname=ht['seqname'], position=ht['end']),
            includes_start=True,
            includes_end=True))

    ht = ht.key_by('interval')

    return ht
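Because the returned Table is keyed by an interval, it can be joined against a locus-keyed dataset directly. A small hypothetical sketch: the MatrixTable path is illustrative, and the `gene_name` field is only present if the GTF carries that attribute.

# Hypothetical follow-up: annotate variants with the overlapping GTF record by interval join.
gtf_ht = hl.experimental.import_gtf('data/test.gtf',
                                    reference_genome='GRCh37',
                                    skip_invalid_contigs=True)
mt = hl.read_matrix_table('data/my_grch37_variants.mt')  # assumed GRCh37 dataset
mt = mt.annotate_rows(gene_name=gtf_ht[mt.locus].gene_name)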
Example #11
def import_gtf(path,
               reference_genome=None,
               skip_invalid_contigs=False,
               min_partitions=None,
               force_bgz=False,
               force=False) -> hl.Table:
    """Import a GTF file.

       The GTF file format is identical to the GFF version 2 file format,
       and so this function can be used to import GFF version 2 files as
       well.

       See https://www.ensembl.org/info/website/upload/gff.html for more
       details on the GTF/GFF2 file format.

       The :class:`.Table` returned by this function will be keyed by the
       ``interval`` row field and will include the following row fields:

       .. code-block:: text

           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'interval': interval<>

       There will also be corresponding fields for every tag found in the
       attribute field of the GTF file.

       Note
       ----

       This function will return an ``interval`` field of type :class:`.tinterval`
       constructed from the ``seqname``, ``start``, and ``end`` fields in the
       GTF file. This interval is inclusive of both the start and end positions
       in the GTF file.

       If the ``reference_genome`` parameter is specified, the start and end
       points of the ``interval`` field will be of type :class:`.tlocus`.
       Otherwise, the start and end points of the ``interval`` field will be of
       type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
       ``position`` (type :obj:`.tint32`).

       Furthermore, if the ``reference_genome`` parameter is specified and
       ``skip_invalid_contigs`` is ``True``, this import function will skip
       lines in the GTF where ``seqname`` is not consistent with the reference
       genome specified.

       Example
       -------

       >>> ht = hl.experimental.import_gtf('data/test.gtf',
       ...                                 reference_genome='GRCh37',
       ...                                 skip_invalid_contigs=True)

       >>> ht.describe()  # doctest: +SKIP_OUTPUT_CHECK
       ----------------------------------------
       Global fields:
       None
       ----------------------------------------
       Row fields:
           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'gene_type': str
           'exon_id': str
           'havana_transcript': str
           'level': str
           'transcript_name': str
           'gene_status': str
           'gene_id': str
           'transcript_type': str
           'tag': str
           'transcript_status': str
           'gene_name': str
           'transcript_id': str
           'exon_number': str
           'havana_gene': str
           'interval': interval<locus<GRCh37>>
       ----------------------------------------
       Key: ['interval']
       ----------------------------------------

       Parameters
       ----------

       path : :class:`str`
           File to import.
       reference_genome : :class:`str` or :class:`.ReferenceGenome`, optional
           Reference genome to use.
       skip_invalid_contigs : :obj:`bool`
           If ``True`` and `reference_genome` is not ``None``, skip lines where
           ``seqname`` is not consistent with the reference genome.
       min_partitions : :obj:`int` or :obj:`None`
           Minimum number of partitions (passed to import_table).
       force_bgz : :obj:`bool`
           If ``True``, load files as blocked gzip files, assuming
           that they were actually compressed using the BGZ codec. This option is
           useful when the file extension is not ``'.bgz'``, but the file is
           blocked gzip, so that the file can be read in parallel and not on a
           single node.
       force : :obj:`bool`
           If ``True``, load gzipped files serially on one core. This should
           be used only when absolutely necessary, as processing time will be
           increased due to lack of parallelism.

       Returns
       -------
       :class:`.Table`
       """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={
                             'f3': hl.tint,
                             'f4': hl.tint,
                             'f5': hl.tfloat,
                             'f7': hl.tint
                         },
                         missing='.',
                         delimiter='\t',
                         force_bgz=force_bgz,
                         force=force)

    ht = ht.rename({
        'f0': 'seqname',
        'f1': 'source',
        'f2': 'feature',
        'f3': 'start',
        'f4': 'end',
        'f5': 'score',
        'f6': 'strand',
        'f7': 'frame',
        'f8': 'attribute'
    })

    def parse_attributes(unparsed_attributes):
        def parse_attribute(attribute):
            key_and_value = attribute.split(' ')
            key = key_and_value[0]
            value = key_and_value[1]
            return (key, value.replace('"|;\\$', ''))

        return hl.dict(unparsed_attributes.split('; ').map(parse_attribute))

    ht = ht.annotate(attribute=parse_attributes(ht['attribute']))

    ht = ht.checkpoint(new_temp_file())

    attributes = ht.aggregate(
        hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                       ht['attribute'].keys()))

    ht = ht.transmute(
        **{
            x: hl.or_missing(ht['attribute'].contains(x), ht['attribute'][x])
            for x in attributes if x
        })

    if reference_genome:
        if reference_genome.name == 'GRCh37':
            ht = ht.annotate(seqname=hl.case()
                                       .when((ht['seqname'] == 'M') | (ht['seqname'] == 'chrM'), 'MT')
                                       .when(ht['seqname'].startswith('chr'), ht['seqname'].replace('^chr', ''))
                                       .default(ht['seqname']))
        else:
            ht = ht.annotate(seqname=hl.case()
                                       .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                                       .when(ht['seqname'].startswith('chrHLA'), ht['seqname'].replace('^chr', ''))
                                       .when(ht['seqname'].startswith('chr'), ht['seqname'])
                                       .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(reference_genome.contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(
            interval=hl.locus_interval(ht['seqname'],
                                       ht['start'],
                                       ht['end'],
                                       includes_start=True,
                                       includes_end=True,
                                       reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(
            hl.struct(seqname=ht['seqname'], position=ht['start']),
            hl.struct(seqname=ht['seqname'], position=ht['end']),
            includes_start=True,
            includes_end=True))

    ht = ht.key_by('interval')

    return ht
Example #12
raw_data_root = 'gs://hail-datasets-raw-data/Ensembl'
hail_data_root = 'gs://hail-datasets-hail-data'

parser = argparse.ArgumentParser()
parser.add_argument('-v', required=True, help='Dataset version.')
parser.add_argument('-b', required=True, choices=['GRCh37', 'GRCh38'], help='Ensembl reference genome build.')
args = parser.parse_args()

name = 'Ensembl_homo_sapiens_low_complexity_regions'
version = args.v
build = args.b

ht = hl.import_table(f'{raw_data_root}/Ensembl_homo_sapiens_low_complexity_regions_release{version}_{build}.tsv.bgz')

if build == 'GRCh37':
    ht = ht.annotate(interval=hl.locus_interval(ht['chromosome'], hl.int(ht['start']), hl.int(ht['end']), reference_genome='GRCh37'))
else:
    ht = ht.annotate(interval=hl.locus_interval('chr' + ht['chromosome'].replace('MT', 'M'), hl.int(ht['start']), hl.int(ht['end']), reference_genome='GRCh38'))

ht = ht.key_by('interval')
ht = ht.select()

n_rows = ht.count()
n_partitions = ht.n_partitions()

ht = ht.annotate_globals(metadata=hl.struct(name=name,
                                            version=f'release_{version}',
                                            reference_genome=build,
                                            n_rows=n_rows,
                                            n_partitions=n_partitions))
Example #13
    ht_genes = ht_genes.rename({'interval': 'gene_interval'})
    ht_genes = ht_genes.distinct()

    mt = hl.import_matrix_table(
        EXTRACT_BUCKET + 'GTEx/v7/GTEx_junction_read_counts.v7.GRCh37.tsv.bgz',
        row_fields={
            'junction_id': hl.tstr,
            'Description': hl.tstr
        },
        missing=' ',
        entry_type=hl.tfloat)
    mt = mt.transmute_rows(chr_start_end=mt['junction_id'].split('_'))
    mt = mt.transmute_rows(
        junction_interval=hl.locus_interval(mt['chr_start_end'][0],
                                            hl.int(mt['chr_start_end'][1]),
                                            hl.int(mt['chr_start_end'][2]),
                                            includes_start=True,
                                            includes_end=True,
                                            reference_genome='GRCh37'))
    mt = mt.key_rows_by(mt['junction_interval'])
    mt = mt.transmute_entries(read_count=hl.int(mt['x']))
    mt = mt.rename({'Description': 'gene_id', 'col_id': 'sample_id'})
    mt = mt.annotate_cols(**ht_sample_attributes[mt.sample_id])

    if reference_genome == 'GRCh38':
        b37 = hl.get_reference('GRCh37')
        b37.add_liftover(
            'gs://hail-common/references/grch37_to_grch38.over.chain.gz',
            'GRCh38')
        mt = mt.key_rows_by()
        mt = mt.annotate_rows(
            junction_interval=hl.liftover(mt['junction_interval'], 'GRCh38'))
Example #14
def main(args):
    if args.create_gene_sample_mt:
        mt = hl.read_matrix_table(
            'gs://gnomad/projects/compound_hets/myoseq/MacArthur_LGMD_Callset_Jan2019.mt'
        )
        meta = hl.read_table(
            'gs://gnomad/projects/compound_hets/myoseq/sample_qc/MacArthur_LGMD_Callset_Jan2019.full_meta.ht'
        )
        pop_distance = hl.read_table(
            'gs://gnomad-lfran/compound_hets/myoseq/sample_qc/myoseq_pop_distance_to_max_kde.ht'
        )
        variant_annotations_ht = hl.read_table(
            'gs://gnomad/projects/compound_hets/myoseq/MacArthur_LGMD_Callset_Jan2019.annotations.ht'
        )
        variant_annotations_ht.drop('was_split', 'a_index')
        mt = mt.annotate_cols(
            **meta[mt.col_key],
            **pop_distance[mt.col_key],
        )
        mt = mt.annotate_rows(**variant_annotations_ht[mt.row_key])

        # Filter samples failing QC
        mt = mt.filter_cols(
            (hl.len(mt.sample_filters) == 0)
            & (mt.distance < args.pop_distance)  # NFE pop-distance away from densest point in KDE in pc-space (selects only NFEs)
        )
        counts = mt.aggregate_cols(hl.agg.counter(mt.is_case))
        print(
            f'Found {counts[True]} cases and {counts[False]} controls for gene aggregation.'
        )

        # Filter sites failing QC, without any tx_annotation (i.e. without a protein-coding variant) or too common
        mt = mt.filter_rows(
            (hl.len(mt.filters) == 0) & hl.is_defined(mt.tx_annotation)
            & (hl.or_else(mt.gnomad_exomes_popmax.AF,
                          hl.or_else(mt.gnomad_genomes_popmax.AF, 0.0)) <
               args.max_gnomad_af))

        # Keep non-ref entries only
        entries_filter_expr = mt.GT.is_non_ref()
        if not args.raw:
            entries_filter_expr = mt.GT.is_non_ref() & get_adj_expr(
                mt.GT, mt.GQ, mt.DP, mt.AD, haploid_adj_dp=5)
        mt = mt.filter_entries(entries_filter_expr)

        # Annotate rows with the set of genes (symbol, Ensembl ID) from tx_annotation
        mt = mt.annotate_rows(gene=hl.set(
            mt.tx_annotation.map(
                lambda x: hl.struct(gene_symbol=x.symbol, gene_id=x.ensg))))

        # Aggregate by gene
        mt = mt.explode_rows(mt.gene)
        mt = mt.annotate_rows(tx_annotation=mt.tx_annotation.filter(lambda x: (
            x.symbol == mt.gene.gene_symbol) & (x.ensg == mt.gene.gene_id)))
        # mt.write('gs://gnomad/projects/compound_hets/myoseq/MacArthur_LGMD_Callset_Jan2019_filtered_gene_exploded.mt', overwrite=True)

        # TODO: Add pext to missense counts

        # mt = hl.read_matrix_table('gs://gnomad/projects/compound_hets/myoseq/MacArthur_LGMD_Callset_Jan2019_filtered_gene_exploded.mt')
        mt = mt.group_rows_by(**mt.gene).aggregate(
            locus_interval=hl.locus_interval(hl.agg.take(mt.locus,
                                                         1)[0].contig,
                                             hl.agg.min(mt.locus.position),
                                             hl.agg.max(mt.locus.position),
                                             includes_end=True),
            n_het_lof=hl.agg.count_where(
                mt.GT.is_het()
                & mt.tx_annotation.any(lambda x: x.lof == 'HC')),
            n_hom_lof=hl.agg.count_where(
                mt.GT.is_hom_var()
                & mt.tx_annotation.any(lambda x: x.lof == 'HC')),
            n_het_lof_pext=hl.agg.count_where(
                mt.GT.is_het()
                & mt.tx_annotation.any(lambda x: (x.lof == 'HC') &
                                       (x.Muscle_Skeletal >= args.pext_cutoff))),
            n_hom_lof_pext=hl.agg.count_where(
                mt.GT.is_hom_var()
                & mt.tx_annotation.any(lambda x: (x.lof == 'HC') &
                                       (x.Muscle_Skeletal >= args.pext_cutoff))),
            n_het_missense=hl.agg.count_where(
                mt.GT.is_het()
                & mt.tx_annotation.any(lambda x: x.csq == 'missense_variant')),
            n_hom_missense=hl.agg.count_where(
                mt.GT.is_hom_var()
                & mt.tx_annotation.any(lambda x: x.csq == 'missense_variant')),
            n_het_damaging_missense=hl.agg.count_where(
                mt.GT.is_het() & mt.tx_annotation.any(
                    lambda x: (x.polyphen_prediction == 'probably damaging') |
                    (x.sift_prediction == 'deleterious'))),
            n_hom_damaging_missense=hl.agg.count_where(
                mt.GT.is_hom_var() & mt.tx_annotation.any(
                    lambda x: (x.polyphen_prediction == 'probably damaging') |
                    (x.sift_prediction == 'deleterious'))),
            n_het_synonymous=hl.agg.count_where(
                mt.GT.is_het()
                & mt.tx_annotation.any(lambda x: x.csq == 'synonymous_variant')),
            n_hom_synonymous=hl.agg.count_where(
                mt.GT.is_hom_var()
                & mt.tx_annotation.any(lambda x: x.csq == 'synonymous_variant'))
        ).write(
            'gs://gnomad/projects/compound_hets/myoseq/MacArthur_LGMD_Callset_Jan2019_gene_burden.mt',
            overwrite=args.overwrite)

    if args.run_burden_tests:
        mt = hl.read_matrix_table(
            'gs://gnomad/projects/compound_hets/myoseq/MacArthur_LGMD_Callset_Jan2019_gene_burden.mt'
        )

        def fet_expr(het_count_exp: hl.expr.Int64Expression,
                     hom_count_expr: hl.expr.Int64Expression):
            return hl.bind(
                lambda x: hl.struct(
                    counts=x,
                    dominant=hl.fisher_exact_test(x[0][0], x[0][1] + x[0][2],
                                                  x[1][0], x[1][1] + x[1][2]),
                    recessive=hl.fisher_exact_test(x[0][0] + x[0][1], x[0][2],
                                                   x[1][0] + x[1][1], x[1][2])),
                hl.bind(
                    lambda x: [
                        [
                            hl.int32(
                                hl.cond(x.contains(False), x[False].get(0, 0),
                                        0)),
                            hl.int32(
                                hl.cond(x.contains(False), x[False].get(1, 0),
                                        0)),
                            hl.int32(
                                hl.cond(x.contains(False), x[False].get(2, 0),
                                        0))
                        ],
                        [
                            hl.int32(
                                hl.cond(x.contains(True), x[True].get(0, 0), 0)
                            ),
                            hl.int32(
                                hl.cond(x.contains(True), x[True].get(1, 0), 0)
                            ),
                            hl.int32(
                                hl.cond(x.contains(True), x[True].get(2, 0), 0)
                            )
                        ],
                    ],
                    hl.agg.group_by(
                        mt.is_case,
                        hl.agg.counter(
                            hl.min(2, het_count_exp + 2 * hom_count_expr)))))

        mt = mt.annotate_rows(
            **{
                'lof':
                fet_expr(mt.n_het_lof, mt.n_hom_lof),
                'lof_pext':
                fet_expr(mt.n_het_lof_pext, mt.n_hom_lof_pext),
                'lof_missense':
                fet_expr(mt.n_het_lof + mt.n_het_missense, mt.n_het_lof +
                         mt.n_hom_missense),
                'lof_damaging_missense':
                fet_expr(mt.n_het_lof +
                         mt.n_het_damaging_missense, mt.n_het_lof +
                         mt.n_hom_damaging_missense),
                'synonymous':
                fet_expr(mt.n_het_synonymous, mt.n_hom_synonymous)
            })

        mt.write(
            'gs://gnomad/projects/compound_hets/myoseq/MacArthur_LGMD_Callset_Jan2019_gene_burden_tests.mt',
            overwrite=args.overwrite)
Example #15
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False, min_partitions=None) -> hl.Table:
    """Import a GTF file.

       The GTF file format is identical to the GFF version 2 file format,
       and so this function can be used to import GFF version 2 files as
       well.

       See https://www.ensembl.org/info/website/upload/gff.html for more
       details on the GTF/GFF2 file format.

       The :class:`.Table` returned by this function will be keyed by the
       ``interval`` row field and will include the following row fields:

       .. code-block:: text

           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'interval': interval<>

       There will also be corresponding fields for every tag found in the
       attribute field of the GTF file.

       Note
       ----

       This function will return an ``interval`` field of type :class:`.tinterval`
       constructed from the ``seqname``, ``start``, and ``end`` fields in the
       GTF file. This interval is inclusive of both the start and end positions
       in the GTF file. 

       If the ``reference_genome`` parameter is specified, the start and end
       points of the ``interval`` field will be of type :class:`.tlocus`.
       Otherwise, the start and end points of the ``interval`` field will be of
       type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
       ``position`` (type :class:`.tint32`).

       Furthermore, if the ``reference_genome`` parameter is specified and
       ``skip_invalid_contigs`` is ``True``, this import function will skip
       lines in the GTF where ``seqname`` is not consistent with the reference
       genome specified.

       Example
       -------

       >>> ht = hl.experimental.import_gtf('data/test.gtf', 
       ...                                 reference_genome='GRCh37',
       ...                                 skip_invalid_contigs=True)

       >>> ht.describe()  # doctest: +NOTEST
       ----------------------------------------
       Global fields:
       None
       ----------------------------------------
       Row fields:
           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'gene_type': str
           'exon_id': str
           'havana_transcript': str
           'level': str
           'transcript_name': str
           'gene_status': str
           'gene_id': str
           'transcript_type': str
           'tag': str
           'transcript_status': str
           'gene_name': str
           'transcript_id': str
           'exon_number': str
           'havana_gene': str
           'interval': interval<locus<GRCh37>>
       ----------------------------------------
       Key: ['interval']
       ----------------------------------------

       Parameters
       ----------

       path : :obj:`str`
           File to import.
       reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
           Reference genome to use.
       skip_invalid_contigs : :obj:`bool`
           If ``True`` and `reference_genome` is not ``None``, skip lines where
           ``seqname`` is not consistent with the reference genome.
       min_partitions : :obj:`int` or :obj:`None`
           Minimum number of partitions (passed to import_table).

       Returns
       -------
       :class:`.Table`
       """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint,
                                'f4': hl.tint,
                                'f5': hl.tfloat,
                                'f7': hl.tint},
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({'f0': 'seqname',
                    'f1': 'source',
                    'f2': 'feature',
                    'f3': 'start',
                    'f4': 'end',
                    'f5': 'score',
                    'f6': 'strand',
                    'f7': 'frame',
                    'f8': 'attribute'})

    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    attributes = ht.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x), ht['attribute'].keys()))

    ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x),
                                          ht['attribute'][x])
                         for x in attributes if x})

    if reference_genome:
        if reference_genome == 'GRCh37':
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            ht = ht.annotate(seqname=hl.case()
                                       .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                                       .when(ht['seqname'].startswith('chrHLA'), ht['seqname'].replace('^chr', ''))
                                       .when(ht['seqname'].startswith('chr'), ht['seqname'])
                                       .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(hl.get_reference(reference_genome).contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(interval=hl.locus_interval(ht['seqname'],
                                                     ht['start'],
                                                     ht['end'],
                                                     includes_start=True,
                                                     includes_end=True,
                                                     reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(hl.struct(seqname=ht['seqname'], position=ht['start']),
                                               hl.struct(seqname=ht['seqname'], position=ht['end']),
                                               includes_start=True,
                                               includes_end=True))

    ht = ht.key_by('interval')

    return ht
Example #16
            ';$', '')), ht_genes['attribute'].split('; '))))

attributes = ht_genes.aggregate(
    hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                   ht_genes['attribute'].keys()))

ht_genes = ht_genes.transmute(
    **{
        x: hl.or_missing(ht_genes['attribute'].contains(x),
                         ht_genes['attribute'][x])
        for x in attributes if x
    })

ht_genes = ht_genes.annotate(
    gene_interval=hl.locus_interval(ht_genes['seqname'],
                                    ht_genes['start'],
                                    ht_genes['end'] + 1,
                                    reference_genome='GRCh37'))
ht_genes = ht_genes.filter(ht_genes['feature'] == 'gene')
ht_genes = ht_genes.key_by('gene_id')
ht_genes = ht_genes.select('gene_interval', 'source', 'gene_name',
                           'havana_gene', 'gene_type', 'gene_status', 'level',
                           'score', 'strand', 'frame', 'tag')
ht_genes = ht_genes.rename({
    'gene_name': 'gene_symbol',
    'havana_gene': 'havana_gene_id'
})
ht_genes.write('hdfs:///tmp/genes.ht', overwrite=True)
ht_genes = hl.read_table('hdfs:///tmp/genes.ht')

# gene read counts
name = 'GTEx_RNA_seq_gene_read_counts'