Example #1
def set_female_y_metrics_to_na_expr(
        t: Union[hl.Table, hl.MatrixTable]) -> hl.expr.ArrayExpression:
    """
    Set Y-variant frequency callstats for female-specific metrics to missing structs.

    .. note:: Requires freq, freq_meta, and freq_index_dict annotations to be present in the Table or MatrixTable.

    :param t: Table or MatrixTable for which to adjust female metrics
    :return: Hail array expression to set female Y-variant metrics to missing values
    """
    female_idx = hl.map(
        lambda x: t.freq_index_dict[x],
        hl.filter(lambda x: x.contains("XX"), t.freq_index_dict.keys()),
    )
    freq_idx_range = hl.range(hl.len(t.freq_meta))

    new_freq_expr = hl.if_else(
        (t.locus.in_y_nonpar() | t.locus.in_y_par()),
        hl.map(
            lambda x: hl.if_else(female_idx.contains(x),
                                 missing_callstats_expr(), t.freq[x]),
            freq_idx_range,
        ),
        t.freq,
    )

    return new_freq_expr
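
A minimal usage sketch (not part of the original snippet), assuming ht is a gnomAD-style Table that already carries the required freq, freq_meta, and freq_index_dict annotations:

    # Hypothetical usage: overwrite freq so that female-specific callstats
    # on chromosome Y become missing structs.
    ht = ht.annotate(freq=set_female_y_metrics_to_na_expr(ht))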
Example #2
def test_ndarray_map():
    a = hl._ndarray([[2, 3, 4], [5, 6, 7]])
    b = hl.map(lambda x: -x, a)
    c = hl.map(lambda x: True, a)

    assert_ndarrays_eq(
        (b, [[-2, -3, -4], [-5, -6, -7]]),
        (c, [[True, True, True],
             [True, True, True]]))
Example #3
def test_ndarray_map():
    a = hl.nd.array([[2, 3, 4], [5, 6, 7]])
    b = hl.map(lambda x: -x, a)
    c = hl.map(lambda x: True, a)

    assert_ndarrays_eq((b, [[-2, -3, -4], [-5, -6, -7]]),
                       (c, [[True, True, True], [True, True, True]]))

    assert hl.eval(hl.null(hl.tndarray(hl.tfloat,
                                       1)).map(lambda x: x * 2)) is None
Example #4
def impute_sex_aggregator(call,
                          aaf,
                          aaf_threshold=0.0,
                          include_par=False,
                          female_threshold=0.4,
                          male_threshold=0.8) -> hl.expr.StructExpression:
    """:func:`.impute_sex` as an aggregator."""
    mt = call._indices.source
    rg = mt.locus.dtype.reference_genome
    x_contigs = hl.literal(
        hl.eval(
            hl.map(lambda x_contig: hl.parse_locus_interval(x_contig, rg),
                   rg.x_contigs)))
    inbreeding = hl.agg.inbreeding(call, aaf)
    is_female = hl.if_else(
        inbreeding.f_stat < female_threshold, True,
        hl.if_else(inbreeding.f_stat > male_threshold, False,
                   hl.null(hl.tbool)))
    expression = hl.struct(is_female=is_female, **inbreeding)
    if not include_par:
        interval_type = hl.tarray(hl.tinterval(hl.tlocus(rg)))
        par_intervals = hl.literal(rg.par, interval_type)
        expression = hl.agg.filter(
            ~par_intervals.any(
                lambda par_interval: par_interval.contains(mt.locus)),
            expression)
    expression = hl.agg.filter(
        (aaf > aaf_threshold) & (aaf < (1 - aaf_threshold)), expression)
    expression = hl.agg.filter(
        x_contigs.any(lambda contig: contig.contains(mt.locus)), expression)

    return expression
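
A usage sketch (an assumption, not from the original source): since the function returns an aggregation expression, it can be applied per sample with annotate_cols, given a MatrixTable mt with a GT call field:

    import hail as hl

    # Hypothetical usage: compute a row-indexed alternate allele frequency,
    # then impute sex per column by aggregating over rows.
    mt = mt.annotate_rows(aaf=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    mt = mt.annotate_cols(imputed_sex=impute_sex_aggregator(mt.GT, mt.aaf))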
Example #5
def match_variants(gwas, reference):
    """
    Groups `gwas` and `reference` by the row key (e.g. locus) and then
    compares the allele fields to determine whether a strand flip has occurred.

    Assumes `gwas` row key and `reference` row key are the same type.
    Assumes both `gwas` and `reference` have a row field `alleles` of
    type <array<str>>. `alleles` cannot be a row key of the dataset.
    """
    def has_field_of_type(source, name, dtype):
        return name in source.row and source[name].dtype == dtype

    if not has_field_of_type(gwas, 'alleles', hl.tarray(hl.tstr)):
        raise TypeError(
            "'gwas' must have a row field 'alleles' with type <array<str>>")

    if not has_field_of_type(reference, 'alleles', hl.tarray(hl.tstr)):
        raise TypeError(
            "'reference' must have a row field 'alleles' with type <array<str>>"
        )

    if 'alleles' in gwas.key:
        raise TypeError("'alleles' cannot be a row key in 'gwas'.")

    if 'alleles' in reference.key:
        raise TypeError("'alleles' cannot be a row key in 'reference'.")

    reference = reference.collect_by_key()

    matched = gwas.annotate(matches=hl.map(
        lambda x: x.annotate(match_alleles=add_strand_flip_annotation(
            x.alleles[0], x.alleles[1], gwas.alleles[0], gwas.alleles[1])),
        reference[gwas.key].values))

    return matched
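
A hedged usage sketch, assuming gwas_ht and reference_ht are Tables keyed by locus with a non-key alleles field, and that add_strand_flip_annotation is in scope:

    # Hypothetical usage: every gwas row gains a 'matches' array with one
    # entry per reference record at the same locus; explode to one row per match.
    matched_ht = match_variants(gwas_ht, reference_ht)
    matched_ht = matched_ht.explode(matched_ht.matches)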
Example #6
def _dumps_partitions(partitions, row_key_type):
    parts_type = partitions.dtype
    if not (isinstance(parts_type, hl.tarray)
            and isinstance(parts_type.element_type, hl.tinterval)):
        raise ValueError(
            f'partitions type invalid: {parts_type} must be array of intervals'
        )

    point_type = parts_type.element_type.point_type

    f1, t1 = next(iter(row_key_type.items()))
    if point_type == t1:
        partitions = hl.map(
            lambda x: hl.interval(start=hl.struct(**{f1: x.start}),
                                  end=hl.struct(**{f1: x.end}),
                                  includes_start=x.includes_start,
                                  includes_end=x.includes_end), partitions)
    else:
        if not isinstance(point_type, hl.tstruct):
            raise ValueError(
                f'partitions has wrong type: {point_type} must be struct or type of first row key field'
            )
        if not point_type._is_prefix_of(row_key_type):
            raise ValueError(
                f'partitions type invalid: {point_type} must be prefix of {row_key_type}'
            )

    s = json.dumps(partitions.dtype._convert_to_json(hl.eval(partitions)))
    return s, partitions.dtype
Example #7
def add_popmax_expr(freq: hl.expr.ArrayExpression,
                    freq_meta: hl.expr.ArrayExpression,
                    populations: Set[str]) -> hl.expr.ArrayExpression:
    """
    Calculates popmax (adds an additional entry into freq with the popmax population)

    :param ArrayExpression freq: ArrayExpression of Structs with fields ['AC', 'AF', 'AN', 'homozygote_count']
    :param ArrayExpression freq_meta: ArrayExpression of meta dictionaries corresponding to freq
    :param set of str populations: Set of populations over which to calculate popmax
    :return: Frequency data with annotated popmax
    :rtype: ArrayExpression
    """
    pops_to_use = hl.literal(populations)
    freq = hl.map(lambda x: x[0].annotate(meta=x[1]), hl.zip(freq, freq_meta))
    freq_filtered = hl.filter(
        lambda f: (f.meta.size() == 2) & (f.meta.get('group') == 'adj') &
        pops_to_use.contains(f.meta.get('pop')) & (f.AC > 0), freq)
    sorted_freqs = hl.sorted(freq_filtered, key=lambda x: x.AF, reverse=True)
    return hl.or_missing(
        hl.len(sorted_freqs) > 0,
        hl.struct(AC=sorted_freqs[0].AC,
                  AF=sorted_freqs[0].AF,
                  AN=sorted_freqs[0].AN,
                  homozygote_count=sorted_freqs[0].homozygote_count,
                  pop=sorted_freqs[0].meta['pop']))
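
A usage sketch (illustrative, not from the original source), assuming ht carries gnomAD-style freq and freq_meta annotations; the population labels are placeholders:

    # Hypothetical usage: annotate the popmax struct computed over a chosen
    # set of populations.
    pops = {"afr", "amr", "eas", "nfe", "sas"}
    ht = ht.annotate(popmax=add_popmax_expr(ht.freq, ht.freq_meta, pops))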
Example #8
def pre_process_subset_freq(subset: str,
                            global_ht: hl.Table,
                            test: bool = False) -> hl.Table:
    """
    Prepare subset frequency Table by filling in missing frequency fields for loci present only in the global cohort.

    .. note::

        The resulting final `freq` array will be as long as the subset `freq_meta` global (i.e., one `freq` entry for each `freq_meta` entry)

    :param subset: subset ID
    :param global_ht: Hail Table containing all variants discovered in the overall release cohort
    :param test: If True, filter to small region on chr20
    :return: Table containing subset frequencies with missing freq structs filled in
    """

    # Read in subset HTs
    subset_ht_path = get_freq(subset=subset).path
    subset_chr20_ht_path = qc_temp_prefix() + f"chr20_test_freq.{subset}.ht"

    if test:
        if file_exists(subset_chr20_ht_path):
            logger.info(
                "Loading chr20 %s subset frequency data for testing: %s",
                subset,
                subset_chr20_ht_path,
            )
            subset_ht = hl.read_table(subset_chr20_ht_path)

        elif file_exists(subset_ht_path):
            logger.info(
                "Loading %s subset frequency data for testing: %s",
                subset,
                subset_ht_path,
            )
            subset_ht = hl.read_table(subset_ht_path)
            subset_ht = hl.filter_intervals(
                subset_ht, [hl.parse_locus_interval("chr20:1-1000000")])

    elif file_exists(subset_ht_path):
        logger.info("Loading %s subset frequency data: %s", subset,
                    subset_ht_path)
        subset_ht = hl.read_table(subset_ht_path)

    else:
        raise DataException(
            f"Hail Table containing {subset} subset frequencies not found. You may need to run the script generate_freq_data.py to generate frequency annotations first."
        )

    # Fill in missing freq structs
    ht = subset_ht.join(global_ht.select().select_globals(), how="right")
    ht = ht.annotate(freq=hl.if_else(
        hl.is_missing(ht.freq),
        hl.map(lambda x: missing_callstats_expr(),
               hl.range(hl.len(ht.freq_meta))),
        ht.freq,
    ))

    return ht
Example #9
 def _coerce(self, x: Expression):
     assert isinstance(x, hl.expr.DictExpression)
     if not self.kc._requires_conversion(x.dtype.key_type):
         # fast path
         return x.map_values(self.vc.coerce)
     else:
         return hl.dict(hl.map(lambda e: (self.kc.coerce(e[0]), self.vc.coerce(e[1])),
                               hl.array(x)))
Example #11
def gt_to_gp(mt, location: str = 'GP'):
    return mt.annotate_entries(
        **{
            location:
            hl.or_missing(
                hl.is_defined(mt.GT),
                hl.map(
                    lambda i: hl.cond(mt.GT.unphased_diploid_gt_index() == i,
                                      1.0, 0.0),
                    hl.range(0, hl.triangle(hl.len(mt.alleles)))))
        })
Example #12
def densify(sparse_mt):
    """Convert sparse matrix table to a dense VCF-like representation by expanding reference blocks.

    Parameters
    ----------
    sparse_mt : :class:`.MatrixTable`
        Sparse MatrixTable to densify.  The first row key field must
        be named ``locus`` and have type ``locus``.  Must have an
        ``END`` entry field of type ``int32``.

    Returns
    -------
    :class:`.MatrixTable`
        The densified MatrixTable.  The ``END`` entry field is dropped.

    Notes
    -----
    While computationally expensive, this operation is necessary for many
    downstream analyses, and should be thought of as roughly costing as much
    as reading a matrix table created by importing a dense project VCF.
    """
    if list(sparse_mt.row_key)[0] != 'locus' or not isinstance(
            sparse_mt.locus.dtype, hl.tlocus):
        raise ValueError(
            "first row key field must be named 'locus' and have type 'locus'")
    if 'END' not in sparse_mt.entry or sparse_mt.END.dtype != hl.tint32:
        raise ValueError(
            "'densify' requires 'END' entry field of type 'int32'")
    col_key_fields = list(sparse_mt.col_key)

    contigs = sparse_mt.locus.dtype.reference_genome.contigs
    contig_idx_map = hl.literal({contigs[i]: i
                                 for i in range(len(contigs))},
                                'dict<str, int32>')
    mt = sparse_mt.annotate_rows(
        __contig_idx=contig_idx_map[sparse_mt.locus.contig])
    mt = mt.annotate_entries(__contig=mt.__contig_idx)

    t = mt._localize_entries('__entries', '__cols')
    t = t.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan._prev_nonnull(
                hl.or_missing(hl.is_defined(entry.END), entry)), t.__entries),
        lambda prev_entries: hl.map(
            lambda i: hl.rbind(
                prev_entries[i], t.__entries[i], lambda prev_entry, entry: hl.
                cond((~hl.is_defined(entry)
                      & (prev_entry.END >= t.locus.position)
                      & (prev_entry.__contig == t.__contig_idx)), prev_entry,
                     entry)), hl.range(0, hl.len(t.__entries)))))
    mt = t._unlocalize_entries('__entries', '__cols', col_key_fields)
    mt = mt.drop('__contig_idx', '__contig', 'END')
    return mt
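
A usage sketch (assumption: sparse_mt came from a reference-blocked, gVCF-style import with the required 'locus' row key and 'END' entry field):

    # Hypothetical usage: expand reference blocks into per-variant entries.
    dense_mt = densify(sparse_mt)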
Example #13
def post_process_gene_map_ht(gene_ht):
    groups = [
        'pLoF', 'missense|LC', 'pLoF|missense|LC', 'synonymous', 'missense'
    ]
    variant_groups = hl.map(
        lambda group: group.split('\\|').flatmap(lambda csq: gene_ht.variants.
                                                 get(csq)), groups)
    gene_ht = gene_ht.transmute(variant_groups=hl.zip(
        groups, variant_groups)).explode('variant_groups')
    gene_ht = gene_ht.transmute(annotation=gene_ht.variant_groups[0],
                                variants=hl.sorted(gene_ht.variant_groups[1]))
    gene_ht = gene_ht.key_by(start=gene_ht.interval.start)
    return gene_ht.filter(hl.len(gene_ht.variants) > 0)
Example #14
def vcf_to_mt(splice_ai_snvs_path, splice_ai_indels_path, genome_version):
    """
    Loads the SNV and indel source paths into a MatrixTable and returns it.

    :param splice_ai_snvs_path: SNV source location
    :param splice_ai_indels_path: indel source location
    :param genome_version: "37" or "38"
    :return: MatrixTable
    """

    logger.info("==> reading in splice_ai vcfs: %s, %s" %
                (splice_ai_snvs_path, splice_ai_indels_path))

    # For GRCh37, include the MT contig in the interval; for GRCh38, MT is not included.
    interval = "1-MT" if genome_version == "37" else "chr1-chrY"
    contig_dict = None
    if genome_version == "38":
        contig_dict = NO_CHR_TO_CHR_CONTIG_RECODING

    mt = hl.import_vcf(
        [splice_ai_snvs_path, splice_ai_indels_path],
        reference_genome=f"GRCh{genome_version}",
        contig_recoding=contig_dict,
        force_bgz=True,
        min_partitions=10000,
        skip_invalid_loci=True,
    )
    interval = [
        hl.parse_locus_interval(interval,
                                reference_genome=f"GRCh{genome_version}")
    ]
    mt = hl.filter_intervals(mt, interval)

    # Split SpliceAI field by | delimiter. Capture delta score entries and map to floats
    delta_scores = mt.info.SpliceAI[0].split(delim="\\|")[2:6]
    splice_split = mt.info.annotate(
        SpliceAI=hl.map(lambda x: hl.float32(x), delta_scores))
    mt = mt.annotate_rows(info=splice_split)

    # Annotate info.max_DS with the max of DS_AG, DS_AL, DS_DG, DS_DL in info.
    # delta_score array is |DS_AG|DS_AL|DS_DG|DS_DL
    consequences = hl.literal(
        ["Acceptor gain", "Acceptor loss", "Donor gain", "Donor loss"])
    mt = mt.annotate_rows(info=mt.info.annotate(
        max_DS=hl.max(mt.info.SpliceAI)))
    mt = mt.annotate_rows(info=mt.info.annotate(splice_consequence=hl.if_else(
        mt.info.max_DS > 0,
        consequences[mt.info.SpliceAI.index(mt.info.max_DS)],
        "No consequence",
    )))
    return mt
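
A usage sketch with placeholder paths (both paths and bucket names are illustrative; the function also assumes logger and NO_CHR_TO_CHR_CONTIG_RECODING are defined in the module):

    # Hypothetical usage: build a SpliceAI MatrixTable for GRCh38.
    mt = vcf_to_mt("gs://my-bucket/spliceai_snvs.vcf.bgz",
                   "gs://my-bucket/spliceai_indels.vcf.bgz",
                   genome_version="38")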
Example #15
def get_expr_for_vep_all_consequences_array(vep_root):
    retval = vep_root.annotate(formattedIntergenic=hl.map(
        lambda x: hl.struct(
            allele_num=x.allele_num,
            amino_acids=hl.null(hl.tstr),
            biotype=hl.literal("intron"),
            canonical=hl.null(hl.tint32),
            ccds=hl.null(hl.tstr),
            cdna_start=hl.null(hl.tint32),
            cdna_end=hl.null(hl.tint32),
            cds_end=hl.null(hl.tint32),
            cds_start=hl.null(hl.tint32),
            codons=hl.null(hl.tstr),
            consequence_terms=x.consequence_terms,
            distance=hl.null(hl.tint32),  # not na...,
            domains=hl.null(hl.tarray(hl.tstruct(db=hl.tstr, name=hl.tstr))),
            exon=hl.null(hl.tstr),
            gene_id=hl.null(hl.tstr),
            gene_pheno=hl.null(hl.tint32),  # not na...,
            gene_symbol=hl.null(hl.tstr),
            gene_symbol_source=hl.null(hl.tstr),
            hgnc_id=hl.null(hl.tstr),
            hgvsc=hl.null(hl.tstr),
            hgvsp=hl.null(hl.tstr),
            hgvs_offset=hl.null(hl.tint32),
            impact=x.impact,
            intron=hl.null(hl.tstr),
            lof=hl.null(hl.tstr),
            lof_flags=hl.null(hl.tstr),
            lof_filter=hl.null(hl.tstr),
            lof_info=hl.null(hl.tstr),
            minimised=x.minimised,
            polyphen_prediction=hl.null(hl.tstr),
            polyphen_score=hl.null(hl.tfloat64),
            protein_end=hl.null(hl.tint32),
            protein_start=hl.null(hl.tint32),
            protein_id=hl.null(hl.tstr),
            sift_prediction=hl.null(hl.tstr),
            sift_score=hl.null(hl.tfloat64),
            strand=hl.null(hl.tint32),
            swissprot=hl.null(hl.tstr),
            transcript_id=hl.null(hl.tstr),
            trembl=hl.null(hl.tstr),
            uniparc=hl.null(hl.tstr),
            variant_allele=x.variant_allele),
        vep_root.intergenic_consequences))
    vep_root = retval.annotate(all_consequences=hl.cond(
        hl.is_missing(retval.transcript_consequences),
        retval.formattedIntergenic, retval.transcript_consequences))
    return vep_root
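
A usage sketch (an assumption, not from the original source), given a row field vep with intergenic_consequences and transcript_consequences arrays:

    # Hypothetical usage: collapse transcript and formatted intergenic
    # consequences into a single all_consequences array.
    ht = ht.annotate(vep=get_expr_for_vep_all_consequences_array(ht.vep))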
Example #16
def densify(sparse_mt):
    """Convert sparse MatrixTable to a dense one.

    Parameters
    ----------
    sparse_mt : :class:`.MatrixTable`
        Sparse MatrixTable to densify.  The first row key field must
        be named ``locus`` and have type ``locus``.  Must have an
        ``END`` entry field of type ``int32``.

    Returns
    -------
    :class:`.MatrixTable`
        The densified MatrixTable.  The ``END`` entry field is dropped.

    """
    if list(sparse_mt.row_key)[0] != 'locus' or not isinstance(
            sparse_mt.locus.dtype, hl.tlocus):
        raise ValueError(
            "first row key field must be named 'locus' and have type 'locus'")
    if 'END' not in sparse_mt.entry or sparse_mt.END.dtype != hl.tint32:
        raise ValueError(
            "'densify' requires 'END' entry field of type 'int32'")
    col_key_fields = list(sparse_mt.col_key)

    contigs = sparse_mt.locus.dtype.reference_genome.contigs
    contig_idx_map = hl.literal({contigs[i]: i
                                 for i in range(len(contigs))},
                                'dict<str, int32>')
    mt = sparse_mt.annotate_rows(
        __contig_idx=contig_idx_map[sparse_mt.locus.contig])
    mt = mt.annotate_entries(__contig=mt.__contig_idx)

    t = mt._localize_entries('__entries', '__cols')
    t = t.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan._prev_nonnull(
                hl.or_missing(hl.is_defined(entry.END), entry)), t.__entries),
        lambda prev_entries: hl.map(
            lambda i: hl.rbind(
                prev_entries[i], t.__entries[i], lambda prev_entry, entry: hl.
                cond((~hl.is_defined(entry) &
                      (prev_entry.END >= t.locus.position) &
                      (prev_entry.__contig == t.__contig_idx)), prev_entry,
                     entry)), hl.range(0, hl.len(t.__entries)))))
    mt = t._unlocalize_entries('__entries', '__cols', col_key_fields)
    mt = mt.drop('__contig_idx', '__contig', 'END')
    return mt
Example #17
def densify(sparse_mt):
    """Convert sparse MatrixTable to a dense one.

    Parameters
    ----------
    sparse_mt : :class:`.MatrixTable`
        Sparse MatrixTable to densify.  The first row key field must
        be named ``locus`` and have type ``locus``.  Must have an
        ``END`` entry field of type ``int32``.

    Returns
    -------
    :class:`.MatrixTable`
        The densified MatrixTable.  The ``END`` entry field is dropped.

    """
    if list(sparse_mt.row_key)[0] != 'locus' or not isinstance(sparse_mt.locus.dtype, hl.tlocus):
        raise ValueError("first row key field must be named 'locus' and have type 'locus'")
    if 'END' not in sparse_mt.entry or sparse_mt.END.dtype != hl.tint32:
        raise ValueError("'densify' requires 'END' entry field of type 'int32'")
    col_key_fields = list(sparse_mt.col_key)

    mt = sparse_mt
    mt = sparse_mt.annotate_entries(__contig = mt.locus.contig)
    t = mt._localize_entries('__entries', '__cols')
    t = t.annotate(
        __entries = hl.rbind(
            hl.scan.array_agg(
                lambda entry: hl.scan._prev_nonnull(hl.or_missing(hl.is_defined(entry.END), entry)),
                t.__entries),
            lambda prev_entries: hl.map(
                lambda i:
                hl.rbind(
                    prev_entries[i], t.__entries[i],
                    lambda prev_entry, entry:
                    hl.cond(
                        (~hl.is_defined(entry) &
                         (prev_entry.END >= t.locus.position) &
                         (prev_entry.__contig == t.locus.contig)),
                        prev_entry,
                        entry)),
                hl.range(0, hl.len(t.__entries)))))
    mt = t._unlocalize_entries('__entries', '__cols', col_key_fields)
    mt = mt.drop('__contig', 'END')
    return mt
Example #18
File: nd.py  Project: saponas/hail
def vstack(arrs):
    """
    Stack arrays in sequence vertically (row wise).
    1-D arrays of shape `(N,)` will be reshaped to `(1, N)` before concatenation.
    For all other arrays, equivalent to :func:`.concatenate` with axis=0.

    Parameters
    ----------
    arrs : sequence of :class:`.NDArrayExpression`
        The arrays must have the same shape along all but the first axis.
        1-D arrays must have the same length.

    Returns
    -------
    stacked : :class:`.NDArrayExpression`
        The array formed by stacking the given arrays; it will be at least 2-D.

    See Also
    --------
    :func:`.concatenate` : Join a sequence of arrays along an existing axis.

    Examples
    --------
    >>> a = hl.nd.array([1, 2, 3])
    >>> b = hl.nd.array([2, 3, 4])
    >>> hl.eval(hl.nd.vstack((a,b)))
    array([[1, 2, 3],
           [2, 3, 4]], dtype=int32)
    >>> a = hl.nd.array([[1], [2], [3]])
    >>> b = hl.nd.array([[2], [3], [4]])
    >>> hl.eval(hl.nd.vstack((a,b)))
    array([[1],
           [2],
           [3],
           [2],
           [3],
           [4]], dtype=int32)
    """
    head_ndim = arrs[0].ndim

    if head_ndim == 1:
        return concatenate(hl.map(lambda a: a._broadcast(2), arrs), 0)

    return concatenate(arrs, 0)
Example #19
def all_and_leave_one_out(x,
                          pop_array,
                          all_f=hl.sum,
                          loo_f=lambda i, x: hl.sum(x) - hl.or_else(x[i], 0)):
    """
    Applies a function to an input array for all populations, and for each of leave-one-out populations.

    :param x: Input array
    :param pop_array: Population array
    :param all_f: Function for all populations. It takes the input array and returns a new value
    :param loo_f: Function for each of leave-one-out populations. It takes an index of leave-one-out
                  population and the input array, and returns an array of new values.
    ...
    :return: Array of new values for all populations and for each of leave-one-out populations.
    :rtype: ArrayExpression
    """
    arr = hl.array([all_f(x)])
    arr = arr.extend(hl.map(lambda i: loo_f(i, x),
                            hl.range(hl.len(pop_array))))
    return hl.or_missing(hl.any(hl.is_defined, x), arr)
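
A worked sketch under the default all_f/loo_f (illustrative values):

    import hail as hl

    # Hypothetical usage: leave-one-out sums across three populations.
    x = hl.array([1.0, 2.0, 3.0])
    pops = hl.array(["afr", "eas", "nfe"])
    result = all_and_leave_one_out(x, pops)
    # hl.eval(result) == [6.0, 5.0, 4.0, 3.0]
    # (total, total - x[0], total - x[1], total - x[2])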
Example #20
def match_variants(gwas_row_key, gwas_alleles, reference_row_key,
                   reference_alleles):
    """
    Assumes the gwas row key and reference row key are the same type and are already keys of their datasets.
    """
    assert (gwas_row_key.dtype == reference_row_key.dtype
            )  # FIXME: Allow locus to be matched with tstruct(contig, pos)
    assert (gwas_row_key._indices == gwas_alleles._indices)
    assert (reference_row_key._indices == reference_alleles._indices)
    # FIXME: assert all row indices

    reference_alleles_fn = reference_alleles._ast.name

    gwas = gwas_row_key._indices.source
    reference = reference_row_key._indices.source

    reference = reference.collect_by_key()

    matched_t = gwas.annotate(matches=hl.map(
        lambda x: x.annotate(match_alleles=add_strand_flip_annotation(
            x[reference_alleles_fn][0], x[reference_alleles_fn][1],
            gwas_alleles[0], gwas_alleles[1])),
        reference[gwas_row_key].values))

    # t = matched_t.annotate(matches=hl.map(lambda x: x.annotate(flip=add_strand_flip_annotation(x[???], x[???], matched_t[???], matched_t[???])) , matched_t.matches))

    # t = t.select(t.locus, t.A1, t.A2, matches = hl.map(lambda x: x, t.matches))

    # Table(gwas_row_key,
    #       matches[struct{
    #                       ref_index = int,
    #                       ref_locus = locus,
    #                       ref_rsid = str,
    #                       ref_alleles = (ref,alt),
    #                       allele_swap = bool,
    #                       strand_flip = bool
    #                   }, ...
    #               ]
    #       )
    return matched_t
Example #21
def trio_matrix(dataset, pedigree, complete_trios=False) -> MatrixTable:
    """Builds and returns a matrix where columns correspond to trios and entries contain genotypes for the trio.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    Create a trio matrix:

    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    Notes
    -----

    This method builds a new matrix table with one column per trio. If
    `complete_trios` is ``True``, then only trios that satisfy
    :meth:`.Trio.is_complete` are included. In this new dataset, the column
    identifiers are the sample IDs of the trio probands. The column fields and
    entries of the matrix are changed in the following ways:

    The new column fields consist of three structs (`proband`, `father`,
    `mother`), a Boolean field, and a string field:

    - **proband** (:class:`.tstruct`) - Column fields on the proband.
    - **father** (:class:`.tstruct`) - Column fields on the father.
    - **mother** (:class:`.tstruct`) - Column fields on the mother.
    - **id** (:py:data:`.tstr`) - Column key for the proband.
    - **is_female** (:py:data:`.tbool`) - Proband is female.
      ``True`` for female, ``False`` for male, missing if unknown.
    - **fam_id** (:py:data:`.tstr`) - Family ID.

    The new entry fields are:

    - **proband_entry** (:class:`.tstruct`) - Proband entry fields.
    - **father_entry** (:class:`.tstruct`) - Father entry fields.
    - **mother_entry** (:class:`.tstruct`) - Mother entry fields.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
    pedigree : :class:`.Pedigree`
    complete_trios : :obj:`bool`
        If ``True``, include only trios that satisfy :meth:`.Trio.is_complete`.

    Returns
    -------
    :class:`.MatrixTable`
    """
    mt = dataset
    require_col_key_str(mt, "trio_matrix")

    k = mt.col_key.dtype.fields[0]
    samples = mt[k].collect()

    pedigree = pedigree.filter_to(samples)
    trios = pedigree.complete_trios() if complete_trios else pedigree.trios
    n_trios = len(trios)

    sample_idx = {}
    for i, s in enumerate(samples):
        sample_idx[s] = i

    trios = [
        hl.Struct(id=sample_idx[t.s],
                  pat_id=None if t.pat_id is None else sample_idx[t.pat_id],
                  mat_id=None if t.mat_id is None else sample_idx[t.mat_id],
                  is_female=t.is_female,
                  fam_id=t.fam_id) for t in trios
    ]
    trios_type = hl.dtype(
        'array<struct{id:int,pat_id:int,mat_id:int,is_female:bool,fam_id:str}>'
    )

    trios_sym = Env.get_uid()
    entries_sym = Env.get_uid()
    cols_sym = Env.get_uid()

    mt = mt.annotate_globals(**{trios_sym: hl.literal(trios, trios_type)})
    mt = mt._localize_entries(entries_sym, cols_sym)
    mt = mt.annotate_globals(
        **{
            cols_sym:
            hl.map(
                lambda i: hl.bind(
                    lambda t: hl.struct(id=mt[cols_sym][t.id][k],
                                        proband=mt[cols_sym][t.id],
                                        father=mt[cols_sym][t.pat_id],
                                        mother=mt[cols_sym][t.mat_id],
                                        is_female=t.is_female,
                                        fam_id=t.fam_id), mt[trios_sym][i]),
                hl.range(0, n_trios))
        })
    mt = mt.annotate(
        **{
            entries_sym:
            hl.map(
                lambda i: hl.bind(
                    lambda t: hl.struct(proband_entry=mt[entries_sym][t.id],
                                        father_entry=mt[entries_sym][t.pat_id],
                                        mother_entry=mt[entries_sym][t.mat_id]
                                        ), mt[trios_sym][i]),
                hl.range(0, n_trios))
        })
    mt = mt.drop(trios_sym)

    return mt._unlocalize_entries(entries_sym, cols_sym, ['id'])
Example #22
def main(args):
    hl.init()

    # Read in all sumstats
    mt = load_final_sumstats_mt(filter_phenos=True,
                                filter_variants=False,
                                filter_sumstats=True,
                                separate_columns_by_pop=False,
                                annotate_with_nearest_gene=False)

    # Annotate per-entry sample size
    def get_n(pheno_data, i):
        return pheno_data[i].n_cases + hl.or_else(pheno_data[i].n_controls, 0)

    mt = mt.annotate_entries(summary_stats=hl.map(
        lambda x: x[1].annotate(N=hl.or_missing(hl.is_defined(x[1]),
                                                get_n(mt.pheno_data, x[0]))),
        hl.zip_with_index(mt.summary_stats)))

    # Exclude entries with low confidence flag.
    if not args.keep_low_confidence_variants:
        mt = mt.annotate_entries(summary_stats=hl.map(
            lambda x: hl.or_missing(~x.low_confidence, x), mt.summary_stats))

    # Run fixed-effect meta-analysis (all + leave-one-out)
    mt = mt.annotate_entries(unnorm_beta=mt.summary_stats.BETA /
                             (mt.summary_stats.SE**2),
                             inv_se2=1 / (mt.summary_stats.SE**2))
    mt = mt.annotate_entries(
        sum_unnorm_beta=all_and_leave_one_out(mt.unnorm_beta,
                                              mt.pheno_data.pop),
        sum_inv_se2=all_and_leave_one_out(mt.inv_se2, mt.pheno_data.pop))
    mt = mt.transmute_entries(META_BETA=mt.sum_unnorm_beta / mt.sum_inv_se2,
                              META_SE=hl.map(lambda x: hl.sqrt(1 / x),
                                             mt.sum_inv_se2))
    mt = mt.annotate_entries(
        META_Pvalue=hl.map(lambda x: 2 * hl.pnorm(x), -hl.abs(mt.META_BETA /
                                                              mt.META_SE)))

    # Run heterogeneity test (Cochran's Q)
    mt = mt.annotate_entries(META_Q=hl.map(
        lambda x: hl.sum((mt.summary_stats.BETA - x)**2 * mt.inv_se2),
        mt.META_BETA),
                             variant_exists=hl.map(lambda x: ~hl.is_missing(x),
                                                   mt.summary_stats.BETA))
    mt = mt.annotate_entries(META_N_pops=all_and_leave_one_out(
        mt.variant_exists, mt.pheno_data.pop))
    mt = mt.annotate_entries(META_Pvalue_het=hl.map(
        lambda i: hl.pchisqtail(mt.META_Q[i], mt.META_N_pops[i] - 1),
        hl.range(hl.len(mt.META_Q))))

    # Add other annotations
    mt = mt.annotate_entries(
        ac_cases=hl.map(lambda x: x["AF.Cases"] * x.N, mt.summary_stats),
        ac_controls=hl.map(lambda x: x["AF.Controls"] * x.N, mt.summary_stats),
        META_AC_Allele2=all_and_leave_one_out(
            mt.summary_stats.AF_Allele2 * mt.summary_stats.N,
            mt.pheno_data.pop),
        META_N=all_and_leave_one_out(mt.summary_stats.N, mt.pheno_data.pop))
    mt = mt.annotate_entries(
        META_AF_Allele2=mt.META_AC_Allele2 / mt.META_N,
        META_AF_Cases=all_and_leave_one_out(mt.ac_cases, mt.pheno_data.pop) /
        mt.META_N,
        META_AF_Controls=all_and_leave_one_out(mt.ac_controls,
                                               mt.pheno_data.pop) / mt.META_N)
    mt = mt.drop('unnorm_beta', 'inv_se2', 'variant_exists', 'ac_cases',
                 'ac_controls', 'summary_stats', 'META_AC_Allele2')

    # Format everything into array<struct>
    def is_finite_or_missing(x):
        return (hl.or_missing(hl.is_finite(x), x))

    meta_fields = [
        'BETA', 'SE', 'Pvalue', 'Q', 'Pvalue_het', 'N', 'N_pops', 'AF_Allele2',
        'AF_Cases', 'AF_Controls'
    ]
    mt = mt.transmute_entries(meta_analysis=hl.map(
        lambda i: hl.struct(
            **{
                field: is_finite_or_missing(mt[f'META_{field}'][i])
                for field in meta_fields
            }), hl.range(hl.len(mt.META_BETA))))

    col_fields = ['n_cases', 'n_controls']
    mt = mt.annotate_cols(
        **{
            field: all_and_leave_one_out(mt.pheno_data[field],
                                         mt.pheno_data.pop)
            for field in col_fields
        })
    col_fields += ['pop']
    mt = mt.annotate_cols(pop=all_and_leave_one_out(
        mt.pheno_data.pop,
        mt.pheno_data.pop,
        all_f=lambda x: x,
        loo_f=lambda i, x: hl.filter(lambda y: y != x[i], x),
    ))
    mt = mt.transmute_cols(meta_analysis_data=hl.map(
        lambda i: hl.struct(**{field: mt[field][i]
                               for field in col_fields}),
        hl.range(hl.len(mt.pop))))

    mt.describe()
    mt.write(get_meta_analysis_results_path(), overwrite=args.overwrite)

    hl.copy_log('gs://ukb-diverse-pops/combined_results/meta_analysis.log')
Example #23
def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given a set or multiple sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:

    .. math::

        \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j

    *  :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic
       for variant :math:`j` resulting from a test of association between
       variant :math:`j` and a trait.
    *  :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant
       :math:`j`, calculated as the sum of squared correlation coefficients
       between variant :math:`j` and nearby variants. See :func:`ld_score`
       for further details.
    *  :math:`a` captures the contribution of confounding biases, such as
       cryptic relatedness and uncontrolled population structure, to the
       association test statistic.
    *  :math:`h_g^2` is the SNP-heritability, or the proportion of variation
       in the trait explained by the effects of variants included in the
       regression model above.
    *  :math:`M` is the number of variants used to estimate :math:`h_g^2`.
    *  :math:`N` is the number of samples in the underlying association study.

    For more details on the method implemented in this function, see:

    * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__

    Examples
    --------

    Run the method on a matrix table of summary statistics, where the rows
    are variants and the columns are different phenotypes:

    >>> mt_gwas = ld_score_all_phenos_sumstats
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=mt_gwas['ld_score'],
    ...     ld_score_expr=mt_gwas['ld_score'],
    ...     chi_sq_exprs=mt_gwas['chi_squared'],
    ...     n_samples_exprs=mt_gwas['n'])


    Run the method on a table with summary statistics for a single
    phenotype:

    >>> ht_gwas = ld_score_one_pheno_sumstats
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=ht_gwas['chi_squared_50_irnt'],
    ...     n_samples_exprs=ht_gwas['n_50_irnt'])

    Run the method on a table with summary statistics for multiple
    phenotypes:

    >>> ht_gwas = ld_score_one_pheno_sumstats
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'],
    ...                        ht_gwas['chi_squared_20160']],
    ...     n_samples_exprs=[ht_gwas['n_50_irnt'],
    ...                      ht_gwas['n_20160']])

    Notes
    -----
    The ``exprs`` provided as arguments to :func:`.ld_score_regression`
    must all be from the same object, either a :class:`Table` or a
    :class:`MatrixTable`.

    **If the arguments originate from a table:**

    *  The table must be keyed by fields ``locus`` of type
       :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and
       ``n_samples_exprs`` must be row-indexed fields.
    *  The number of expressions passed to ``n_samples_exprs`` must be
       equal to one or the number of expressions passed to
       ``chi_sq_exprs``. If just one expression is passed to
       ``n_samples_exprs``, that sample size expression is assumed to
       apply to all sets of statistics passed to ``chi_sq_exprs``.
       Otherwise, the expressions passed to ``chi_sq_exprs`` and
       ``n_samples_exprs`` are matched by index.
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have generic :obj:`int` values
       ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc.
       expressions passed to the ``chi_sq_exprs`` argument.

    **If the arguments originate from a matrix table:**

    *  The dimensions of the matrix table must be variants
       (rows) by phenotypes (columns).
    *  The rows of the matrix table must be keyed by fields
       ``locus`` of type :class:`.tlocus` and ``alleles``,
       a :py:data:`.tarray` of :py:data:`.tstr` elements.
    *  The columns of the matrix table must be keyed by a field
       of type :py:data:`.tstr` that uniquely identifies phenotypes
       represented in the matrix table. The column key must be a single
       expression; compound keys are not accepted.
    *  ``weight_expr`` and ``ld_score_expr`` must be row-indexed
       fields.
    *  ``chi_sq_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  ``n_samples_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have values corresponding to the
       column keys of the input matrix table.

    This function returns a :class:`Table` with one row per set of summary
    statistics passed to the ``chi_sq_exprs`` argument. The following
    row-indexed fields are included in the table:

    *  **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The
       returned table is keyed by this field. See the notes below for
       details on the possible values of this field.
    *  **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared
       test statistic for the given phenotype.
    *  **intercept** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          intercept :math:`1 + Na`.
       -  **standard_error**  (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    *  **snp_heritability** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          SNP-heritability :math:`h_g^2`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    Warning
    -------
    :func:`.ld_score_regression` considers only the rows for which both row
    fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing
    values in either field are removed prior to fitting the LD score
    regression model.

    Parameters
    ----------
    weight_expr : :class:`.Float64Expression`
                  Row-indexed expression for the LD scores used to derive
                  variant weights in the model.
    ld_score_expr : :class:`.Float64Expression`
                    Row-indexed expression for the LD scores used as covariates
                    in the model.
    chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of
                        :class:`.Float64Expression`
                        One or more row-indexed (if table) or entry-indexed
                        (if matrix table) expressions for chi-squared
                        statistics resulting from genome-wide association
                        studies.
    n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of
                     :class:`.NumericExpression`
                     One or more row-indexed (if table) or entry-indexed
                     (if matrix table) expressions indicating the number of
                     samples used in the studies that generated the test
                     statistics supplied to ``chi_sq_exprs``.
    n_blocks : :obj:`int`
               The number of blocks used in the jackknife approach to
               estimating standard errors.
    two_step_threshold : :obj:`int`
                         Variants with chi-squared statistics greater than this
                         value are excluded in the first step of the two-step
                         procedure used to fit the model.
    n_reference_panel_variants : :obj:`int`, optional
                                 Number of variants used to estimate the
                                 SNP-heritability :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability estimates
        for each phenotype passed to the function."""

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs))
            or (len(n_samples_exprs) == 1))
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices)
    analyze('ld_score_regression/ld_score_expr', ld_score_expr,
            ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating
                from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if
                originating from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single
                phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr', chi_sq_exprs[0],
                ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr', n_samples_exprs[0],
                ds._entry_indices)

        ds = ds._select_all(row_exprs={
            '__locus': ds.locus,
            '__alleles': ds.alleles,
            '__w_initial': weight_expr,
            '__w_initial_floor': hl.max(weight_expr, 1.0),
            '__x': ld_score_expr,
            '__x_floor': hl.max(ld_score_expr, 1.0)
        },
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={
                                '__y': chi_sq_exprs[0],
                                '__n': n_samples_exprs[0]
                            })
        ds = ds.annotate_entries(**{'__w': ds.__w_initial})

        ds = ds.filter_rows(
            hl.is_defined(ds.__locus)
            & hl.is_defined(ds.__alleles)
            & hl.is_defined(ds.__w_initial)
            & hl.is_defined(ds.__x))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n, ds._row_indices)

        ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(
            **{
                '__locus': ds.locus,
                '__alleles': ds.alleles,
                '__w_initial': weight_expr,
                '__x': ld_score_expr
            }, **{y: chi_sq_exprs[i]
                  for i, y in enumerate(ys)}, **{w: weight_expr
                                                 for w in ws}, **
            {n: n_samples_exprs[i]
             for i, n in enumerate(ns)}))
        ds = ds.key_by(ds.__locus, ds.__alleles)

        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        hts = [
            ds.select(
                **{
                    '__w_initial': ds.__w_initial,
                    '__w_initial_floor': hl.max(ds.__w_initial, 1.0),
                    '__x': ds.__x,
                    '__x_floor': hl.max(ds.__x, 1.0),
                    '__y_name': i,
                    '__y': ds[ys[i]],
                    '__w': ds[ws[i]],
                    '__n': hl.int(ds[ns[i]])
                }) for i, y in enumerate(ys)
        ]

        mts = [
            ht.to_matrix_table(row_key=['__locus', '__alleles'],
                               col_key=['__y_name'],
                               row_fields=[
                                   '__w_initial', '__w_initial_floor', '__x',
                                   '__x_floor'
                               ]) for ht in hts
        ]

        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])

        ds = ds.filter_rows(
            hl.is_defined(ds.__locus)
            & hl.is_defined(ds.__alleles)
            & hl.is_defined(ds.__w_initial)
            & hl.is_defined(ds.__x))

    mt_tmp_file1 = new_temp_file()
    ds.write(mt_tmp_file1)
    mt = hl.read_matrix_table(mt_tmp_file1)

    if not n_reference_panel_variants:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y)
                                         & (mt.__y < two_step_threshold)),
                             __in_step2=hl.is_defined(mt.__y))

    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt.__in_step1),
                          __m_step2=hl.agg.count_where(mt.__in_step2))

    col_keys = list(mt.col_key)

    ht = mt.localize_entries(entries_array_field_name='__entries',
                             columns_array_field_name='__cols')

    ht = ht.annotate(__entries=hl.rbind(
        hl.scan.array_agg(lambda entry: hl.scan.count_where(entry.__in_step1),
                          ht.__entries),
        lambda step1_indices: hl.map(
            lambda i: hl.rbind(
                hl.int(hl.or_else(step1_indices[i], 0)), ht.__cols[
                    i].__m_step1, ht.__entries[i], lambda step1_idx, m_step1,
                entry: hl.rbind(
                    hl.map(
                        lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))),
                        hl.range(0, n_blocks + 1)), lambda step1_separators: hl
                    .rbind(
                        hl.set(step1_separators).contains(step1_idx),
                        hl.sum(
                            hl.map(lambda s1: step1_idx >= s1, step1_separators
                                   )) - 1, lambda is_separator, step1_block:
                        entry.annotate(__step1_block=step1_block,
                                       __step2_block=hl.cond(
                                           ~entry.__in_step1 & is_separator,
                                           step1_block - 1, step1_block))))),
            hl.range(0, hl.len(ht.__entries)))))

    mt = ht._unlocalize_entries('__entries', '__cols', col_keys)

    mt_tmp_file2 = new_temp_file()
    mt.write(mt_tmp_file2)
    mt = hl.read_matrix_table(mt_tmp_file2)

    # initial coefficient estimates
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)
    ])
    mt = mt.annotate_cols(__step1_betas=mt.__initial_betas,
                          __step2_betas=mt.__initial_betas)

    # step 1 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step1, 1.0 /
            (mt.__w_initial_floor * 2.0 *
             (mt.__step1_betas[0] + mt.__step1_betas[1] * mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt.__in_step1,
            hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta))
        mt = mt.annotate_cols(__step1_h2=hl.max(
            hl.min(mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt.__step1_betas[0], mt.__step1_h2 * hl.agg.mean(mt.__n) / M
        ])

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=hl.agg.array_agg(
        lambda i: hl.agg.filter(
            (mt.__step1_block != i) & mt.__in_step1,
            hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta),
        hl.range(n_blocks)))

    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x,
        mt.__step1_block_betas))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected)),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2, mt.__step1_block_betas_bias_corrected
                       )) - hl.sum(
                           hl.map(lambda x: x[i], mt.
                                  __step1_block_betas_bias_corrected))**
                       2 / n_blocks) / (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step2, 1.0 /
            (mt.__w_initial_floor * 2.0 *
             (mt.__step2_betas[0] + mt.__step2_betas[1] * mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            hl.agg.filter(
                mt.__in_step2,
                hl.agg.linreg(
                    y=mt.__y -
                    mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0])
        ])
        mt = mt.annotate_cols(__step2_h2=hl.max(
            hl.min(mt.__step2_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0], mt.__step2_h2 * hl.agg.mean(mt.__n) / M
        ])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=hl.agg.array_agg(
        lambda i: hl.agg.filter((mt.__step2_block != i) & mt.__in_step2,
                                hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                              x=[mt.__x],
                                              weight=mt.__w).beta[0]),
        hl.range(n_blocks)))

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x,
        mt.__step2_block_betas))

    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(mt.__step2_block_betas_bias_corrected),
        __step2_jackknife_variance=(
            hl.sum(mt.__step2_block_betas_bias_corrected**2) -
            hl.sum(mt.__step2_block_betas_bias_corrected)**2 / n_blocks) /
        (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0 /
        (mt.__w_initial_floor * 2.0 *
         (mt.__initial_betas[0] + mt.__initial_betas[1] * mt.__x_floor)**2))

    mt = mt.annotate_cols(
        __final_betas=[mt.__step1_betas[0], mt.__step2_betas[1]],
        __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) /
             hl.agg.sum(mt.__step2_initial_w * mt.__x**2)))

    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt.__step2_block_betas[i] - mt.__c *
                   (mt.__step1_block_betas[i][0] - mt.__final_betas[0])),
        hl.range(0, n_blocks)))

    mt = mt.annotate_cols(__final_block_betas_bias_corrected=(
        n_blocks * mt.__final_betas[1] -
        (n_blocks - 1) * mt.__final_block_betas))

    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt.__step1_jackknife_mean[0],
            hl.mean(mt.__final_block_betas_bias_corrected)
        ],
        __final_jackknife_variance=[
            mt.__step1_jackknife_variance[0],
            (hl.sum(mt.__final_block_betas_bias_corrected**2) -
             hl.sum(mt.__final_block_betas_bias_corrected)**2 / n_blocks) /
            (n_blocks - 1) / n_blocks
        ])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt.__y_name,
        mean_chi_sq=hl.agg.mean(mt.__y),
        intercept=hl.struct(estimate=mt.__final_betas[0],
                            standard_error=hl.sqrt(
                                mt.__final_jackknife_variance[0])),
        snp_heritability=hl.struct(
            estimate=(M / hl.agg.mean(mt.__n)) * mt.__final_betas[1],
            standard_error=hl.sqrt((M / hl.agg.mean(mt.__n))**2 *
                                   mt.__final_jackknife_variance[1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht.phenotype)
    ht = ht.select(ht.mean_chi_sq, ht.intercept, ht.snp_heritability)

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)

    return ht
Example #24
    'Siblings': 'sibling_ids',
    'Second Order': 'second_order_relationship_ids',
    'Third Order': 'third_order_relationship_ids',
    'Children': 'children_ids'
})
ht_relationships = ht_relationships.annotate(
    paternal_id=hl.or_missing(ht_relationships['paternal_id'] != '0',
                              ht_relationships['paternal_id']),
    maternal_id=hl.or_missing(ht_relationships['maternal_id'] != '0',
                              ht_relationships['maternal_id']),
    relationship_role=hl.cond(ht_relationships['relationship_role'] == 'unrel',
                              'unrelated',
                              ht_relationships['relationship_role']),
    sibling_ids=hl.or_missing(
        ht_relationships['sibling_ids'] != '0',
        hl.map(lambda x: x.strip(),
               ht_relationships['sibling_ids'].split(','))),
    children_ids=hl.or_missing(
        ht_relationships['children_ids'] != '0',
        hl.map(lambda x: x.strip(),
               ht_relationships['children_ids'].split(','))),
    second_order_relationship_ids=hl.or_missing(
        ht_relationships['second_order_relationship_ids'] != '0',
        hl.map(lambda x: x.strip(),
               ht_relationships['second_order_relationship_ids'].split(','))),
    third_order_relationship_ids=hl.or_missing(
        ht_relationships['third_order_relationship_ids'] != '0',
        hl.map(lambda x: x.strip(),
               ht_relationships['third_order_relationship_ids'].split(','))))
ht_relationships = ht_relationships.key_by('s')
ht_relationships = ht_relationships.select('family_id', 'relationship_role',
                                           'maternal_id', 'paternal_id',
Example #25
0
def import_gtf(path, key=None):
    """Import a GTF file.

       The GTF file format is identical to the GFF version 2 file format,
       and so this function can be used to import GFF version 2 files as
       well.

       See https://www.ensembl.org/info/website/upload/gff.html for more
       details on the GTF/GFF2 file format.

       The :class:`.Table` returned by this function will include the following
       row fields:

       .. code-block:: text

           'seqname': str
           'source': str
           'feature': str
           'start': int32
           'end': int32
           'score': float64
           'strand': str
           'frame': int32

       There will also be corresponding fields for every tag found in the
       attribute field of the GTF file.

       .. note::

           The "end" field in the table will be incremented by 1 in
           comparison to the value found in the GTF file, as the end
           coordinate in a GTF file is inclusive while the end
           coordinate in Hail is exclusive.

       Example
       -------

       >>> ht = hl.experimental.import_gtf('data/test.gtf', key='gene_id')
       >>> ht.describe()

       .. code-block:: text

           ----------------------------------------
           Global fields:
           None
           ----------------------------------------
           Row fields:
               'seqname': str
               'source': str
               'feature': str
               'start': int32
               'end': int32
               'score': float64
               'strand': str
               'frame': int32
               'havana_gene': str
               'exon_id': str
               'havana_transcript': str
               'transcript_name': str
               'gene_type': str
               'tag': str
               'transcript_status': str
               'exon_number': str
               'level': str
               'transcript_id': str
               'transcript_type': str
               'gene_id': str
               'gene_name': str
               'gene_status': str
           ----------------------------------------
           Key: ['gene_id']
           ----------------------------------------

       Parameters
       ----------

       path : :obj:`str`
           File to import.
       key : :obj:`str` or :obj:`list` of :obj:`str`
           Key field(s). Can be tag name(s) found in the attribute field
           of the GTF file.

       Returns
       -------
       :class:`.Table`
       """

    ht = hl.import_table(path,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint,
                                'f4': hl.tint,
                                'f5': hl.tfloat,
                                'f7': hl.tint},
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({'f0': 'seqname',
                    'f1': 'source',
                    'f2': 'feature',
                    'f3': 'start',
                    'f4': 'end',
                    'f5': 'score',
                    'f6': 'strand',
                    'f7': 'frame',
                    'f8': 'attribute'})

    ht = ht.annotate(end=ht['end'] + 1)
    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    attributes = list(ht.aggregate(
        hl.set(hl.flatten(hl.agg.collect(ht['attribute'].keys())))))

    ht = ht.annotate(**{x: hl.or_missing(ht['attribute'].contains(x),
                                         ht['attribute'][x])
                        for x in attributes})

    ht = ht.drop(ht['attribute'])

    if key:
        key = wrap_to_list(key)
        ht = ht.key_by(*key)

    return ht
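The attribute column above is parsed by splitting on '; ' and turning each 'key "value"' pair into a tuple; note that Hail's StringExpression.replace treats its first argument as a regex, which is why ';$' strips only a trailing semicolon. A minimal sketch of the same pattern on a literal attribute string (the input line is invented):

import hail as hl

attr = hl.literal('gene_id "ENSG00000223972"; gene_name "DDX11L1";')
parsed = hl.dict(
    hl.map(lambda x: (x.split(' ')[0],
                      x.split(' ')[1].replace('"', '').replace(';$', '')),
           attr.split('; ')))
print(hl.eval(parsed))  # {'gene_id': 'ENSG00000223972', 'gene_name': 'DDX11L1'}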
Example #26
0
 def _coerce(self, x: Expression):
     assert isinstance(x, hl.expr.SetExpression)
     return hl.map(lambda x_: self.ec.coerce(x_), x)
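hl.map applied to a set maps the lambda over each element and returns a set, which is what lets this coercer reuse the element coercer self.ec pointwise. A minimal sketch with a concrete element function standing in for the coercer (the literal set is invented):

import hail as hl

s = hl.literal({1, 2, 3})
coerced = hl.map(lambda x: hl.float64(x), s)
print(hl.eval(coerced))  # a set containing 1.0, 2.0, 3.0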
Example #27
0
def import_gtf(path,
               reference_genome=None,
               skip_invalid_contigs=False,
               min_partitions=None) -> hl.Table:
    """Import a GTF file.

       The GTF file format is identical to the GFF version 2 file format,
       and so this function can be used to import GFF version 2 files as
       well.

       See https://www.ensembl.org/info/website/upload/gff.html for more
       details on the GTF/GFF2 file format.

       The :class:`.Table` returned by this function will be keyed by the
       ``interval`` row field and will include the following row fields:

       .. code-block:: text

           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'interval': interval<>

       There will also be corresponding fields for every tag found in the
       attribute field of the GTF file.

       Note
       ----

       This function will return an ``interval`` field of type :class:`.tinterval`
       constructed from the ``seqname``, ``start``, and ``end`` fields in the
       GTF file. This interval is inclusive of both the start and end positions
       in the GTF file. 

       If the ``reference_genome`` parameter is specified, the start and end
       points of the ``interval`` field will be of type :class:`.tlocus`.
       Otherwise, the start and end points of the ``interval`` field will be of
       type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
       ``position`` (type :class:`.tint32`).

       Furthermore, if the ``reference_genome`` parameter is specified and
       ``skip_invalid_contigs`` is ``True``, this import function will skip
       lines in the GTF where ``seqname`` is not consistent with the reference
       genome specified.

       Example
       -------

       >>> ht = hl.experimental.import_gtf('data/test.gtf', 
       ...                                 reference_genome='GRCh37',
       ...                                 skip_invalid_contigs=True)

       >>> ht.describe()  # doctest: +SKIP_OUTPUT_CHECK
       ----------------------------------------
       Global fields:
       None
       ----------------------------------------
       Row fields:
           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'gene_type': str
           'exon_id': str
           'havana_transcript': str
           'level': str
           'transcript_name': str
           'gene_status': str
           'gene_id': str
           'transcript_type': str
           'tag': str
           'transcript_status': str
           'gene_name': str
           'transcript_id': str
           'exon_number': str
           'havana_gene': str
           'interval': interval<locus<GRCh37>>
       ----------------------------------------
       Key: ['interval']
       ----------------------------------------

       Parameters
       ----------

       path : :obj:`str`
           File to import.
       reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
           Reference genome to use.
       skip_invalid_contigs : :obj:`bool`
           If ``True`` and `reference_genome` is not ``None``, skip lines where
           ``seqname`` is not consistent with the reference genome.
       min_partitions : :obj:`int` or :obj:`None`
           Minimum number of partitions (passed to import_table).

       Returns
       -------
       :class:`.Table`
       """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={
                             'f3': hl.tint,
                             'f4': hl.tint,
                             'f5': hl.tfloat,
                             'f7': hl.tint
                         },
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({
        'f0': 'seqname',
        'f1': 'source',
        'f2': 'feature',
        'f3': 'start',
        'f4': 'end',
        'f5': 'score',
        'f6': 'strand',
        'f7': 'frame',
        'f8': 'attribute'
    })

    ht = ht.annotate(attribute=hl.dict(
        hl.map(
            lambda x: (x.split(' ')[0], x.split(' ')[1].replace('"', '').
                       replace(';$', '')), ht['attribute'].split('; '))))

    attributes = ht.aggregate(
        hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                       ht['attribute'].keys()))

    ht = ht.transmute(
        **{
            x: hl.or_missing(ht['attribute'].contains(x), ht['attribute'][x])
            for x in attributes if x
        })

    if reference_genome:
        if reference_genome == 'GRCh37':
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            ht = ht.annotate(seqname=hl.case().when(
                ht['seqname'].startswith('HLA'), ht['seqname']).when(
                    ht['seqname'].startswith('chrHLA'), ht['seqname'].replace(
                        '^chr', '')).when(ht['seqname'].startswith(
                            'chr'), ht['seqname']).default('chr' +
                                                           ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(
                set(hl.get_reference(reference_genome).contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(
            interval=hl.locus_interval(ht['seqname'],
                                       ht['start'],
                                       ht['end'],
                                       includes_start=True,
                                       includes_end=True,
                                       reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(
            hl.struct(seqname=ht['seqname'], position=ht['start']),
            hl.struct(seqname=ht['seqname'], position=ht['end']),
            includes_start=True,
            includes_end=True))

    ht = ht.key_by('interval')

    return ht
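Because GTF coordinates are inclusive on both ends, the interval above is built with includes_start=True and includes_end=True instead of shifting the end coordinate the way the gene_id-keyed variant of this function does. A small sketch of the same call on literal values (the contig and positions are invented):

import hail as hl

interval = hl.locus_interval('1', 11869, 14409,
                             includes_start=True,
                             includes_end=True,
                             reference_genome='GRCh37')
print(hl.eval(interval.contains(hl.locus('1', 14409, 'GRCh37'))))  # True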
Example #28
0
ht_transcripts = hl.import_table('gs://hail-datasets/raw-data/gtex/v7/reference/gencode.v19.transcripts.patched_contigs.gtf',
                                 comment='#', no_header=True, types={'f3': hl.tint, 'f4': hl.tint}, missing='.', min_partitions=12)

ht_transcripts = ht_transcripts.rename({'f0': 'contig',
                                        'f1': 'annotation_source',
                                        'f2': 'feature_type',
                                        'f3': 'start',
                                        'f4': 'end',
                                        'f5': 'score',
                                        'f6': 'strand',
                                        'f7': 'phase',
                                        'f8': 'attributes'})

ht_transcripts = ht_transcripts.filter(ht_transcripts.feature_type == 'transcript')
ht_transcripts = ht_transcripts.annotate(interval=hl.interval(hl.locus(ht_transcripts.contig, ht_transcripts.start, 'GRCh37'), hl.locus(ht_transcripts.contig, ht_transcripts.end + 1, 'GRCh37')))
ht_transcripts = ht_transcripts.annotate(attributes=hl.dict(hl.map(lambda x: (x.split(' ')[0], x.split(' ')[1].replace('"', '').replace(';$', '')), ht_transcripts.attributes.split('; '))))
attribute_cols = list(ht_transcripts.aggregate(hl.set(hl.flatten(hl.agg.collect(ht_transcripts.attributes.keys())))))
ht_transcripts = ht_transcripts.annotate(**{x: hl.or_missing(ht_transcripts.attributes.contains(x), ht_transcripts.attributes[x]) for x in attribute_cols})
ht_transcripts = ht_transcripts.select(*(['transcript_id', 'transcript_name', 'transcript_type', 'strand', 'transcript_status', 'havana_transcript', 'ccdsid', 'ont', 'gene_name', 'interval', 'gene_type', 'annotation_source', 'havana_gene', 'gene_status', 'tag']))
ht_transcripts = ht_transcripts.rename({'havana_transcript': 'havana_transcript_id', 'havana_gene': 'havana_gene_id'})
ht_transcripts = ht_transcripts.key_by(ht_transcripts.transcript_id)

mt = hl.import_matrix_table('gs://hail-datasets/raw-data/gtex/v7/rna-seq/processed/GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_expected_count.tsv.bgz',
                            row_fields={'transcript_id': hl.tstr, 'gene_id': hl.tstr}, row_key='transcript_id', missing='', entry_type=hl.tfloat)

mt = mt.annotate_cols(sample_id=mt.col_id)
mt = mt.key_cols_by(mt.sample_id)

mt = mt.annotate_entries(read_count=hl.int(mt.x))
mt = mt.drop(mt.col_id, mt.x)
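Since ht_transcripts is keyed by transcript_id and the count matrix rows are keyed the same way, the transcript annotations can be joined onto the matrix row-wise. A hedged follow-up sketch, not part of the original script, using the two tables defined above:

mt = mt.annotate_rows(**ht_transcripts[mt.transcript_id])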
Example #30
0
                            get_xpos(rows.info.CHR2, rows.info.END2),
                            get_xpos(rows.locus.contig, rows.info.END)),
    'svType':
    lambda rows: rows.sv_type[0],
    'transcriptConsequenceTerms':
    lambda rows: [rows.sv_type[0]],
    'sv_type_detail':
    lambda rows: hl.if_else(
        rows.sv_type[0] == 'CPX', rows.info.CPX_TYPE,
        hl.if_else(
            (rows.sv_type[0] == 'INS') &
            (hl.len(rows.sv_type) > 1), rows.sv_type[1], hl.missing('str'))),
    'geneIds':
    lambda rows: hl.set(
        hl.map(
            lambda x: x.gene_id,
            rows.sortedTranscriptConsequences.filter(
                lambda x: x.predicted_consequence != 'NEAREST_TSS'))),
    'samples_no_call':
    lambda rows: get_sample_num_alt_x(rows, -1),
    'samples_num_alt_1':
    lambda rows: get_sample_num_alt_x(rows, 1),
    'samples_num_alt_2':
    lambda rows: get_sample_num_alt_x(rows, 2),
}

FIELDS = list(CORE_FIELDS.keys()) + list(DERIVED_FIELDS.keys()) + [
    'variantId', 'sortedTranscriptConsequences', 'genotypes'
]
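The geneIds lambda above filters out NEAREST_TSS consequences before projecting gene_id and deduplicating with hl.set. A minimal sketch of the same pattern on an invented array of structs:

import hail as hl

tc = hl.literal([
    hl.Struct(gene_id='ENSG01', predicted_consequence='NEAREST_TSS'),
    hl.Struct(gene_id='ENSG02', predicted_consequence='LOF'),
])
gene_ids = hl.set(
    hl.map(lambda x: x.gene_id,
           tc.filter(lambda x: x.predicted_consequence != 'NEAREST_TSS')))
print(hl.eval(gene_ids))  # a set containing only 'ENSG02'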


def get_xpos(contig, pos):
Example #31
0
def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given a set or multiple sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:

    .. math::

        \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j

    *  :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic
       for variant :math:`j` resulting from a test of association between
       variant :math:`j` and a trait.
    *  :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant
       :math:`j`, calculated as the sum of squared correlation coefficients
       between variant :math:`j` and nearby variants. See :func:`ld_score`
       for further details.
    *  :math:`a` captures the contribution of confounding biases, such as
       cryptic relatedness and uncontrolled population structure, to the
       association test statistic.
    *  :math:`h_g^2` is the SNP-heritability, or the proportion of variation
       in the trait explained by the effects of variants included in the
       regression model above.
    *  :math:`M` is the number of variants used to estimate :math:`h_g^2`.
    *  :math:`N` is the number of samples in the underlying association study.

    For more details on the method implemented in this function, see:

    * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__

    Examples
    --------

    Run the method on a matrix table of summary statistics, where the rows
    are variants and the columns are different phenotypes:

    >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=mt_gwas['ld_score'],
    ...     ld_score_expr=mt_gwas['ld_score'],
    ...     chi_sq_exprs=mt_gwas['chi_squared'],
    ...     n_samples_exprs=mt_gwas['n'])


    Run the method on a table with summary statistics for a single
    phenotype:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=ht_gwas['chi_squared_50_irnt'],
    ...     n_samples_exprs=ht_gwas['n_50_irnt'])

    Run the method on a table with summary statistics for multiple
    phenotypes:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'],
    ...                        ht_gwas['chi_squared_20160']],
    ...     n_samples_exprs=[ht_gwas['n_50_irnt'],
    ...                      ht_gwas['n_20160']])

    Notes
    -----
    The ``exprs`` provided as arguments to :func:`.ld_score_regression`
    must all be from the same object, either a :class:`Table` or a
    :class:`MatrixTable`.

    **If the arguments originate from a table:**

    *  The table must be keyed by fields ``locus`` of type
       :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and
       ``n_samples_exprs`` must be row-indexed fields.
    *  The number of expressions passed to ``n_samples_exprs`` must be
       equal to one or the number of expressions passed to
       ``chi_sq_exprs``. If just one expression is passed to
       ``n_samples_exprs``, that sample size expression is assumed to
       apply to all sets of statistics passed to ``chi_sq_exprs``.
       Otherwise, the expressions passed to ``chi_sq_exprs`` and
       ``n_samples_exprs`` are matched by index.
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have generic :obj:`int` values
       ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc.
       expressions passed to the ``chi_sq_exprs`` argument.

    **If the arguments originate from a matrix table:**

    *  The dimensions of the matrix table must be variants
       (rows) by phenotypes (columns).
    *  The rows of the matrix table must be keyed by fields
       ``locus`` of type :class:`.tlocus` and ``alleles``,
       a :py:data:`.tarray` of :py:data:`.tstr` elements.
    *  The columns of the matrix table must be keyed by a field
       of type :py:data:`.tstr` that uniquely identifies phenotypes
       represented in the matrix table. The column key must be a single
       expression; compound keys are not accepted.
    *  ``weight_expr`` and ``ld_score_expr`` must be row-indexed
       fields.
    *  ``chi_sq_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  ``n_samples_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have values corresponding to the
       column keys of the input matrix table.

    This function returns a :class:`Table` with one row per set of summary
    statistics passed to the ``chi_sq_exprs`` argument. The following
    row-indexed fields are included in the table:

    *  **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The
       returned table is keyed by this field. See the notes below for
       details on the possible values of this field.
    *  **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared
       test statistic for the given phenotype.
    *  **intercept** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          intercept :math:`1 + Na`.
       -  **standard_error**  (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    *  **snp_heritability** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          SNP-heritability :math:`h_g^2`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    Warning
    -------
    :func:`.ld_score_regression` considers only the rows for which both row
    fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing
    values in either field are removed prior to fitting the LD score
    regression model.

    Parameters
    ----------
    weight_expr : :class:`.Float64Expression`
                  Row-indexed expression for the LD scores used to derive
                  variant weights in the model.
    ld_score_expr : :class:`.Float64Expression`
                    Row-indexed expression for the LD scores used as covariates
                    in the model.
    chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of
                        :class:`.Float64Expression`
                        One or more row-indexed (if table) or entry-indexed
                        (if matrix table) expressions for chi-squared
                        statistics resulting from genome-wide association
                        studies.
    n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of
                     :class:`.NumericExpression`
                     One or more row-indexed (if table) or entry-indexed
                     (if matrix table) expressions indicating the number of
                     samples used in the studies that generated the test
                     statistics supplied to ``chi_sq_exprs``.
    n_blocks : :obj:`int`
               The number of blocks used in the jackknife approach to
               estimating standard errors.
    two_step_threshold : :obj:`int`
                         Variants with chi-squared statistics greater than this
                         value are excluded in the first step of the two-step
                         procedure used to fit the model.
    n_reference_panel_variants : :obj:`int`, optional
                                 Number of variants used to estimate the
                                 SNP-heritability :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability estimates
        for each phenotype passed to the function."""

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or
            (len(n_samples_exprs) == 1))
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr',
            weight_expr,
            ds._row_indices)
    analyze('ld_score_regression/ld_score_expr',
            ld_score_expr,
            ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating
                from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if
                originating from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single
                phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr',
                chi_sq_exprs[0],
                ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr',
                n_samples_exprs[0],
                ds._entry_indices)

        ds = ds._select_all(row_exprs={'__locus': ds.locus,
                                       '__alleles': ds.alleles,
                                       '__w_initial': weight_expr,
                                       '__w_initial_floor': hl.max(weight_expr,
                                                                   1.0),
                                       '__x': ld_score_expr,
                                       '__x_floor': hl.max(ld_score_expr,
                                                           1.0)},
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={'__y': chi_sq_exprs[0],
                                         '__n': n_samples_exprs[0]})
        ds = ds.annotate_entries(**{'__w': ds.__w_initial})

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n, ds._row_indices)

        ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(**{'__locus': ds.locus,
                                 '__alleles': ds.alleles,
                                 '__w_initial': weight_expr,
                                 '__x': ld_score_expr},
                              **{y: chi_sq_exprs[i]
                                 for i, y in enumerate(ys)},
                              **{w: weight_expr for w in ws},
                              **{n: n_samples_exprs[i]
                                 for i, n in enumerate(ns)}))
        ds = ds.key_by(ds.__locus, ds.__alleles)

        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        hts = [ds.select(**{'__w_initial': ds.__w_initial,
                            '__w_initial_floor': hl.max(ds.__w_initial,
                                                        1.0),
                            '__x': ds.__x,
                            '__x_floor': hl.max(ds.__x, 1.0),
                            '__y_name': i,
                            '__y': ds[ys[i]],
                            '__w': ds[ws[i]],
                            '__n': hl.int(ds[ns[i]])})
               for i, y in enumerate(ys)]

        mts = [ht.to_matrix_table(row_key=['__locus',
                                           '__alleles'],
                                  col_key=['__y_name'],
                                  row_fields=['__w_initial',
                                              '__w_initial_floor',
                                              '__x',
                                              '__x_floor'])
               for ht in hts]

        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    mt_tmp_file1 = new_temp_file()
    ds.write(mt_tmp_file1)
    mt = hl.read_matrix_table(mt_tmp_file1)

    if not n_reference_panel_variants:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    # block variants for each phenotype
    n_phenotypes = mt.count_cols()

    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) &
                                         (mt.__y < two_step_threshold)),
                             __in_step2=hl.is_defined(mt.__y))

    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt.__in_step1),
                          __m_step2=hl.agg.count_where(mt.__in_step2))

    col_keys = list(mt.col_key)

    ht = mt.localize_entries(entries_array_field_name='__entries',
                             columns_array_field_name='__cols')

    ht = ht.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan.count_where(entry.__in_step1),
            ht.__entries),
        lambda step1_indices: hl.map(
            lambda i: hl.rbind(
                hl.int(hl.or_else(step1_indices[i], 0)),
                ht.__cols[i].__m_step1,
                ht.__entries[i],
                lambda step1_idx, m_step1, entry: hl.rbind(
                    hl.map(
                        lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))),
                        hl.range(0, n_blocks + 1)),
                    lambda step1_separators: hl.rbind(
                        hl.set(step1_separators).contains(step1_idx),
                        hl.sum(
                            hl.map(
                                lambda s1: step1_idx >= s1,
                                step1_separators)) - 1,
                        lambda is_separator, step1_block: entry.annotate(
                            __step1_block=step1_block,
                            __step2_block=hl.cond(~entry.__in_step1 & is_separator,
                                                  step1_block - 1,
                                                  step1_block))))),
            hl.range(0, hl.len(ht.__entries)))))

    mt = ht._unlocalize_entries('__entries', '__cols', col_keys)

    mt_tmp_file2 = new_temp_file()
    mt.write(mt_tmp_file2)
    mt = hl.read_matrix_table(mt_tmp_file2)
    
    # initial coefficient estimates
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)])
    mt = mt.annotate_cols(__step1_betas=mt.__initial_betas,
                          __step2_betas=mt.__initial_betas)

    # step 1 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step1,
            1.0/(mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] +
                                               mt.__step1_betas[1] *
                                               mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt.__in_step1,
            hl.agg.linreg(y=mt.__y,
                          x=[1.0, mt.__x],
                          weight=mt.__w).beta))
        mt = mt.annotate_cols(__step1_h2=hl.max(hl.min(
            mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt.__step1_betas[0],
            mt.__step1_h2 * hl.agg.mean(mt.__n) / M])

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=[
        hl.agg.filter((mt.__step1_block != i) & mt.__in_step1,
                      hl.agg.linreg(y=mt.__y,
                                    x=[1.0, mt.__x],
                                    weight=mt.__w).beta)
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x,
        mt.__step1_block_betas))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected)),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2,
                       mt.__step1_block_betas_bias_corrected)) -
                       hl.sum(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected))**2 /
                       n_blocks) /
            (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step2,
            1.0/(mt.__w_initial_floor *
                 2.0 * (mt.__step2_betas[0] +
                        mt.__step2_betas[1] *
                        mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            hl.agg.filter(mt.__in_step2,
                          hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                        x=[mt.__x],
                                        weight=mt.__w).beta[0])])
        mt = mt.annotate_cols(__step2_h2=hl.max(hl.min(
            mt.__step2_betas[1] * M/hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            mt.__step2_h2 * hl.agg.mean(mt.__n)/M])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=[
        hl.agg.filter((mt.__step2_block != i) & mt.__in_step2,
                      hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                    x=[mt.__x],
                                    weight=mt.__w).beta[0])
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x,
        mt.__step2_block_betas))

    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(
            mt.__step2_block_betas_bias_corrected),
        __step2_jackknife_variance=(
            hl.sum(mt.__step2_block_betas_bias_corrected**2) -
            hl.sum(mt.__step2_block_betas_bias_corrected)**2 /
            n_blocks) / (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0/(mt.__w_initial_floor *
                               2.0 * (mt.__initial_betas[0] +
                                      mt.__initial_betas[1] *
                                      mt.__x_floor)**2))

    mt = mt.annotate_cols(
        __final_betas=[
            mt.__step1_betas[0],
            mt.__step2_betas[1]],
        __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) /
             hl.agg.sum(mt.__step2_initial_w * mt.__x**2)))

    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt.__step2_block_betas[i] - mt.__c *
                   (mt.__step1_block_betas[i][0] - mt.__final_betas[0])),
        hl.range(0, n_blocks)))

    mt = mt.annotate_cols(
        __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] -
                                            (n_blocks - 1) *
                                            mt.__final_block_betas))

    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt.__step1_jackknife_mean[0],
            hl.mean(mt.__final_block_betas_bias_corrected)],
        __final_jackknife_variance=[
            mt.__step1_jackknife_variance[0],
            (hl.sum(mt.__final_block_betas_bias_corrected**2) -
             hl.sum(mt.__final_block_betas_bias_corrected)**2 /
             n_blocks) / (n_blocks - 1) / n_blocks])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt.__y_name,
        mean_chi_sq=hl.agg.mean(mt.__y),
        intercept=hl.struct(
            estimate=mt.__final_betas[0],
            standard_error=hl.sqrt(mt.__final_jackknife_variance[0])),
        snp_heritability=hl.struct(
            estimate=(M/hl.agg.mean(mt.__n)) * mt.__final_betas[1],
            standard_error=hl.sqrt((M/hl.agg.mean(mt.__n))**2 *
                                   mt.__final_jackknife_variance[1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht.phenotype)
    ht = ht.select(ht.mean_chi_sq,
                   ht.intercept,
                   ht.snp_heritability)

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)
    
    return ht
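Every jackknife in this function follows the same recipe: turn each leave-one-block-out estimate into a pseudovalue, n_blocks * theta_hat - (n_blocks - 1) * theta_block, then take the mean of the pseudovalues as the point estimate and their scaled spread as the variance. A plain-Python sketch of those two formulas (all estimates are made up):

n_blocks = 4
theta_hat = 0.50                            # hypothetical full-sample estimate
block_estimates = [0.48, 0.52, 0.49, 0.51]  # hypothetical leave-one-out estimates

pseudovalues = [n_blocks * theta_hat - (n_blocks - 1) * b
                for b in block_estimates]
jk_mean = sum(pseudovalues) / n_blocks      # 0.5
jk_variance = ((sum(pv**2 for pv in pseudovalues)
                - sum(pseudovalues)**2 / n_blocks)
               / (n_blocks - 1) / n_blocks)  # 0.00075
print(jk_mean, jk_variance)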
Example #32
0
    def export(self, path, delimiter='\t', missing='NA', header=True):
        """Export a field to a text file.

        Examples
        --------

        >>> small_mt.GT.export('output/gt.tsv')
        >>> with open('output/gt.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        locus	alleles	0	1	2	3
        1:1	["A","C"]	0/1	0/1	0/0	0/0
        1:2	["A","C"]	1/1	0/1	1/1	1/1
        1:3	["A","C"]	1/1	0/1	0/1	0/0
        1:4	["A","C"]	1/1	0/1	1/1	1/1
        <BLANKLINE>

        >>> small_mt.GT.export('output/gt-no-header.tsv', header=False)
        >>> with open('output/gt-no-header.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        1:1	["A","C"]	0/1	0/1	0/0	0/0
        1:2	["A","C"]	1/1	0/1	1/1	1/1
        1:3	["A","C"]	1/1	0/1	0/1	0/0
        1:4	["A","C"]	1/1	0/1	1/1	1/1
        <BLANKLINE>

        >>> small_mt.pop.export('output/pops.tsv')
        >>> with open('output/pops.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        sample_idx	pop
        0	2
        1	2
        2	0
        3	2
        <BLANKLINE>

        >>> small_mt.ancestral_af.export('output/ancestral_af.tsv')
        >>> with open('output/ancestral_af.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        locus	alleles	ancestral_af
        1:1	["A","C"]	5.3905e-01
        1:2	["A","C"]	8.6768e-01
        1:3	["A","C"]	4.3765e-01
        1:4	["A","C"]	7.6300e-01
        <BLANKLINE>

        >>> mt = small_mt
        >>> small_mt.bn.export('output/bn.tsv')
        >>> with open('output/bn.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        bn
        {"n_populations":3,"n_samples":4,"n_variants":4,"n_partitions":8,"pop_dist":[1,1,1],"fst":[0.1,0.1,0.1],"mixture":false}
        <BLANKLINE>


        Notes
        -----

        For entry-indexed expressions, if there is one column key field, the
        result of calling :func:`~hail.expr.functions.str` on that field is used as
        the column header. Otherwise, each compound column key is converted to
        JSON and used as a column header. For example:

        >>> small_mt = small_mt.key_cols_by(s=small_mt.sample_idx, family='fam1')
        >>> small_mt.GT.export('output/gt-no-header.tsv')
        >>> with open('output/gt-no-header.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        locus	alleles	{"s":0,"family":"fam1"}	{"s":1,"family":"fam1"}	{"s":2,"family":"fam1"}	{"s":3,"family":"fam1"}
        1:1	["A","C"]	0/1	0/1	0/0	0/0
        1:2	["A","C"]	1/1	0/1	1/1	1/1
        1:3	["A","C"]	1/1	0/1	0/1	0/0
        1:4	["A","C"]	1/1	0/1	1/1	1/1
        <BLANKLINE>


        Parameters
        ----------
        path : :class:`str`
            The path to which to export.
        delimiter : :class:`str`
            The string for delimiting columns.
        missing : :class:`str`
            The string to output for missing values.
        header : :obj:`bool`
            When ``True`` include a header line.
        """
        uid = Env.get_uid()
        self_name, ds = self._to_relational_preserving_rows_and_cols(uid)
        if isinstance(ds, hl.Table):
            ds.export(output=path, delimiter=delimiter, header=header)
        else:
            assert len(self._indices.axes) == 2
            entries, cols = Env.get_uid(), Env.get_uid()
            t = ds.select_cols().localize_entries(entries, cols)
            t = t.order_by(*t.key)
            output_col_name = Env.get_uid()
            entry_array = t[entries]
            if self_name:
                entry_array = hl.map(lambda x: x[self_name], entry_array)
            entry_array = hl.map(
                lambda x: hl.if_else(hl.is_missing(x), missing, hl.str(x)),
                entry_array)
            file_contents = t.select(
                **{k: hl.str(t[k])
                   for k in ds.row_key},
                **{output_col_name: hl.delimit(entry_array, delimiter)})
            if header:
                col_key = t[cols]
                if len(ds.col_key) == 1:
                    col_key = hl.map(lambda x: x[0], col_key)
                column_names = hl.map(hl.str,
                                      col_key).collect(_localize=False)[0]
                header_table = hl.utils.range_table(1).key_by().select(
                    **{k: k
                       for k in ds.row_key},
                    **{output_col_name: hl.delimit(column_names, delimiter)})
                file_contents = header_table.union(file_contents)
            file_contents.export(path, delimiter=delimiter, header=False)
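As a hedged usage sketch of the delimiter and missing parameters (the toy dataset and output path are invented):

import hail as hl

# Invented toy matrix: one population, two samples, two variants.
mt = hl.balding_nichols_model(n_populations=1, n_samples=2, n_variants=2)
mt = mt.annotate_entries(dp=hl.missing(hl.tint32))
# Entry-indexed export: row key columns, then one column per sample.
mt.dp.export('/tmp/dp.tsv', delimiter=',', missing='.')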
Example #33
0
 def _coerce(self, x: Expression):
     assert isinstance(x, hl.expr.SetExpression)
     return hl.map(lambda x_: self.ec.coerce(x_), x)
Example #34
0
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False, min_partitions=None) -> hl.Table:
    """Import a GTF file.

       The GTF file format is identical to the GFF version 2 file format,
       and so this function can be used to import GFF version 2 files as
       well.

       See https://www.ensembl.org/info/website/upload/gff.html for more
       details on the GTF/GFF2 file format.

       The :class:`.Table` returned by this function will be keyed by the
       ``interval`` row field and will include the following row fields:

       .. code-block:: text

           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'interval': interval<>

       There will also be corresponding fields for every tag found in the
       attribute field of the GTF file.

       Note
       ----

       This function will return an ``interval`` field of type :class:`.tinterval`
       constructed from the ``seqname``, ``start``, and ``end`` fields in the
       GTF file. This interval is inclusive of both the start and end positions
       in the GTF file. 

       If the ``reference_genome`` parameter is specified, the start and end
       points of the ``interval`` field will be of type :class:`.tlocus`.
       Otherwise, the start and end points of the ``interval`` field will be of
       type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
       ``position`` (type :class:`.tint32`).

       Furthermore, if the ``reference_genome`` parameter is specified and
       ``skip_invalid_contigs`` is ``True``, this import function will skip
       lines in the GTF where ``seqname`` is not consistent with the reference
       genome specified.

       Example
       -------

       >>> ht = hl.experimental.import_gtf('data/test.gtf', 
       ...                                 reference_genome='GRCh37',
       ...                                 skip_invalid_contigs=True)

       >>> ht.describe()  # doctest: +NOTEST
       ----------------------------------------
       Global fields:
       None
       ----------------------------------------
       Row fields:
           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'gene_type': str
           'exon_id': str
           'havana_transcript': str
           'level': str
           'transcript_name': str
           'gene_status': str
           'gene_id': str
           'transcript_type': str
           'tag': str
           'transcript_status': str
           'gene_name': str
           'transcript_id': str
           'exon_number': str
           'havana_gene': str
           'interval': interval<locus<GRCh37>>
       ----------------------------------------
       Key: ['interval']
       ----------------------------------------

       Parameters
       ----------

       path : :obj:`str`
           File to import.
       reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
           Reference genome to use.
       skip_invalid_contigs : :obj:`bool`
           If ``True`` and `reference_genome` is not ``None``, skip lines where
           ``seqname`` is not consistent with the reference genome.
       min_partitions : :obj:`int` or :obj:`None`
           Minimum number of partitions (passed to import_table).

       Returns
       -------
       :class:`.Table`
       """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint,
                                'f4': hl.tint,
                                'f5': hl.tfloat,
                                'f7': hl.tint},
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({'f0': 'seqname',
                    'f1': 'source',
                    'f2': 'feature',
                    'f3': 'start',
                    'f4': 'end',
                    'f5': 'score',
                    'f6': 'strand',
                    'f7': 'frame',
                    'f8': 'attribute'})

    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    attributes = ht.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x), ht['attribute'].keys()))

    ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x),
                                          ht['attribute'][x])
                         for x in attributes if x})

    if reference_genome:
        if reference_genome == 'GRCh37':
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            ht = ht.annotate(seqname=hl.case()
                                       .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                                       .when(ht['seqname'].startswith('chrHLA'), ht['seqname'].replace('^chr', ''))
                                       .when(ht['seqname'].startswith('chr'), ht['seqname'])
                                       .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(hl.get_reference(reference_genome).contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(interval=hl.locus_interval(ht['seqname'],
                                                     ht['start'],
                                                     ht['end'],
                                                     includes_start=True,
                                                     includes_end=True,
                                                     reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(hl.struct(seqname=ht['seqname'], position=ht['start']),
                                               hl.struct(seqname=ht['seqname'], position=ht['end']),
                                               includes_start=True,
                                               includes_end=True))

    ht = ht.key_by('interval')

    return ht
Example #35
0
                'SMRRNANM',
                'SMVQCFL',
                'SMTRSCPT',
                'SMMPPDPR',
                'SMCGLGTH',
                'SMUNPDRD',
                'SMMPPDUN',
                'SME2ANTI',
                'SMALTALG',
                'SME2SNSE',
                'SMMFLGTH',
                'SMSPLTRD',
                'SME1ANTI',
                'SME1SNSE',
                'SMNUM5CD']

    ht_samples = ht_samples.annotate(**{x: hl.float(ht_samples[x]) for x in float_cols})
    ht_samples = ht_samples.annotate(**{x: hl.int(ht_samples[x].replace('.0$', '')) for x in int_cols})

    ht = ht.filter(ht.feature_type == 'gene')
    ht = ht.annotate(interval=hl.interval(hl.locus(ht['contig'], ht['start'], 'GRCh37'), hl.locus(ht['contig'], ht['end'] + 1, 'GRCh37')))
    ht = ht.annotate(attributes=hl.dict(hl.map(lambda x: (x.split(' ')[0], x.split(' ')[1].replace('"', '').replace(';$', '')), ht['attributes'].split('; '))))
    attribute_cols = list(ht.aggregate(hl.set(hl.flatten(hl.agg.collect(ht.attributes.keys())))))
    ht = ht.annotate(**{x: hl.or_missing(ht.attributes.contains(x), ht.attributes[x]) for x in attribute_cols})
    ht = ht.select(*(['gene_id', 'interval', 'gene_type', 'strand', 'annotation_source', 'havana_gene', 'gene_status', 'tag']))
    ht = ht.rename({'havana_gene': 'havana_gene_id'})
    ht = ht.key_by(ht.gene_id)
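The integer metadata columns in this snippet arrive as strings like '5.0', so the code strips a trailing '.0' (a regex pattern, since StringExpression.replace is regex-based) before casting with hl.int. A one-line sketch of that coercion on an invented literal:

import hail as hl

print(hl.eval(hl.int(hl.literal('5.0').replace('.0$', ''))))  # 5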

"""