예제 #1
0
파일: misc.py 프로젝트: danking/hail
def rename_duplicates(dataset, name='unique_id') -> MatrixTable:
    """Add a column field holding a de-duplicated copy of the column key.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    >>> renamed = hl.rename_duplicates(dataset).cols()
    >>> duplicate_samples = (renamed.filter(renamed.s != renamed.unique_id)
    ...                             .select()
    ...                             .collect())

    Notes
    -----

    The new column field is derived from the string column key; a suffix
    ``_N`` is appended wherever it is needed to make the value unique. For
    instance, when the column key "NA12878" occurs three times, the first
    occurrence yields "NA12878", the second "NA12878_1", and the third
    "NA12878_2". The new field is called `name`.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name of new field.

    Returns
    -------
    :class:`.MatrixTable`
    """

    # The renaming itself is delegated to the Java matrix table.
    renamed_jmt = dataset._jmt.renameDuplicates(name)
    return MatrixTable._from_java(renamed_jmt)
예제 #2
0
def rename_duplicates(dataset, name='unique_id') -> MatrixTable:
    """Produce a uniquified column field from the string column key.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    >>> renamed = hl.rename_duplicates(dataset).cols()
    >>> duplicate_samples = (renamed.filter(renamed.s != renamed.unique_id)
    ...                             .select()
    ...                             .collect())

    Notes
    -----

    A new column field, named by `name`, is created from the string column
    key. Where a key value repeats, a unique suffix ``_N`` is appended: if the
    column key "NA12878" appears three times in the dataset, the resulting
    values are "NA12878", "NA12878_1", and "NA12878_2", in that order.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name of new field.

    Returns
    -------
    :class:`.MatrixTable`
    """

    # Delegate to the Java implementation, then wrap the returned handle.
    java_result = dataset._jmt.renameDuplicates(name)
    return MatrixTable._from_java(java_result)
예제 #3
0
    def read_multiple_matrix_tables(self, paths: 'List[str]', intervals: 'List[hl.Interval]', intervals_type):
        """Read several matrix tables through the Java backend.

        The request is sent to ``pyReadMultipleMatrixTables`` as a JSON string
        with the keys ``paths``, ``intervals`` and ``intervalPointType``.

        Parameters
        ----------
        paths : list of str
            Paths passed through unchanged in the request.
        intervals : list of :class:`.Interval`
            Intervals, serialized with ``intervals_type._convert_to_json``.
        intervals_type
            Type of `intervals`; its ``element_type.point_type`` is sent as a
            parsable string.

        Returns
        -------
        list of :class:`.MatrixTable`
            One wrapper per object returned by the backend call.
        """
        # Keys are inserted in the same order as the wire format expects them
        # to be dumped.
        request = {}
        request['paths'] = paths
        request['intervals'] = intervals_type._convert_to_json(intervals)
        request['intervalPointType'] = intervals_type.element_type.point_type._parsable_string()

        payload = json.dumps(request)
        java_tables = self._jhc.backend().pyReadMultipleMatrixTables(payload)
        return [MatrixTable._from_java(java_table) for java_table in java_tables]
예제 #4
0
def trio_matrix(dataset, pedigree, complete_trios=False) -> MatrixTable:
    """Build a matrix table whose columns are trios and whose entries hold the trio's genotypes.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    Create a trio matrix:

    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    Notes
    -----

    The result has one column per trio. When `complete_trios` is ``True``,
    only trios satisfying :meth:`.Trio.is_complete` are kept. Columns of the
    new dataset are identified by the sample ID of the trio's proband. Column
    fields and entries change as described below.

    The new column fields consist of three structs (`proband`, `father`,
    `mother`), a Boolean field, and two string fields:

    - **proband** (:class:`.tstruct`) - Column fields on the proband.
    - **father** (:class:`.tstruct`) - Column fields on the father.
    - **mother** (:class:`.tstruct`) - Column fields on the mother.
    - **id** (:py:data:`.tstr`) - Column key for the proband.
    - **is_female** (:py:data:`.tbool`) - Proband is female.
      ``True`` for female, ``False`` for male, missing if unknown.
    - **fam_id** (:py:data:`.tstr`) - Family ID.

    The new entry fields are:

    - **proband_entry** (:class:`.tstruct`) - Proband entry fields.
    - **father_entry** (:class:`.tstruct`) - Father entry fields.
    - **mother_entry** (:class:`.tstruct`) - Mother entry fields.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    pedigree : :class:`.Pedigree`
        Pedigree defining the trios.
    complete_trios : :obj:`bool`
        If ``True``, include only trios that satisfy
        :meth:`.Trio.is_complete`.

    Returns
    -------
    :class:`.MatrixTable`
    """
    # The pedigree is handed to Java through its `_jrep` handle.
    trio_jmt = dataset._jmt.trioMatrix(pedigree._jrep, complete_trios)
    return MatrixTable._from_java(trio_jmt)
예제 #5
0
파일: misc.py 프로젝트: danking/hail
def window_by_locus(mt: MatrixTable, bp_window_size: int) -> MatrixTable:
    """Gather row and entry values from preceding loci into arrays.

    .. include:: ../_templates/req_tlocus.rst

    .. include:: ../_templates/experimental.rst

    Examples
    --------
    >>> ds_result = hl.window_by_locus(ds, 3)

    Notes
    -----
    Each row (variant) is grouped with the rows that precede it within a
    window of `bp_window_size` base pairs. Row values of those preceding
    variants are stored in `prev_rows` (row field of type ``array<struct>``),
    and their entry values in `prev_entries` (entry field of type
    ``array<struct>``).

    The `bp_window_size` argument is inclusive; if `bp_window_size` is 2 and
    the loci are

    .. code-block:: text

        1:100
        1:100
        1:102
        1:102
        1:103
        2:100
        2:101

    then the size of `prev_rows` is 0, 1, 2, 3, 2, 0, and 1, respectively (and
    same for the size of `prev_entries`).

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Input dataset.
    bp_window_size : :obj:`int`
        Base pairs to include in the backwards window (inclusive).

    Returns
    -------
    :class:`.MatrixTable`
    """
    # Validate the key before handing off to Java.
    require_first_key_field_locus(mt, 'window_by_locus')
    windowed_jmt = mt._jmt.windowVariants(bp_window_size)
    return MatrixTable._from_java(windowed_jmt)
예제 #6
0
def trio_matrix(dataset, pedigree, complete_trios=False) -> MatrixTable:
    """Return a matrix table with one column per trio and per-trio genotype entries.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    Create a trio matrix:

    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    Notes
    -----

    A new matrix table is built in which every column represents a trio. If
    `complete_trios` is ``True``, trios that do not satisfy
    :meth:`.Trio.is_complete` are left out. The column identifiers of the new
    dataset are the sample IDs of the trio probands. Column fields and entries
    are restructured as follows.

    The new column fields consist of three structs (`proband`, `father`,
    `mother`), a Boolean field, and two string fields:

    - **proband** (:class:`.tstruct`) - Column fields on the proband.
    - **father** (:class:`.tstruct`) - Column fields on the father.
    - **mother** (:class:`.tstruct`) - Column fields on the mother.
    - **id** (:py:data:`.tstr`) - Column key for the proband.
    - **is_female** (:py:data:`.tbool`) - Proband is female.
      ``True`` for female, ``False`` for male, missing if unknown.
    - **fam_id** (:py:data:`.tstr`) - Family ID.

    The new entry fields are:

    - **proband_entry** (:class:`.tstruct`) - Proband entry fields.
    - **father_entry** (:class:`.tstruct`) - Father entry fields.
    - **mother_entry** (:class:`.tstruct`) - Mother entry fields.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    pedigree : :class:`.Pedigree`
        Pedigree defining the trios.
    complete_trios : :obj:`bool`
        If ``True``, include only trios that satisfy
        :meth:`.Trio.is_complete`.

    Returns
    -------
    :class:`.MatrixTable`
    """
    # Java performs the restructuring; Python only wraps the result.
    java_trio_matrix = dataset._jmt.trioMatrix(pedigree._jrep, complete_trios)
    return MatrixTable._from_java(java_trio_matrix)
예제 #7
0
def window_by_locus(mt: MatrixTable, bp_window_size: int) -> MatrixTable:
    """Collect, for every row, the row and entry values of preceding loci.

    .. include:: ../_templates/req_tlocus.rst

    .. include:: ../_templates/experimental.rst

    Examples
    --------
    >>> ds_result = hl.window_by_locus(ds, 3)

    Notes
    -----
    Every row (variant) is paired with the earlier rows that fall inside a
    window of `bp_window_size` base pairs. The row values of those earlier
    variants go into `prev_rows` (row field of type ``array<struct>``) and
    their entry values go into `prev_entries` (entry field of type
    ``array<struct>``).

    The `bp_window_size` argument is inclusive; if `bp_window_size` is 2 and
    the loci are

    .. code-block:: text

        1:100
        1:100
        1:102
        1:102
        1:103
        2:100
        2:101

    then the size of `prev_rows` is 0, 1, 2, 3, 2, 0, and 1, respectively (and
    same for the size of `prev_entries`).

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Input dataset.
    bp_window_size : :obj:`int`
        Base pairs to include in the backwards window (inclusive).

    Returns
    -------
    :class:`.MatrixTable`
    """
    # The key check runs first, before any Java call is made.
    require_first_key_field_locus(mt, 'window_by_locus')
    java_windowed = mt._jmt.windowVariants(bp_window_size)
    return MatrixTable._from_java(java_windowed)
예제 #8
0
파일: backend.py 프로젝트: shutianxu/hail
 def unpersist_matrix_table(self, mt):
     """Call ``pyUnpersist`` on the Java IR of ``mt`` and wrap the result as a :class:`.MatrixTable`."""
     java_ir = self._to_java_ir(mt._mir)
     return MatrixTable._from_java(java_ir.pyUnpersist())
예제 #9
0
파일: backend.py 프로젝트: shutianxu/hail
 def persist_matrix_table(self, mt, storage_level):
     """Call ``pyPersist(storage_level)`` on the Java IR of ``mt`` and wrap the result as a :class:`.MatrixTable`."""
     java_ir = self._to_java_ir(mt._mir)
     return MatrixTable._from_java(java_ir.pyPersist(storage_level))
예제 #10
0
파일: backend.py 프로젝트: Quiltomics/hail
 def unpersist_matrix_table(self, mt, storage_level=None):
     """Unpersist ``mt`` via its Java matrix table and wrap the result.

     Parameters
     ----------
     mt : :class:`.MatrixTable`
         Matrix table whose ``_jmt`` handle is unpersisted.
     storage_level
         Not used by this method. Kept so existing callers that pass it keep
         working; it is optional because unpersisting takes no storage level.

     Returns
     -------
     :class:`.MatrixTable`
     """
     return MatrixTable._from_java(mt._jmt.unpersist())
예제 #11
0
 def persist_matrix_table(self, mt, storage_level):
     """Persist ``mt`` through the Java backend's ``pyPersistMatrix`` and wrap the result as a :class:`.MatrixTable`."""
     persist_matrix = self._jbackend.pyPersistMatrix
     java_matrix_ir = self._to_java_matrix_ir(mt._mir)
     return MatrixTable._from_java(persist_matrix(storage_level, java_matrix_ir))
예제 #12
0
파일: backend.py 프로젝트: tpoterba/hail
 def unpersist_matrix_table(self, mt):
     """Build the Java IR for ``mt``, call its ``pyUnpersist``, and return the result as a :class:`.MatrixTable`."""
     jir = self._to_java_ir(mt._mir)
     unpersisted = jir.pyUnpersist()
     return MatrixTable._from_java(unpersisted)
예제 #13
0
파일: backend.py 프로젝트: tpoterba/hail
 def persist_matrix_table(self, mt, storage_level):
     """Build the Java IR for ``mt``, call its ``pyPersist`` with ``storage_level``, and return the result as a :class:`.MatrixTable`."""
     jir = self._to_java_ir(mt._mir)
     persisted = jir.pyPersist(storage_level)
     return MatrixTable._from_java(persisted)
예제 #14
0
 def unpersist_matrix_table(self, mt):
     """Call ``unpersist`` on the Java matrix table behind ``mt`` and wrap the result as a :class:`.MatrixTable`."""
     unpersisted_jmt = mt._jmt.unpersist()
     return MatrixTable._from_java(unpersisted_jmt)
예제 #15
0
파일: misc.py 프로젝트: danking/hail
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset, even
    on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on.  The point type of the interval must
        be a prefix of the key or equal to the first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in `intervals`.
        If ``False``, keep only rows that fall outside all intervals in
        `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`

    Raises
    ------
    TypeError
        If `ds` is neither a :class:`.MatrixTable` nor a :class:`.Table`, if
        the interval point type is incompatible with the key type of `ds`, or
        if `intervals` contains a missing value.

    """

    if isinstance(ds, MatrixTable):
        k_type = ds.row_key.dtype
    elif isinstance(ds, Table):
        k_type = ds.key.dtype
    else:
        # Raise explicitly rather than `assert`: assertions are stripped under
        # `python -O`, which would let an unsupported input reach `ds.key`.
        raise TypeError("'filter_intervals' expects a MatrixTable or Table, found '{}'".format(type(ds).__name__))

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        # The names in `partial` must be the leading names of `full`, in the
        # same order, and `full[k]` must equal `partial`'s value for each one.
        if list(partial) != list(full)[:len(partial)]:
            return False
        for k, v in partial.items():
            if full[k] != v:
                return False
        return True

    if point_type == k_type[0]:
        # Bare point matching the first key field: endpoints must be wrapped
        # in a struct before being passed on (see `wrap_input`).
        needs_wrapper = True
    elif isinstance(point_type, tstruct) and is_struct_prefix(point_type, k_type):
        # Already a struct that is a prefix of the key; use as-is.
        needs_wrapper = False
    else:
        raise TypeError("The point type is incompatible with key type of the dataset ('{}', '{}')".format(repr(point_type), repr(k_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError("'filter_intervals' does not allow missing values in 'intervals'.")
        elif needs_wrapper:
            # NOTE(review): the field name `foo` looks like a placeholder;
            # presumably only the struct shape matters downstream - confirm.
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end),
                            interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    # The interval expression is evaluated locally, then each interval is
    # converted to its Java representation.
    intervals = [wrap_input(x)._jrep for x in hl.eval(intervals)]
    if isinstance(ds, MatrixTable):
        jmt = Env.hail().methods.MatrixFilterIntervals.apply(ds._jmt, intervals, keep)
        return MatrixTable._from_java(jmt)
    else:
        jt = Env.hail().methods.TableFilterIntervals.apply(ds._jt, intervals, keep)
        return Table._from_java(jt)
예제 #16
0
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset, even
    on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on.  The point type of the interval must
        be a prefix of the key or equal to the first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in `intervals`.
        If ``False``, keep only rows that fall outside all intervals in
        `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`

    Raises
    ------
    TypeError
        If `ds` is neither a :class:`.MatrixTable` nor a :class:`.Table`, if
        the interval point type is incompatible with the key type of `ds`, or
        if `intervals` contains a missing value.

    """

    if isinstance(ds, MatrixTable):
        k_type = ds.row_key.dtype
    elif isinstance(ds, Table):
        k_type = ds.key.dtype
    else:
        # Raise explicitly rather than `assert`: assertions are stripped under
        # `python -O`, which would let an unsupported input reach `ds.key`.
        raise TypeError(
            "'filter_intervals' expects a MatrixTable or Table, found '{}'"
            .format(type(ds).__name__))

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        # The names in `partial` must be the leading names of `full`, in the
        # same order, and `full[k]` must equal `partial`'s value for each one.
        if list(partial) != list(full)[:len(partial)]:
            return False
        for k, v in partial.items():
            if full[k] != v:
                return False
        return True

    if point_type == k_type[0]:
        # Bare point matching the first key field: endpoints must be wrapped
        # in a struct before being passed on (see `wrap_input`).
        needs_wrapper = True
    elif isinstance(point_type, tstruct) and is_struct_prefix(
            point_type, k_type):
        # Already a struct that is a prefix of the key; use as-is.
        needs_wrapper = False
    else:
        raise TypeError(
            "The point type is incompatible with key type of the dataset ('{}', '{}')"
            .format(repr(point_type), repr(k_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError(
                "'filter_intervals' does not allow missing values in 'intervals'."
            )
        elif needs_wrapper:
            # NOTE(review): the field name `foo` looks like a placeholder;
            # presumably only the struct shape matters downstream - confirm.
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end), interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    # The interval expression is evaluated locally, then each interval is
    # converted to its Java representation.
    intervals = [wrap_input(x)._jrep for x in hl.eval(intervals)]
    if isinstance(ds, MatrixTable):
        jmt = Env.hail().methods.MatrixFilterIntervals.apply(
            ds._jmt, intervals, keep)
        return MatrixTable._from_java(jmt)
    else:
        jt = Env.hail().methods.TableFilterIntervals.apply(
            ds._jt, intervals, keep)
        return Table._from_java(jt)