Example #1
def impute_mean(ds: Dataset, dim: str, variable='data'):
    """Mean impute variable in a dataset
    
    Parameters
    ----------
    ds : Dataset
        Dataset with some variable to impute.
        When missing values are present, this **must** contain
        an `is_masked` variable (otherwise nothing will 
        happen).
    dim : str
        Dimension over which means should be computed.
        For example, in a (variants, samples) data array,
        `dim='variants'` means that means are computed
        for each sample and missing values are replaced
        with the associated per-sample mean.
    variable : str
        Variable in `ds` to impute

    Returns
    -------
    Dataset
        Dataset with `variable` imputed and `is_masked` dropped.  
        Note that this often leads to a type change (e.g. int8 -> float64)
    """
    if 'is_masked' not in ds:
        return ds
    return ds.assign(
        **{
            variable: lambda ds: xr.where(
                ds.is_masked, ds[variable].mean(dim=dim), ds[variable]
            )
        }
    ).drop('is_masked')
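A minimal usage sketch on made-up toy data (assuming `impute_mean` as defined above is in scope; masked entries are stored as NaN so they are excluded from the per-sample mean):

import numpy as np
import xarray as xr
from xarray import Dataset

# Toy (variants, samples) data with one masked entry per sample.
data = np.array([[1.0, 2.0],
                 [np.nan, 4.0],
                 [3.0, np.nan]])
ds = Dataset({
    "data": (("variants", "samples"), data),
    "is_masked": (("variants", "samples"), np.isnan(data)),
})

imputed = impute_mean(ds, dim="variants")
print(imputed["data"].values)
# [[1. 2.]
#  [2. 4.]
#  [3. 3.]]   <- masked entries replaced by each sample's mean over variants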
Example #2
def _normalize_lon_360(ds: xr.Dataset) -> xr.Dataset:
    """
    Fix the longitude of the given dataset ``ds`` so that it ranges from -180 to +180 degrees.

    :param ds: The dataset whose longitudes may be given in the range 0 to 360.
    :return: The fixed dataset or the original dataset.
    """

    if 'lon' not in ds.coords:
        return ds

    lon_var = ds.coords['lon']

    if len(lon_var.shape) != 1:
        return ds

    lon_size = lon_var.shape[0]
    if lon_size < 2:
        return ds

    lon_size_05 = lon_size // 2
    lon_values = lon_var.values
    if not np.any(lon_values[lon_size_05:] > 180.):
        return ds

    delta_lon = lon_values[1] - lon_values[0]

    var_names = list(ds.data_vars)

    ds = ds.assign_coords(
        lon=xr.DataArray(np.linspace(-180. + 0.5 * delta_lon, +180. -
                                     0.5 * delta_lon, lon_size),
                         dims=ds['lon'].dims,
                         attrs=dict(long_name='longitude',
                                    standard_name='longitude',
                                    units='degrees east')))

    ds = adjust_spatial_attrs_impl(ds, True)

    new_vars = dict()
    for var_name in var_names:
        var = ds[var_name]
        if len(var.dims) >= 1 and var.dims[-1] == 'lon':
            values = np.copy(var.values)
            temp = np.copy(values[..., :lon_size_05])
            values[..., :lon_size_05] = values[..., lon_size_05:]
            values[..., lon_size_05:] = temp
            new_vars[var_name] = xr.DataArray(values,
                                              dims=var.dims,
                                              attrs=var.attrs,
                                              encoding=var.encoding)

    return ds.assign(**new_vars)
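The snippet depends on `adjust_spatial_attrs_impl`, so it is not runnable on its own. Below is a standalone sketch of the underlying idea on toy data (names and values made up): relabel a 0..360 longitude axis to -180..180 and roll the data by half the longitude size so the values stay aligned with the new coordinates.

import numpy as np
import xarray as xr

lon = np.arange(0.0, 360.0, 60.0)                 # 0, 60, ..., 300
ds = xr.Dataset(
    {"t2m": (("lat", "lon"), np.arange(12.0).reshape(2, 6))},
    coords={"lat": [-45.0, 45.0], "lon": lon},
)

half = lon.size // 2
delta = lon[1] - lon[0]
new_lon = np.linspace(-180.0 + 0.5 * delta, 180.0 - 0.5 * delta, lon.size)

# Same relabel-and-swap as above, expressed with xarray's roll.
shifted = ds.assign_coords(lon=new_lon).roll(lon=half, roll_coords=False)
print(shifted["t2m"].values[0])                    # [3. 4. 5. 0. 1. 2.]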
Example #3
def pack_variables(ds: Dataset) -> Dataset:
    # Remove dosage as it is unnecessary and should be redefined
    # based on encoded probabilities later (w/ reduced precision)
    ds = ds.drop_vars(["call_dosage", "call_dosage_mask"], errors="ignore")

    # Remove homozygous reference GP and redefine mask
    gp = ds["call_genotype_probability"][..., 1:]
    gp_mask = ds["call_genotype_probability_mask"].any(dim="genotypes")
    ds = ds.drop_vars(["call_genotype_probability", "call_genotype_probability_mask"])
    ds = ds.assign(call_genotype_probability=gp, call_genotype_probability_mask=gp_mask)
    return ds
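A toy usage sketch (hypothetical shapes and variable contents; assumes `pack_variables` above is in scope):

import numpy as np
from xarray import Dataset

rng = np.random.default_rng(0)
gp = rng.random((2, 2, 3))                         # (variants, samples, genotypes)
ds = Dataset({
    "call_genotype_probability": (("variants", "samples", "genotypes"), gp),
    "call_genotype_probability_mask": (
        ("variants", "samples", "genotypes"), np.zeros((2, 2, 3), dtype=bool)),
    "call_dosage": (("variants", "samples"), gp[..., 1] + 2 * gp[..., 2]),
})

packed = pack_variables(ds)
# Dosage dropped, homozygous-reference probability removed, mask collapsed per call.
print(packed["call_genotype_probability"].sizes["genotypes"])   # 2
print("call_dosage" in packed)                                   # False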
Example #4
def to(ds: Dataset) -> Dataset:
    if not is_shape_match(ds, {DIM_PLOIDY: 2, DIM_ALLELE: 2}):
        raise ValueError(
            'Dosage calculation currently only supported for bi-allelic, '
            'diploid arrays (ploidy and allele dims must have size 2)')
    # Get array slices for ref and alt probabilities on each chromosome
    c0ref, c1ref = ds.data[..., 0, 0], ds.data[..., 1, 0]
    c0alt, c1alt = ds.data[..., 0, 1], ds.data[..., 1, 1]
    # Compute dosage as float in [0, 2]
    data = c0ref * c1alt + c0alt * c1ref + 2 * c0alt * c1alt
    data = _mask(ds.assign(data=data))
    return _transmute(GenotypeDosageDataset.create, ds, data)
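`_mask`, `_transmute`, `is_shape_match` and `GenotypeDosageDataset` come from the surrounding project, so the method is not runnable in isolation. A standalone sketch of just the dosage arithmetic on a single toy call (dims and values made up):

import numpy as np
import xarray as xr

# (variants, samples, ploidy, alleles): per-chromosome P(ref) and P(alt).
probs = xr.DataArray(
    np.array([[[[0.9, 0.1],      # chromosome 0: P(ref)=0.9, P(alt)=0.1
                [0.2, 0.8]]]]),  # chromosome 1: P(ref)=0.2, P(alt)=0.8
    dims=("variants", "samples", "ploidy", "alleles"),
)

c0ref, c1ref = probs[..., 0, 0], probs[..., 1, 0]
c0alt, c1alt = probs[..., 0, 1], probs[..., 1, 1]

# Expected alternate-allele count in [0, 2].
dosage = c0ref * c1alt + c0alt * c1ref + 2 * c0alt * c1alt
print(dosage.item())             # 0.9*0.8 + 0.1*0.2 + 2*0.1*0.8 = 0.9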
Example #5
def _normalize_lon_360(ds: xr.Dataset) -> xr.Dataset:
    """
    Fix the longitude of the given dataset ``ds`` so that it ranges from -180 to +180 degrees.

    :param ds: The dataset whose longitudes may be given in the range 0 to 360.
    :return: The fixed dataset or the original dataset.
    """

    if 'lon' not in ds.coords:
        return ds

    lon_var = ds.coords['lon']

    if len(lon_var.shape) != 1:
        return ds

    lon_size = lon_var.shape[0]
    if lon_size < 2:
        return ds

    lon_size_05 = lon_size // 2
    lon_values = lon_var.values
    if not np.any(lon_values[lon_size_05:] > 180.):
        return ds

    delta_lon = lon_values[1] - lon_values[0]

    var_names = list(ds.data_vars)

    ds = ds.assign_coords(lon=xr.DataArray(np.linspace(-180. + 0.5 * delta_lon,
                                                       +180. - 0.5 * delta_lon,
                                                       lon_size),
                                           dims=ds['lon'].dims,
                                           attrs=dict(long_name='longitude',
                                                      standard_name='longitude',
                                                      units='degrees east')))

    ds = adjust_spatial_attrs_impl(ds, True)

    new_vars = dict()
    for var_name in var_names:
        var = ds[var_name]
        if len(var.dims) >= 1 and var.dims[-1] == 'lon':
            values = np.copy(var.values)
            temp = np.copy(values[..., : lon_size_05])
            values[..., : lon_size_05] = values[..., lon_size_05:]
            values[..., lon_size_05:] = temp
            new_vars[var_name] = xr.DataArray(values, dims=var.dims, attrs=var.attrs, encoding=var.encoding)

    return ds.assign(**new_vars)
Example #6
def set_crs(dset: xr.Dataset, crs, coords=None, data_vars=None):
    grid_mapping, _ = _load_crs(dset, crs)
    dset = dset.assign({grid_mapping.name: grid_mapping})

    if coords is not None:
        dset = _add_geoattrs_to_coords(dset, grid_mapping, coords)

    if data_vars is not None:
        dset = dset.copy()
        for v in data_vars:
            dset.data_vars[v].attrs['grid_mapping'] = grid_mapping.name

    return dset
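`_load_crs` and `_add_geoattrs_to_coords` are internal helpers, so the function cannot be exercised here as-is. A hedged sketch of the CF pattern it implements, using pyproj (assumed installed) to build the grid-mapping attributes:

import numpy as np
import xarray as xr
from pyproj import CRS

dset = xr.Dataset(
    {"elevation": (("y", "x"), np.zeros((2, 3)))},
    coords={"y": [0.0, 1.0], "x": [0.0, 1.0, 2.0]},
)

# A scalar grid-mapping variable carrying the CF attributes of the CRS ...
dset = dset.assign(crs=xr.DataArray(0, attrs=CRS.from_epsg(32633).to_cf()))

# ... referenced from each data variable via its 'grid_mapping' attribute.
dset["elevation"].attrs["grid_mapping"] = "crs"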
Example #7
def _normalize_lon_360(ds: xr.Dataset) -> xr.Dataset:
    """
    Fix the longitude of the given dataset ``ds`` so that it ranges from -180 to +180 degrees.

    :param ds: The dataset whose longitudes may be given in the range 0 to 360.
    :return: The fixed dataset or the original dataset.
    """

    if 'lon' not in ds.coords:
        return ds

    lon_var = ds.coords['lon']

    if len(lon_var.shape) != 1:
        return ds

    lon_size = lon_var.shape[0]
    if lon_size < 2:
        return ds

    lon_size_05 = lon_size // 2
    lon_values = lon_var.values
    if not np.any(lon_values[lon_size_05:] > 180.):
        return ds

    delta_lon = lon_values[1] - lon_values[0]

    var_names = list(ds.data_vars)

    ds = ds.assign_coords(
        lon=xr.DataArray(np.linspace(-180. + 0.5 * delta_lon, +180. -
                                     0.5 * delta_lon, lon_size),
                         dims=ds['lon'].dims,
                         attrs=dict(long_name='longitude',
                                    standard_name='longitude',
                                    units='degrees east')))

    ds = adjust_spatial_attrs(ds, True)

    new_vars = dict()
    for var_name in var_names:
        var = ds[var_name]
        if 'lon' in var.dims:
            new_var = var.roll(lon=lon_size_05, roll_coords=False)
            new_var.encoding.update(var.encoding)
            new_vars[var_name] = new_var

    return ds.assign(**new_vars)
Example #8
def encode_cube(cube: xr.Dataset,
                grid_mapping: Optional[GridMapping] = None,
                non_cube_subset: Optional[xr.Dataset] = None) \
        -> xr.Dataset:
    """
    Encode a *cube* with its *grid_mapping*, and additional variables in
    *non_cube_subset* into a new dataset.

    This is the inverse of the operation :func:`decode_cube`::

        cube, gm, non_cube = decode_cube(dataset)
        dataset = encode_cube(cube, gm, non_cube)

    The returned data cube comprises all variables in *cube*,
    whose dimensions should be ("time", [...], y_dim_name, x_dim_name),
    where y_dim_name and x_dim_name are defined by *grid_mapping*, if
    given.
    If *grid_mapping* is not geographic, a new variable "crs" will
    be added that holds CF-compliant attributes encoding the
    cube's spatial CRS. *non_cube_subset*, if given, may be used
    to add non-cube variables to the resulting dataset.

    :param cube: data cube dataset, whose dimensions should
        be ("time", [...], y_dim_name, x_dim_name)
    :param grid_mapping: Optional grid mapping for *cube*.
    :param non_cube_subset: An optional dataset providing
        non-cube data variables.
    :return: The resulting dataset.
    """
    if non_cube_subset is not None:
        dataset = cube.assign(**non_cube_subset.data_vars)
    else:
        dataset = cube

    if grid_mapping is None:
        return dataset

    if grid_mapping.crs.is_geographic \
            and grid_mapping.is_regular \
            and grid_mapping.xy_dim_names == ('lon', 'lat') \
            and grid_mapping.xy_var_names == ('lon', 'lat'):
        # No need to add CRS variable
        return dataset

    return dataset.assign(crs=xr.DataArray(0, attrs=grid_mapping.crs.to_cf()))
Example #9
    def regional_with_and_without_flow(region):
        in_region_buses = n.buses.query('country == @region').index
        region_branches = branches.query('bus0 in @in_region_buses '
                                         'or bus1 in @in_region_buses')
        buses_i = (pd.Index(region_branches.bus0.unique())
                   | pd.Index(region_branches.bus1.unique()) | in_region_buses)
        vicinity_buses = buses_i.difference(in_region_buses)
        branches_i = region_branches.index

        K = Incidence(n, branch_components).loc[buses_i]
        # create regional injection pattern with nodal injection at the border
        # accounting for the cross border flow
        p = (K @ f)
        # p.loc[in_region_buses] ==
        #     network_injection(n, snapshots).loc[snapshots, in_region_buses].T

        # modified injection pattern without transition
        im = upper(p.loc[vicinity_buses])
        ex = lower(p.loc[vicinity_buses])

        largerImport_b = im.sum('bus') > -ex.sum('bus')
        scaleImport = (im.sum('bus') + ex.sum('bus')) / im.sum('bus')
        scaleExport = (im.sum('bus') + ex.sum('bus')) / ex.sum('bus')
        netImOrEx = (im * scaleImport).where(largerImport_b,
                                             (ex * scaleExport))
        p_wo = xr.concat([p.loc[in_region_buses], netImOrEx], dim='bus')\
                 .reindex(bus=buses_i).fillna(0)

        if 'Link' in branch_components:
            H = xr.concat((PTDF(n, branch_components, snapshot=sn)
                           for sn in snapshots), dim='snapshot')\
                  .sel(branch=branches_i)
            # f == H @ p
        else:
            H = PTDF(n, branch_components).sel(branch=branches_i)
        f_wo = H.reindex(bus=buses_i).dot(p_wo, 'bus')

        res = Dataset({'flow_with_transit': f.sel(branch=branches_i),
                       'flow_without_transit': f_wo})\
            .assign_coords(country=region)
        return res.assign(transit_flow=res.flow_with_transit -
                          res.flow_without_transit)
Example #10
def apply_data_var_remap(xds: xr.Dataset, var_name: str,
                         map_func) -> xr.Dataset:
    """Apply the map_func to the values in the given data_var"""
    import numpy as np

    def mb(array):
        vals = array.values
        newvals = np.empty(vals.shape, vals.dtype)
        if len(vals) > 0:
            newvals = np.vectorize(map_func, [vals.dtype])(vals)
        return xr.DataArray(data=newvals,
                            coords=array.coords,
                            dims=array.dims,
                            name=array.name,
                            attrs=array.attrs)

    assert isinstance(xds[var_name], xr.DataArray), (
        f"expected data variable {var_name} to be an xr.DataArray, "
        f"got {type(xds[var_name])}"
    )
    var_val = xds[var_name].map_blocks(mb)
    return xds.assign({var_name: var_val})
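A small usage sketch (assumes the function above is in scope and dask is installed for the chunked `map_blocks` path):

import numpy as np
import xarray as xr

xds = xr.Dataset({"counts": (("x",), np.array([1, 2, 3, 4]))}).chunk({"x": 2})

# Remap every value through a plain Python function, block by block.
remapped = apply_data_var_remap(xds, "counts", lambda v: v * 10)
print(remapped["counts"].compute().values)   # [10 20 30 40]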
Example #11
def insert_column_integrated_vars(
    ds: xr.Dataset, column_integrated_vars: Sequence[str]
) -> xr.Dataset:
    """Insert column integrated (<*>) terms,
    really a wrapper around vcm.calc.thermo funcs"""

    for var in column_integrated_vars:
        column_integrated_name = f"column_integrated_{var}"
        if "Q1" in var:
            da = vcm.column_integrated_heating_from_isochoric_transition(
                ds[var], ds[DELP]
            )
        elif "Q2" in var:
            da = -vcm.minus_column_integrated_moistening(ds[var], ds[DELP])
            da = da.assign_attrs(
                {"long_name": "column integrated moistening", "units": "mm/day"}
            )
        else:
            da = vcm.mass_integrate(ds[var], ds[DELP], dim="z")
        ds = ds.assign({column_integrated_name: da})

    return ds
Example #12
def add_covariates(ds: Dataset, npc: int = 20) -> Dataset:
    # See https://github.com/Nealelab/UK_Biobank_GWAS/blob/67289386a851a213f7bb470a3f0f6af95933b041/0.1/22.run_regressions.py#L71
    ds = (
        ds.assign(sample_age_at_recruitment_2=lambda ds:
                  ds["sample_age_at_recruitment"] ** 2)
          .assign(sample_sex_x_age=lambda ds:
                  ds["sample_genetic_sex"] * ds["sample_age_at_recruitment"])
          .assign(sample_sex_x_age_2=lambda ds:
                  ds["sample_genetic_sex"] * ds["sample_age_at_recruitment_2"])
    )
    covariates = np.column_stack([
        ds["sample_age_at_recruitment"].values,
        ds["sample_age_at_recruitment_2"].values,
        ds["sample_genetic_sex"].values,
        ds["sample_sex_x_age"].values,
        ds["sample_sex_x_age_2"].values,
        ds["sample_principal_component"].values[:, :npc],
    ])
    assert np.all(np.isfinite(covariates))
    ds["sample_covariate"] = xr.DataArray(covariates,
                                          dims=("samples", "covariates"))
    ds["sample_covariate"] = ds.sample_covariate.pipe(
        lambda x: (x - x.mean(dim="samples")) / x.std(dim="samples"))
    assert np.all(np.isfinite(ds.sample_covariate))
    return ds
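The function needs the full set of UK Biobank sample variables, so only the standardization step is sketched here on toy data:

import numpy as np
import xarray as xr

cov = xr.DataArray(
    np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]]),
    dims=("samples", "covariates"),
)
# Same per-covariate standardization as applied to "sample_covariate" above.
standardized = (cov - cov.mean(dim="samples")) / cov.std(dim="samples")
print(standardized.mean(dim="samples").values)   # approximately [0. 0.]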
Example #13
def process_SToF(dataset: xr.Dataset):
    """
    This isn't an ideal unit conversion because it doesn't properly account
    for the Jacobian of the coordinate conversion. That could be fixed by
    scaling each channel by the appropriate amount, but it may still be
    better to use the alternative method.

    :param dataset: The spin-ToF dataset to convert.
    :return: The dataset converted from time to kinetic energy, with the
        Sherman-function correction applied to the up/down channels when present.
    """
    e_min = dataset.attrs.get('E_min', 1)
    e_max = dataset.attrs.get('E_max', 10)
    de = dataset.attrs.get('dE', 0.01)
    ke_axis = np.linspace(e_min, e_max, int(round((e_max - e_min) / de)))

    dataset = transform_dataarray_axis(
        build_KE_coords_to_time_coords(dataset, ke_axis),
        'time',
        'eV',
        ke_axis,
        dataset,
        lambda x: x,
    )

    dataset = dataset.rename({'t_up': 'up', 't_down': 'down'})

    if 'up' in dataset.data_vars:
        # apply the sherman function corrections
        sherman = dataset.attrs.get('sherman', 0.2)
        polarization = 1 / sherman * (dataset.up - dataset.down) / (
            dataset.up + dataset.down)
        new_up = (dataset.up + dataset.down) * (1 + polarization)
        new_down = (dataset.up + dataset.down) * (1 - polarization)
        dataset = dataset.assign(up=new_up, down=new_down)

    return dataset
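`transform_dataarray_axis` and `build_KE_coords_to_time_coords` are PyARPES internals, so only the Sherman-function correction step is sketched below on toy up/down spectra:

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {
        "up": ("eV", np.array([10.0, 12.0, 15.0, 12.0, 10.0])),
        "down": ("eV", np.array([8.0, 9.0, 10.0, 9.0, 8.0])),
    },
    coords={"eV": np.linspace(1.0, 5.0, 5)},
)

sherman = 0.2    # detector Sherman function
polarization = 1 / sherman * (ds.up - ds.down) / (ds.up + ds.down)
corrected = ds.assign(
    up=(ds.up + ds.down) * (1 + polarization),
    down=(ds.up + ds.down) * (1 - polarization),
)
print(corrected.up.values[0])   # ~28.0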
Example #14
def assign_net_physics_terms(ds: xr.Dataset) -> xr.Dataset:
    net_terms: Mapping[Hashable, Any] = {
        "net_heating": net_heating_from_physics(ds),
        "net_precipitation": net_precipitation_from_physics(ds),
    }
    return ds.assign(net_terms)
Example #15
def resample_in_space(dataset: xr.Dataset,
                      source_gm: GridMapping = None,
                      target_gm: GridMapping = None,
                      var_configs: Mapping[Hashable, Mapping[str, Any]] = None):
    """
    Resample a dataset in the spatial dimensions.

    If the source grid mapping *source_gm* is not given,
    it is derived from *dataset*:
    ``source_gm = GridMapping.from_dataset(dataset)``.

    If the target grid mapping *target_gm* is not given,
    it is derived from *source_gm*:
    ``target_gm = source_gm.to_regular()``.

    If *source_gm* is almost equal to *target_gm*, this
    function is a no-op and *dataset* is returned unchanged.

    Otherwise the function computes a spatially
    resampled version of *dataset* and returns it.

    Using *var_configs*, the resampling of individual
    variables can be configured. If given, *var_configs*
    must be a mapping from variable names to configuration
    dictionaries which can have the following properties:

    * ``spline_order`` (int) - The order of spline polynomials
        used for interpolating. It is used for upsampling only.
        Possible values are 0 to 5.
        Default is 1 (bi-linear) for floating point variables,
        and 0 (= nearest neighbor) for integer and bool variables.
    * ``aggregator`` (str) - An optional aggregating
        function. It is used for downsampling only.
        Examples are numpy.nanmean, numpy.nanmin, numpy.nanmax.
        Default is numpy.nanmean for floating point variables,
        and None (= nearest neighbor) for integer and bool variables.
    * ``recover_nan`` (bool) - whether a special algorithm
        shall be used that is able to recover values that would
        otherwise yield NaN during resampling.
        Default is True for floating point variables,
        and False for integer and bool variables.

    Note that *var_configs* is only used if the resampling involves
    an affine transformation. This is true if the CRS of
    *source_gm* and CRS of *target_gm* are equal and one of two
    cases is given:

    1. *source_gm* is regular.
       In this case the resampling is the affine transformation
       and the result is returned directly.
    2. *source_gm* is not regular and has a lower resolution
       than *target_gm*.
       In this case *dataset* is downsampled first using an affine
       transformation. Then the result is rectified.

    In all other cases, no affine transformation is applied and
    the resampling is a direct rectification.

    :param dataset: The source dataset.
    :param source_gm: The source grid mapping.
    :param target_gm: The target grid mapping. Must be regular.
    :param var_configs: Optional resampling configurations
        for individual variables.
    :return: The spatially resampled dataset.
    """
    if source_gm is None:
        # No source grid mapping given, so derive it from the dataset
        source_gm = GridMapping.from_dataset(dataset)

    if target_gm is None:
        # No target grid mapping given, so derive it from the source grid mapping
        target_gm = source_gm.to_regular()

    if source_gm.is_close(target_gm):
        # If source and target grid mappings are almost equal
        return dataset

    # target_gm must be regular
    GridMapping.assert_regular(target_gm, name='target_gm')

    # Are source and target both geographic grid mappings?
    both_geographic = source_gm.crs.is_geographic \
                      and target_gm.crs.is_geographic

    if both_geographic or source_gm.crs == target_gm.crs:
        # If CRSes are both geographic or their CRSes are equal:
        if source_gm.is_regular:
            # If also the source is regular, then resampling reduces
            # to an affine transformation.
            return affine_transform_dataset(
                dataset,
                source_gm=source_gm,
                target_gm=target_gm,
                var_configs=var_configs,
            )

        # If the source is not regular, we need to rectify it,
        # so the target is regular. Our rectification implementation
        # works only correctly if source pixel size >= target pixel
        # size. Therefore check if we must downscale source first.
        x_scale = source_gm.x_res / target_gm.x_res
        y_scale = source_gm.y_res / target_gm.y_res
        if x_scale > _SCALE_LIMIT and y_scale > _SCALE_LIMIT:
            # Source pixel size >= target pixel size.
            # We can rectify.
            return rectify_dataset(dataset,
                                   source_gm=source_gm,
                                   target_gm=target_gm)

        # Source has higher resolution than target.
        # Downscale first, then rectify
        if source_gm.is_regular:
            # If source is regular
            downscaled_gm = source_gm.scale((x_scale, y_scale))
            downscaled_dataset = resample_dataset(
                dataset,
                ((x_scale, 1, 0), (1, y_scale, 0)),
                size=downscaled_gm.size,
                tile_size=source_gm.tile_size,
                xy_dim_names=source_gm.xy_dim_names,
                var_configs=var_configs,
            )
        else:
            _, downscaled_size = scale_xy_res_and_size(source_gm.xy_res,
                                                       source_gm.size,
                                                       (x_scale, y_scale))
            downscaled_dataset = resample_dataset(
                dataset,
                ((x_scale, 1, 0), (1, y_scale, 0)),
                size=downscaled_size,
                tile_size=source_gm.tile_size,
                xy_dim_names=source_gm.xy_dim_names,
                var_configs=var_configs,
            )
            downscaled_gm = GridMapping.from_dataset(
                downscaled_dataset,
                tile_size=source_gm.tile_size,
                prefer_crs=source_gm.crs)
        return rectify_dataset(downscaled_dataset,
                               source_gm=downscaled_gm,
                               target_gm=target_gm)

    # If CRSes are not both geographic and their CRSes are different
    # transform the source_gm so its CRS matches the target CRS:
    transformed_source_gm = source_gm.transform(target_gm.crs)
    transformed_x, transformed_y = transformed_source_gm.xy_coords
    reprojected_dataset = resample_in_space(dataset.assign(
        transformed_x=transformed_x, transformed_y=transformed_y),
                                            source_gm=transformed_source_gm,
                                            target_gm=target_gm)
    if not target_gm.crs.is_geographic:
        # Add 'crs' variable according to CF conventions
        reprojected_dataset = reprojected_dataset.assign(
            crs=xr.DataArray(0, attrs=target_gm.crs.to_cf()))
    return reprojected_dataset
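A hedged sketch of the `var_configs` mapping described in the docstring (the variable names "sst" and "quality_flag" are made up; the commented-out call assumes `dataset` and `target_gm` already exist):

import numpy as np

var_configs = {
    "sst": {
        "spline_order": 3,         # cubic interpolation when upsampling
        "aggregator": np.nanmean,  # mean aggregation when downsampling
        "recover_nan": True,
    },
    "quality_flag": {
        "spline_order": 0,         # nearest neighbour for integer flags
        "aggregator": None,
        "recover_nan": False,
    },
}

# resampled = resample_in_space(dataset, target_gm=target_gm, var_configs=var_configs)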
Example #16
def process_dataset_function(dataset: xr.Dataset,
                             name: str = None,
                             value: int = None):
    dataset = dataset.copy()
    return dataset.assign(**{name: value})
Example #17
def to(ds: Dataset) -> Dataset:
    """Convert to genotype counts"""
    data = _mask(ds.assign(data=(ds.data > 0).sum(dim=DIM_PLOIDY)))
    return _transmute(GenotypeCountDataset.create, ds, data)
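A standalone sketch of the genotype-count computation on toy diploid calls (`_mask`, `_transmute` and `DIM_PLOIDY` are project internals; here the ploidy dimension is simply called "ploidy"):

import numpy as np
import xarray as xr

calls = xr.DataArray(
    np.array([[[0, 1], [1, 1]],
              [[0, 0], [0, 1]]]),
    dims=("variants", "samples", "ploidy"),
)
# Number of alternate alleles per call, as in `(ds.data > 0).sum(dim=DIM_PLOIDY)`.
alt_counts = (calls > 0).sum(dim="ploidy")
print(alt_counts.values)
# [[1 2]
#  [0 1]]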
Example #18
def my_postprocessor(ds: xr.Dataset) -> xr.Dataset:
    return ds.assign(crs=xr.DataArray(42))
Example #19
def to(ds: Dataset, contig: int) -> Dataset:
    """Convert to haplotype calls"""
    # FIXME: nonsense for testing
    data = _mask(ds.assign(data=ds.data[..., contig]))
    return _transmute(HaplotypeCallDataset.create, ds, data)
Example #20
def standardize_coords(
    ds: xr.Dataset,
    time_shift=-timedelta(minutes=7, seconds=30)) -> xr.Dataset:
    ds_shifted = ds.assign(time=ds.time + time_shift)
    return gfdl_to_standard(ds_shifted).drop("tile")