Example #1
def open_dataset(path, flavour_name='ecmwf', **kwargs):
    # Split the keyword arguments: 'encode_*' options configure the GRIB
    # store itself; everything else is forwarded to xarray's open_dataset.
    overrides = {}
    for k in list(kwargs):  # copy the keys so .pop() is safe while iterating
        if k.startswith('encode_'):
            overrides[k] = kwargs.pop(k)
    store = GribDataStore.frompath(path, flavour_name=flavour_name, **overrides)
    return _open_dataset(store, **kwargs)
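
A minimal call sketch for context. The GRIB file name and the encode_* keyword below are illustrative assumptions, not options confirmed by this snippet; the point is that any encode_-prefixed keyword is routed to the store while the rest reaches xarray:

ds = open_dataset('era5.grib', flavour_name='ecmwf',
                  encode_time=False,    # hypothetical encode_* override, consumed by the store
                  chunks={'time': 10})  # everything else is forwarded to xarray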
Example #2
def open_dataset(path,
                 flavour_name='ecmwf',
                 filter_by_keys=None,
                 errors='ignore',
                 **kwargs):
    # Use None instead of a mutable {} default and substitute here.
    overrides = {
        'flavour_name': flavour_name,
        'filter_by_keys': filter_by_keys if filter_by_keys is not None else {},
        'errors': errors,
    }
    for k in list(kwargs):  # copy the keys so .pop() is safe while iterating
        if k.startswith('encode_'):
            overrides[k] = kwargs.pop(k)
    store = GribDataStore.frompath(path, **overrides)
    return _open_dataset(store, **kwargs)
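
The routing is the same here, with the common store options promoted to explicit parameters. A hedged usage sketch (the file name and GRIB key values are illustrative):

ds = open_dataset('forecast.grib',
                  filter_by_keys={'typeOfLevel': 'surface'},  # select only surface-level messages
                  errors='ignore')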
Example #3
def load_dataset(
    name: Optional[str] = None,
    cache: bool = True,
    cache_dir: str = _default_cache_dir,
    github_url: str = "https://github.com/pangeo-data/climpred-data",
    branch: str = "master",
    extension: Optional[str] = None,
    proxy_dict: Optional[Dict[str, str]] = None,
    **kws,
) -> xr.Dataset:
    """Load example data or a mask from an online repository.

    Args:
        name: Name of the netcdf file containing the
              dataset, without the ``.nc`` extension. If ``None``, this function
              prints out the available datasets to import.
        cache: If ``True``, cache data locally for use on later calls.
        cache_dir: The directory in which to search for and cache the data.
        github_url: Github repository where the data is stored.
        branch: The git branch to download from.
        extension: Subfolder within the repository where the data is stored.
        proxy_dict: Dictionary with keys as either "http" or "https" and values as the
            proxy server. This is useful if you are on a work computer behind a
            firewall and need to go through a proxy to download data.
        kws: Keywords passed to :py:func:`~xarray.open_dataset`.

    Returns:
        The desired :py:class:`xarray.Dataset`.

    Examples:
        >>> from climpred.tutorial import load_dataset
        >>> proxy_dict = {"http": "127.0.0.1"}
        >>> ds = load_dataset("FOSI-SST", cache=False, proxy_dict=proxy_dict)
    """
    if name is None:
        return _get_datasets()

    if proxy_dict is not None:
        _initialize_proxy(proxy_dict)

    # https://stackoverflow.com/questions/541390/extracting-extension-from-
    # filename-in-python
    # Allows for generalized file extensions.
    name, ext = _os.path.splitext(name)
    if not ext.endswith(".nc"):
        ext += ".nc"

    # use aliases
    if name in FILE_ALIAS_DICT:
        name = FILE_ALIAS_DICT[name]
    longdir = _os.path.expanduser(cache_dir)
    fullname = name + ext
    localfile = _os.sep.join((longdir, fullname))
    md5name = name + ".md5"
    md5file = _os.sep.join((longdir, md5name))

    if not _os.path.exists(localfile):
        # This will always leave this directory on disk.
        # May want to add an option to remove it.
        if not _os.path.isdir(longdir):
            _os.mkdir(longdir)

        if extension is not None:
            url = "/".join((github_url, "raw", branch, extension, fullname))
            _urlretrieve(url, localfile)
            url = "/".join((github_url, "raw", branch, extension, md5name))
            _urlretrieve(url, md5file)
        else:
            url = "/".join((github_url, "raw", branch, fullname))
            _urlretrieve(url, localfile)
            url = "/".join((github_url, "raw", branch, md5name))
            _urlretrieve(url, md5file)

        localmd5 = _file_md5_checksum(localfile)
        with open(md5file, "r") as f:
            remotemd5 = f.read()
        if localmd5 != remotemd5:
            _os.remove(localfile)
            msg = """
            MD5 checksum mismatch: the checksum of the downloaded file does
            not match the one recorded in the remote repository, so the
            local copy has been removed. Try downloading the file again.
            """
            raise IOError(msg)

    ds = _open_dataset(localfile, **kws)

    if not cache:
        ds = ds.load()
        _os.remove(localfile)
    return ds
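
A short usage sketch, relying only on the behavior the docstring documents:

# With no name, the function returns the list of available datasets.
load_dataset()

# Download and cache one of them; 'FOSI-SST' is the name from the docstring example.
ds = load_dataset('FOSI-SST')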
Example #4
def open_dataset(name,
                 cache=True,
                 cache_dir=_default_cache_dir,
                 github_url='https://github.com/bradyrx/climpred',
                 branch='master',
                 extension='sample_data/prediction',
                 **kws):
    """Load example data or a mask from an online repository.

    This is a function from `xarray.tutorial` to load an online dataset
    with minimal package imports. I am copying it here because it looks like
    it will soon be deprecated. Also, I've added the ability to point to
    data files that are not in the main folder of the repo (i.e., they are
    in subfolders).

    Note that this requires an accompanying .md5 file for each dataset.
    See the github repo bradyrx/climdata for a python script that
    generates .md5 checksum files from .nc files.

    Args:
        name: (str) Name of the netcdf file containing the dataset, without
              the .nc extension.
        cache: (bool, optional) If True, cache data locally for use on later
               calls.
        cache_dir: (str, optional) The directory in which to search
                   for and cache the data.
        github_url: (str, optional) Github repository where the data is stored.
        branch: (str, optional) The git branch to download from.
        extension: (str, optional) Subfolder within the repository where the
                   data is stored.
        kws: (dict, optional) Keywords passed to xarray.open_dataset

    Returns:
        The desired xarray dataset.
    """
    if name.endswith('.nc'):
        name = name[:-3]
    # use aliases
    if name in file_alias_dict:
        name = file_alias_dict[name]
    longdir = _os.path.expanduser(cache_dir)
    fullname = name + '.nc'
    localfile = _os.sep.join((longdir, fullname))
    md5name = name + '.md5'
    md5file = _os.sep.join((longdir, md5name))

    if not _os.path.exists(localfile):
        # This will always leave this directory on disk.
        # May want to add an option to remove it.
        if not _os.path.isdir(longdir):
            _os.mkdir(longdir)

        if extension is not None:
            url = '/'.join((github_url, 'raw', branch, extension, fullname))
            _urlretrieve(url, localfile)
            url = '/'.join((github_url, 'raw', branch, extension, md5name))
            _urlretrieve(url, md5file)
        else:
            url = '/'.join((github_url, 'raw', branch, fullname))
            _urlretrieve(url, localfile)
            url = '/'.join((github_url, 'raw', branch, md5name))
            _urlretrieve(url, md5file)

        localmd5 = _file_md5_checksum(localfile)
        with open(md5file, 'r') as f:
            remotemd5 = f.read()
        if localmd5 != remotemd5:
            _os.remove(localfile)
            msg = """
            MD5 checksum does not match; try downloading the dataset again.
            """
            raise IOError(msg)

    ds = _open_dataset(localfile, **kws)

    if not cache:
        ds = ds.load()
        _os.remove(localfile)

    return ds
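
The _file_md5_checksum helper is referenced but never shown in these examples. A minimal sketch of how such a checksum (and the .md5 sidecar the docstring mentions) could be produced with only the standard library; the function name and chunk size are assumptions, not the repo's actual script:

import hashlib

def file_md5_checksum(path):
    # Hash in chunks so large .nc files are never read into memory at once.
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            md5.update(chunk)
    return md5.hexdigest()

# Write the sidecar next to the dataset so the loader can verify it:
# with open('FOSI-SST.md5', 'w') as f:
#     f.write(file_md5_checksum('FOSI-SST.nc'))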
Example #5
def load_dataset(
    name=None,
    cache=True,
    cache_dir=_default_cache_dir,
    github_url='https://github.com/bradyrx/climpred-data',
    branch='master',
    extension=None,
    proxy_dict=None,
    **kws,
):
    """Load example data or a mask from an online repository.

    Args:
        name: (str, default None) Name of the netcdf file containing the
              dataset, without the .nc extension. If None, this function
              prints out the available datasets to import.
        cache: (bool, optional) If True, cache data locally for use on later
               calls.
        cache_dir: (str, optional) The directory in which to search
                   for and cache the data.
        github_url: (str, optional) Github repository where the data is stored.
        branch: (str, optional) The git branch to download from.
        extension: (str, optional) Subfolder within the repository where the
                   data is stored.
        proxy_dict: (dict, optional) Dictionary with keys as either 'http' or
                    'https' and values as the proxy server. This is useful
                    if you are on a work computer behind a firewall and need
                    to go through a proxy to download data.
        kws: (dict, optional) Keywords passed to xarray.open_dataset

    Returns:
        The desired xarray dataset.

    Examples:
        >>> from climpred.tutorial import load_dataset
        >>> proxy_dict = {'http': '127.0.0.1'}
        >>> ds = load_dataset('FOSI-SST', cache=False, proxy_dict=proxy_dict)
    """
    if name is None:
        return _get_datasets()

    if proxy_dict is not None:
        _initialize_proxy(proxy_dict)

    # https://stackoverflow.com/questions/541390/extracting-extension-from-
    # filename-in-python
    # Allows for generalized file extensions.
    name, ext = _os.path.splitext(name)
    if not ext.endswith('.nc'):
        ext += '.nc'

    # use aliases
    if name in FILE_ALIAS_DICT:
        name = FILE_ALIAS_DICT[name]
    longdir = _os.path.expanduser(cache_dir)
    fullname = name + ext
    localfile = _os.sep.join((longdir, fullname))
    md5name = name + '.md5'
    md5file = _os.sep.join((longdir, md5name))

    if not _os.path.exists(localfile):
        # This will always leave this directory on disk.
        # May want to add an option to remove it.
        if not _os.path.isdir(longdir):
            _os.mkdir(longdir)

        if extension is not None:
            url = '/'.join((github_url, 'raw', branch, extension, fullname))
            _urlretrieve(url, localfile)
            url = '/'.join((github_url, 'raw', branch, extension, md5name))
            _urlretrieve(url, md5file)
        else:
            url = '/'.join((github_url, 'raw', branch, fullname))
            _urlretrieve(url, localfile)
            url = '/'.join((github_url, 'raw', branch, md5name))
            _urlretrieve(url, md5file)

        localmd5 = _file_md5_checksum(localfile)
        with open(md5file, 'r') as f:
            remotemd5 = f.read()
        if localmd5 != remotemd5:
            _os.remove(localfile)
            msg = """
            MD5 checksum mismatch: the checksum of the downloaded file does
            not match the one recorded in the remote repository, so the
            local copy has been removed. Try downloading the file again.
            """
            raise IOError(msg)

    ds = _open_dataset(localfile, **kws)

    if not cache:
        ds = ds.load()
        _os.remove(localfile)
    return ds
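
The _initialize_proxy helper is likewise referenced but not shown. One plausible implementation, assuming it installs a global opener so the later _urlretrieve calls are routed through the proxy; this is a sketch under that assumption, not the library's confirmed code:

import urllib.request

def initialize_proxy(proxy_dict):
    # proxy_dict maps scheme to proxy address, e.g. {'http': '127.0.0.1'},
    # matching the docstring example above.
    handler = urllib.request.ProxyHandler(proxy_dict)
    opener = urllib.request.build_opener(handler)
    urllib.request.install_opener(opener)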