Example #1
def log_to_file(fname):
    """
    Will append the given file path to the logger so that stdout
    and the file will be the output streams for the current logger
    """
    import logging
    from pathlib import Path as posixpath

    fname = posixpath(fname)
    fname.parent.mkdir(exist_ok=True, parents=True)

    logger = logging.getLogger("fetch_data")

    # remove existing file handlers (iterate over a copy so removal is safe)
    for handler in list(logger.handlers):
        if isinstance(handler, logging.FileHandler):
            logger.removeHandler(handler)

    # add the new logger with the formatting
    logFormatter = logging.Formatter("%(asctime)s [%(name)s]  %(message)s",
                                     datefmt="%Y-%m-%d %H:%M:%S")
    fileHandler = logging.FileHandler(fname)
    fileHandler.setFormatter(logFormatter)
    logger.addHandler(fileHandler)

    logging.info("=" * 80)
    logging.info("Start of logging session")
Example #2
def reccap2_ocean_masks(url, dest):
    """
    Download the RECCAP2 ocean region mask file from `url` into `dest` and
    open it as an xarray Dataset.
    """
    from pathlib import Path as posixpath
    import pooch
    import xarray as xr
    
    fname = pooch.retrieve(url, None, posixpath(url).name, dest)
    ds = xr.open_dataset(fname)
    
    return ds
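A usage sketch; the URL points at the RECCAP2 mask file used again in Example #4, and ../data/regions/ is assumed to be a writable destination.

url = ("https://github.com/RECCAP2-ocean/shared-resources/raw/master/"
       "regions/RECCAP2_region_masks_all.nc")
ds = reccap2_ocean_masks(url, "../data/regions/")  # downloaded with pooch, opened with xarray
print(ds)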
Example #3
def get_cache_path(url, cache_dir=None):
    """
    Creates the path for the cache used to store remote file names
    Saves time in updating the
    """
    import hashlib
    import tempfile
    from pathlib import Path as posixpath

    if cache_dir is None:
        cache_dir = tempfile.gettempdir()

    cache_fname = hashlib.md5(str(url).encode()).hexdigest()
    cache_path = posixpath(f"{cache_dir}/{cache_fname}")

    return cache_path
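A small sketch showing that the cache path is deterministic: the same URL always maps to the same MD5-named file under the temp directory (or under cache_dir when given). The FTP URL is illustrative.

p1 = get_cache_path("ftp://example.org/data/*.nc")   # illustrative URL
p2 = get_cache_path("ftp://example.org/data/*.nc")
assert p1 == p2                                       # same URL -> same cache file
print(p1)                                             # e.g. /tmp/<md5-of-url>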
Example #4
def _get_southern_ocean_subregions(
        url='https://github.com/RECCAP2-ocean/shared-resources/raw/master/regions/RECCAP2_region_masks_all.nc',
        dest='../data/regions/'):
    """
    Build Southern Ocean subregion masks (basin x biome) from the RECCAP2
    region mask file.
    """
    import pooch
    import xarray as xr
    import pandas as pd
    from pathlib import Path as posixpath
    import itertools

    fname = pooch.retrieve(url, None, posixpath(url).name, dest)
    ds = xr.open_dataset(fname)

    mask = ds.southern

    # split the Southern Ocean mask into basins by longitude bands;
    # each basin gets an integer label (Atlantic=1, Indian=2, Pacific=3)
    atlantic = (((mask.lon > 290) | (mask.lon <= 20)) &
                (mask > 0)).astype(int) * 1
    indian = (((mask.lon > 20) & (mask.lon <= 147)) &
              (mask > 0)).astype(int) * 2
    pacific = (((mask.lon > 147) & (mask.lon <= 290)) &
               (mask > 0)).astype(int) * 3

    mask = xr.Dataset()
    mask['biomes'] = ds.southern.copy()
    mask['basins'] = (pacific + atlantic + indian).transpose('lat', 'lon')

    # combine basin (1-3) and biome (1-3) labels into a single subregion
    # index (1-9); cells outside the Southern Ocean are set to 0
    mask['subregions'] = (mask.basins * 3 + mask.biomes -
                          3).where(lambda a: a > 0).fillna(0).astype(int)

    basin = ['ATL', 'IND', 'PAC']
    biome = ['STSS', 'SPSS', 'ICE']
    names = ['-'.join(l) for l in itertools.product(basin, biome)]
    mask['names'] = xr.DataArray(names,
                                 coords={'idx': range(1, 10)},
                                 dims=('idx'))
    mask['names'].attrs['description'] = 'Names for the subregions'

    mask['subregions'].attrs['description'] = '(basins * 3 + biomes - 3)'
    mask['basins'].attrs[
        'description'] = 'Atlantic = 1, Indian = 2, Pacific = 3'
    mask['biomes'].attrs[
        'description'] = 'Biomes based on Fay and McKinley (2014), STSS=1, SPSS=2, ICE=3'
    mask.attrs['source'] = url
    mask.attrs['date'] = pd.Timestamp.today().strftime('%Y-%m-%d')
    return mask
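A usage sketch; the defaults already point at the RECCAP2 mask file, so the function can be called without arguments, assuming the default destination folder is writable and matplotlib is available for the quick-look plot.

mask = _get_southern_ocean_subregions()   # uses the default URL and destination
print(mask.names.values)                  # ['ATL-STSS', 'ATL-SPSS', ..., 'PAC-ICE']
mask.subregions.plot()                    # quick-look map of the nine subregions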
Example #5
def create_download_readme(fname, **entry):
    """
    Creates a README file based on the information in the source dictionary.

    Parameters
    ----------
    name: str
        name to which file will be written
    **entry: kwargs
        must contain
    """
    from pathlib import Path as posixpath

    from .utils import make_readme_file, commong_substring

    dest = entry.get("dest")

    # readme will always be overwritten
    readme_fname = posixpath(f"{dest}/{fname}")

    readme_fname.parent.mkdir(parents=True, exist_ok=True)

    url = entry.get("url", None)
    if isinstance(url, (list, tuple)):
        url = commong_substring(url) + "..."

    readme_text = make_readme_file(
        entry.get("name", ""),
        url,
        entry.get("meta", {}),
        short_info_len_limit=max(120, len(url or "")),
    )

    with open(readme_fname, "w") as file:
        file.write(readme_text)
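A hedged usage sketch; the entry keys mirror what the function reads (dest, name, url, meta), all values are illustrative, and make_readme_file / commong_substring are assumed to be provided by the package's utils module as imported above.

entry = {
    "dest": "../data/example",                    # where the README will be written
    "name": "Example dataset",                    # illustrative metadata
    "url": "https://example.org/data/files.zip",  # illustrative URL
    "meta": {"doi": "10.0000/example"},           # illustrative DOI
}
create_download_readme("README.txt", **entry)     # writes ../data/example/README.txt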
Example #6
def log_to_file(fname):
    """
    Attach a file handler for the given file path to the root logger so that
    all log messages are also written to the file.
    """
    import logging
    from pathlib import Path as posixpath

    fname = posixpath(fname)
    fname.parent.mkdir(exist_ok=True, parents=True)

    rootLogger = logging.getLogger()

    # remove existing file handlers (iterate over a copy so removal is safe)
    for handler in list(rootLogger.handlers):
        if isinstance(handler, logging.FileHandler):
            rootLogger.removeHandler(handler)

    # add the new logger with the formatting
    logFormatter = logging.Formatter("%(asctime)s [DOWNLOAD]  %(message)s",
                                     datefmt="%Y-%m-%d %H:%M:%S")
    fileHandler = logging.FileHandler(fname)
    fileHandler.setFormatter(logFormatter)
    rootLogger.addHandler(fileHandler)

    logging.info("=" * 80 + "\n" * 2)
    logging.info("Start of logging session")
    logging.info("-" * 80)
Example #7
def create_download_readme(**source_dict):
    import inspect
    from pathlib import Path as posixpath

    from .utils import make_readme_file

    dest = source_dict.get("dest").format_map(source_dict)
    # NOTE: the cache and readme file names are not defined in this snippet;
    # they are assumed here to be provided by the source dictionary
    cache = source_dict.get("cache", "_urls.cache")
    readme = source_dict.get("readme", "README.txt")
    cache_fname = f"{source_dict.get('dest')}/{cache}"
    manipulation = inspect.cleandoc(f"""
    Data has been downloaded directly from the server shown in URL.
    There has been no modification to the original files.
    There may be a data cache located in the annual subfolders of each
    with the format {cache_fname.replace('//', '/')}
    """)

    args = [
        source_dict.get("name", ''),
        source_dict.get("meta", {}).get("doi", None),
        source_dict.get("url", None),
        source_dict.get("meta", {}).get("citation", None),
        source_dict.get("meta", {}).get("description", None),
        source_dict.get("variables", []),
        manipulation,
    ]

    readme_fname = posixpath(f"{dest}/{readme}")
    readme_fname.parent.mkdir(parents=True, exist_ok=True)

    email = source_dict.get("email", None)
    download_logging = source_dict.get("download_logging", "None")

    readme_text = make_readme_file(*args,
                                   email=email,
                                   download_logging=download_logging)

    with open(readme_fname, "w") as file:
        file.write(readme_text)
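For reference, a sketch of the dictionary shape this function expects, inferred from the keys it reads; all values are illustrative, and the call assumes the surrounding package provides make_readme_file.

source_dict = {
    "name": "Example dataset",
    "dest": "../data/{name}",                  # format_map is applied with the dict itself
    "url": "https://example.org/data/*.nc",    # illustrative URL
    "meta": {"doi": None, "citation": None, "description": None},
    "variables": ["temperature", "salinity"],  # illustrative variable names
    "email": None,
    "download_logging": "None",
}
create_download_readme(**source_dict)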
Example #8
def get_url_list(
    url,
    username=None,
    password=None,
    use_cache=True,
    cache_path="./_urls_{hash}.cache",
    **kwargs,
):
    """If a url has a wildcard (*) value, remote files will be searched.

    Leverages the `fsspec` package. This doesn't work for all HTTP urls.

    Parameters:
        url (str): If a url has a wildcard (*) value, remote files will be
            searched for
        username (str): if required for given url and protocol (e.g. FTP)
        password (str): if required for given url and protocol (e.g. FTP)
        cache_path (str): the path where the cached files will be stored. Has a
            special case where `{hash}` will be replaced with a hash based on
            the URL.
        use_cache (bool): if there is a file with cached remote urls, then
            those values will be returned as a list

    Returns:
        list: a sorted list of urls
    """
    import logging
    from pathlib import Path as posixpath
    from urllib.parse import urlparse

    import fsspec

    from .utils import make_hash_string

    logger = logging.getLogger(__name__)

    if "*" not in url:
        return [url]

    if "{hash}" in cache_path:
        cache_path = cache_path.format(hash=make_hash_string(url))

    if use_cache:
        cache_path = posixpath(cache_path)
        if cache_path.is_file():
            with open(cache_path, "r") as file:
                flist = file.read().split("\n")
            logger.log(
                15,
                f"Fetched {len(flist)} files from flist cache: {cache_path}")
            return sorted(flist)

    purl = urlparse(url)
    protocol = purl.scheme
    host = purl.netloc
    path = purl.path

    logger.log(15, f"Fetching filenames from {url}")

    props = {"protocol": protocol}
    if not protocol.startswith("http"):
        props.update({"host": host})
    if username is not None:
        props["username"] = username
    if password is not None:
        props["password"] = password

    fs = fsspec.filesystem(**props)
    if protocol.startswith("http"):
        path = f"{protocol}://{host}/{path}"

    try:
        flist = fs.glob(path)
    except AttributeError:
        raise FileNotFoundError(f"The given url does not exist: {url}")
    except TypeError:
        raise KeyError(
            f"The host {protocol}://{host} does not accept username/password")

    if not protocol.startswith("https"):
        flist = [f"{protocol}://{host}{f}" for f in fs.glob(path)]

    # writing url list to cache file
    if use_cache:
        cache_path.parent.mkdir(exist_ok=True, parents=True)
        with open(cache_path, "w") as out_file:
            out_file.write("\n".join(flist))

    logger.log(15, f"Cached {len(flist)} urls to: {cache_path}")
    logging.debug(flist)

    return sorted(flist)
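A usage sketch with a wildcard FTP URL (illustrative host): matching remote files are listed via fsspec and the listing is cached to a local file whose name includes a hash of the URL.

urls = get_url_list(
    "ftp://ftp.example.org/ocean/2020/*.nc",   # illustrative wildcard URL
    username="anonymous",
    password="guest@example.org",
    cache_path="./_urls_{hash}.cache",         # {hash} is replaced with a hash of the URL
)
print(len(urls), "files found")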
Example #9
def get_url_list(
    url,
    username=None,
    password=None,
    cache_path=None,
    use_cache=True,
    raise_on_empty=True,
):
    """
    If a url has a wildcard (*) value, remote files will be searched for.
    Leverages off the `fsspec` package. This doesnt work for all HTTP urls.
    Parameters
    ----------
    username: str
        if required for given url and protocol (e.g. FTP)
    password: str
        if required for given url and protocol (e.g. FTP)
    cache_path: str
        the path where the cached files will be stored
    use_cache: bool
        if there is a file with cached remote urls, then those
        values will be returned as a list
    raise_on_empty: bool
        if there are no files, raise an error or silently pass
    Returns
    -------
    a sorted list of urls
    """
    import logging
    from pathlib import Path as posixpath
    from urllib.parse import urlparse

    import fsspec
    from aiohttp import ClientResponseError
    from pandas import Series, read_csv

    if cache_path is None:
        cache_path = get_cache_path(url)
    else:
        cache_path = posixpath(cache_path)

    if cache_path.is_file() and use_cache:
        flist = read_csv(str(cache_path), index_col=False).iloc[:, 0].to_list()
        logging.log(
            15, f"Fetched {len(flist)} files from flist cache: {cache_path}")
        logging.debug(flist)

        return sorted(flist)

    purl = urlparse(url)
    protocol = purl.scheme
    host = purl.netloc
    path = purl.path

    logging.log(15, f"Fetching filenames from {url}")

    props = {"protocol": protocol}
    if not protocol.startswith("http"):
        props.update({"host": host})
    if username is not None:
        props["username"] = username
    if password is not None:
        props["password"] = password

    fs = fsspec.filesystem(**props)
    if protocol.startswith("http"):
        path = f"{protocol}://{host}/{path}"
        try:
            flist = fs.glob(path)
        except ClientResponseError:
            if raise_on_empty:
                raise ValueError(f"No files could be found for the url: {url}")
            else:
                return []
    else:
        flist = [f"{protocol}://{host}{f}" for f in fs.glob(path)]

    no_files = len(flist) == 0
    if no_files and raise_on_empty:
        raise ValueError(f"No files could be found for the url: {url}")

    if no_files and not use_cache:
        return flist

    cache_path.parent.mkdir(exist_ok=True, parents=True)
    # writing url list to cache file
    Series(flist, dtype="str").to_csv(str(cache_path), index=False)
    logging.log(15, f"Cached {len(flist)} urls to: {cache_path}")
    logging.debug(flist)

    return sorted(flist)
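A usage sketch; with raise_on_empty=False an empty match returns an empty list instead of raising, and the listing is cached via get_cache_path (Example #3) when no cache_path is given. The URL is illustrative.

urls = get_url_list(
    "https://example.org/data/2021/*.nc",   # illustrative wildcard URL
    raise_on_empty=False,                   # return [] instead of raising when nothing matches
)
for u in urls[:5]:
    print(u)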