Example #1
def test_s3_read_netcdf(s3):
    url = f's3://{test_bucket_name}/example_1.nc'
    s3options = dict(client_kwargs={"endpoint_url": endpoint_uri})
    source = intake.open_netcdf(url, storage_options=s3options)
    ds = source.read()
    assert ds['rh'].isel(lat=0, lon=0, time=0).values.dtype == 'float32'
    assert ds['rh'].isel(lat=0, lon=0, time=0).values == 0.5
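The test above points intake at a mock S3 endpoint through fsspec storage_options. As a minimal sketch of the same pattern outside a test, assuming a public, anonymously readable bucket (the bucket and key below are placeholders, not from the test):

import intake

# hypothetical public object; any anonymously readable NetCDF file on S3 works the same way
url = 's3://some-public-bucket/path/to/example.nc'
source = intake.open_netcdf(url, storage_options={'anon': True})
ds = source.read()   # eager read into an in-memory xarray.Dataset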
Example #2
def test_http_read_netcdf_simplecache(data_server):
    url = f'simplecache::{data_server}/example_1.nc'
    source = intake.open_netcdf(url,
                                chunks={},
                                xarray_kwargs={"engine": "netcdf4"})
    ds = source.to_dask()
    assert isinstance(ds, xr.core.dataset.Dataset)
    assert isinstance(ds.temp.data, dask.array.core.Array)
Example #3
def test_http_read_netcdf_dask(data_server):
    url = f'{data_server}/next_example_1.nc'
    source = intake.open_netcdf(url,
                                chunks={},
                                xarray_kwargs=dict(engine='h5netcdf'))
    ds = source.to_dask()
    # assert isinstance(ds._file_obj, xr.backends.h5netcdf_.H5NetCDFStore)
    assert isinstance(ds, xr.core.dataset.Dataset)
    assert isinstance(ds.temp.data, dask.array.core.Array)
Example #4
def test_open_netcdf_s3_simplecache():
    bucket = 's3://its-live-data.jpl.nasa.gov'
    key = 'icesat2/alt06/rel003/ATL06_20181230162257_00340206_003_01.h5'
    url = f'simplecache::{bucket}/{key}'
    source = intake.open_netcdf(
        url,
        xarray_kwargs=dict(group='gt1l/land_ice_segments', engine='h5netcdf'),
        storage_options=dict(s3={'anon': True}),
    )
    ds = source.to_dask()
    # note: ds._file_obj is private xarray API and has been removed in newer releases
    assert isinstance(ds._file_obj, xr.backends.h5netcdf_.H5NetCDFStore)
    assert isinstance(ds, xr.core.dataset.Dataset)
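Example #4 chains the simplecache filesystem in front of s3, and storage_options is keyed by protocol so each layer is configured separately. A sketch of the same call with an explicit cache directory (the object path, group name, and cache location are assumptions, not part of the original test):

import intake

source = intake.open_netcdf(
    'simplecache::s3://some-bucket/some-granule.h5',            # hypothetical object
    xarray_kwargs=dict(group='some/group', engine='h5netcdf'),  # group name is a placeholder
    storage_options=dict(
        s3={'anon': True},                                      # options for the s3 layer
        simplecache={'cache_storage': '/tmp/intake-cache'},     # where cached copies are written
    ),
)
ds = source.to_dask()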
Example #5
def test_open_netcdf_gs():
    bucket = 'gs://ldeo-glaciology'
    key = 'bedmachine/BedMachineAntarctica_2019-11-05_v01.nc'
    url = f'{bucket}/{key}'
    source = intake.open_netcdf(
        url,
        chunks=3000,
        xarray_kwargs=dict(engine='h5netcdf'),
    )
    ds = source.to_dask()
    # note: ds._file_obj is private xarray API and has been removed in newer releases
    assert isinstance(ds._file_obj, xr.backends.h5netcdf_.H5NetCDFStore)
    assert isinstance(ds, xr.core.dataset.Dataset)
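Example #5 relies on the bucket being publicly readable with default credentials. If anonymous access needs to be explicit, gcsfs accepts token='anon' through storage_options; a sketch under that assumption (the object path is a placeholder):

import intake

source = intake.open_netcdf(
    'gs://some-public-bucket/some-file.nc',      # hypothetical object
    chunks=3000,
    xarray_kwargs=dict(engine='h5netcdf'),
    storage_options={'token': 'anon'},           # anonymous GCS access via gcsfs
)
ds = source.to_dask()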
Example #6
    def write_catalog(self):

        # if the catalog already exists, don't do this
        if os.path.exists(self.catalog_name):
            return

        else:
            lines = 'sources:\n'

            for filename in self.filenames:

                if 'csv' in filename:
                    file_intake = intake.open_csv(filename)
                    data = file_intake.read()
                    metadata = {
                        'variables': list(data.columns.values),
                        'geospatial_lon_min': float(data['longitude'].min()),
                        'geospatial_lat_min': float(data['latitude'].min()),
                        'geospatial_lon_max': float(data['longitude'].max()),
                        'geospatial_lat_max': float(data['latitude'].max()),
                        'time_coverage_start': data['time'].min(),
                        'time_coverage_end': data['time'].max()
                    }
                    file_intake.metadata = metadata
#                                             'time variables info': 'test', 'space variables info': 'test'}
                elif 'nc' in filename:
                    file_intake = intake.open_netcdf(filename)
                    data = file_intake.read()
                    metadata = {
                        'coords': list(data.coords.keys()),
                        'variables': list(data.data_vars.keys()),
                    }
                    file_intake.metadata = metadata

                file_intake.name = filename.split('/')[-1]
                lines += file_intake.yaml().strip('sources:')

            f = open(self.catalog_name, "w")
            f.write(lines)
            f.close()
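A sketch of how the catalog written above might be consumed afterwards (the file name stands in for whatever self.catalog_name was, and the first entry is picked arbitrarily):

import intake

cat = intake.open_catalog('catalog.yaml')        # assumed catalog_name
print(list(cat))                                 # one entry per input file
entry = cat[list(cat)[0]]
data = entry.read()                              # DataFrame for csv entries, Dataset for netcdf
print(entry.metadata)                            # the metadata block attached above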
Example #7
def generate_catalog(file_path_name, dataset_sub_name, parent_page, tags):
    """
    FILE_NAME: If there are more than one file, FILE_NAME is the pattern for the NetCDF files, otherwise, Name of the NetCDF file. e.g.: 'air.mon.mean.nc' 

    DATASET_SUB_NAME: Name of the directory containing the NetCDf data files, e.g.: 'GHCN_CAMS'. If there is subdirectory like monthly, daily, etc., it should also be included and separated by "_".

    PARENT_PAGE: Name of the parent directory in the dataset type hierarchy, e.g.: Temperature

    TAG: A dataset may need to be catalogued into multiple child catalogs, e.g.: "Atmosphere", "Temperature". Please keep the format consistent
    """
    file_path_name = file_path_name.strip('""')
    path, fileName = os.path.split(file_path_name)
    print("1 :" + file_path_name)
    print("2 :" + dataset_sub_name)
    print("3 :" + parent_page)
    print("4: " + tags)
    nfiles = len(glob.glob(file_path_name))
    # Set is_combine based on number of files
    if (nfiles > 1):
        is_combine = True
        print("More than one file###")
    else:
        print("one file###")
        is_combine = False

    temp = dataset_sub_name

    #print("file path name is "+ file_path_name)

    #print("dataset_sub_name is "+ dataset_sub_name)

    #print("parent page is " + parent_page)

    if is_combine:
        # Read with xarray
        source = xr.open_mfdataset(file_path_name,
                                   combine='nested',
                                   concat_dim='time')
        src = source
        # Use intake with xarray kwargs
        source = intake.open_netcdf(file_path_name,
                                    concat_dim='time',
                                    xarray_kwargs={
                                        'combine': 'nested',
                                        'decode_times': True
                                    })
    else:
        source = intake.open_netcdf(file_path_name)
        src = xr.open_dataset(file_path_name)
        source.discover()
    #print('subname' + dataset_sub_name)
    yaml_outf = open(dataset_sub_name.strip('""') + '.yaml', 'w')
    yaml_outf.write(source.yaml())
    yaml_outf.close()
    print(yaml_outf.name + " was cataloged")

    #############################################

    # CATALOG_DIR: Github repository containing the master catalog
    # NOTE: It will be more accurate later
    catalog_dir = "https://raw.githubusercontent.com/kpegion/COLA-DATASETS-CATALOG/gh-pages/intake-catalogs/"

    print(type(path))
    print(path)

    open_catalog = catalog_dir + temp + ".yaml"

    #print("Here is: {0}".format(open_catalog))
    try:
        title = src.attrs['title']
    except KeyError:
        title = dataset_sub_name
    try:
        url = src.attrs['References']
    except KeyError:
        url = ""
    # here the local path stands in for the url
    url = path
    html_repr = xr.core.formatting_html.dataset_repr(src).replace('\\n', '\n')
    _header = src_header(title, parent_page, open_catalog, url, tags,
                         open_catalog)

    tags = tags.split(',')
    _footer = src_footer()
    html_src = _header + html_repr + _footer
    page_name = fileName.replace('*', '').replace('..', '.')
    html_page = page_name + ".html"
    with open(html_page, "w") as file:
        file.write(html_src)

    print(html_page + " was created\n")
Example #8
import xarray as xr
import intake

path = '/shared/scratch/nbehboud/gridded/temp/GHCN_CAMS/'
mean_temp = 'air.mon.mean.nc'

# Use intake with xarray kwargs
source = intake.open_netcdf(path + mean_temp)
source.discover()

mean_outf = open('ghcn_cams.yaml', 'w')
mean_outf.write(source.yaml())
mean_outf.close()
Example #9
    def write_catalog(self):
        """Write catalog file."""

        # if the catalog already exists, don't do this
        if os.path.exists(self.catalog_name):
            return

        else:

            f = open(self.catalog_name, "w")

            if self.axds_type == "platform2":
                lines = "sources:\n"

                for dataset_id, dataset in self.search_results.items():
                    if self.filetype == "csv":
                        urlpath = dataset["source"]["files"]["data.csv.gz"][
                            "url"]
                        file_intake = intake.open_csv(
                            urlpath, csv_kwargs=dict(parse_dates=["time"]))
                    elif self.filetype == "netcdf":
                        key = [
                            key for key in dataset["source"]["files"].keys()
                            if ".nc" in key
                        ][0]
                        urlpath = dataset["source"]["files"][key]["url"]
                        file_intake = intake.open_netcdf(
                            urlpath
                        )  # , xarray_kwargs=dict(parse_dates=['time']))
                    # to get all metadata
                    # source = intake.open_textfiles(meta_url, decoder=json.loads)
                    # source.metadata = source.read()[0]
                    meta_url = dataset["source"]["files"]["meta.json"]["url"]
                    meta_url = meta_url.replace(" ", "%20")
                    attributes = pd.read_json(meta_url)["attributes"]
                    file_intake.description = attributes["summary"]
                    metadata = {
                        "urlpath": urlpath,
                        "meta_url": meta_url,
                        "platform_category": attributes["platform_category"],
                        "geospatial_lon_min": attributes["geospatial_lon_min"],
                        "geospatial_lat_min": attributes["geospatial_lat_min"],
                        "geospatial_lon_max": attributes["geospatial_lon_max"],
                        "geospatial_lat_max": attributes["geospatial_lat_max"],
                        "source_id": attributes["packrat_source_id"],
                        "packrat_uuid": attributes["packrat_uuid"],
                        "time_coverage_start":
                        attributes["time_coverage_start"],
                        "time_coverage_end": attributes["time_coverage_end"],
                    }
                    file_intake.metadata = metadata
                    file_intake.name = attributes["packrat_uuid"]
                    lines += file_intake.yaml().strip("sources:")

            elif self.axds_type == "layer_group":
                lines = """
plugins:
  source:
    - module: intake_xarray
sources:
"""
                # catalog entries are by module uuid and unique to opendap urls
                # dataset_ids are module uuids
                for dataset_id, dataset in self.search_results.items():

                    # layer_groups associated with module
                    layer_groups = dataset["data"]["layer_group_info"]

                    # get search results for layer_groups
                    urlpaths = []
                    for layer_group_uuid in layer_groups.keys():
                        url_layer_group = self.url_builder(
                            self.url_docs_base, dataset_id=layer_group_uuid)
                        search_results_lg = requests.get(
                            url_layer_group,
                            headers=self.search_headers).json()[0]

                        if "OPENDAP" in search_results_lg["data"][
                                "access_methods"]:
                            url = search_results_lg["source"]["layers"][0][
                                "thredds_opendap_url"]
                            if ".html" in url:
                                url = url.replace(".html", "")
                            urlpaths.append(url)
                        else:
                            urlpaths.append("")
                            logger.warning(
                                f"no opendap url for module: module uuid {dataset_id}, layer_group uuid {layer_group_uuid}"
                            )
                            continue

                    # there may be different urls for different layer_groups
                    # in which case associate the layer_group uuid with the dataset
                    # since the module uuid wouldn't be unique
                    # if there were no urlpaths for any of the layer_groups,
                    # urlpaths is like ['', '', '', '', '', '', '', '']
                    if len(set(urlpaths)) > 1:
                        logger.warning(
                            f"there are multiple urls for module: module uuid {dataset_id}. urls: {set(urlpaths)}"
                        )
                        for urlpath, layer_group_uuid in zip(
                                urlpaths, layer_groups.keys()):
                            lines += self.write_catalog_layer_group_entry(
                                dataset, layer_group_uuid, urlpath,
                                layer_groups)

                    # check for when no urlpaths, don't save entry
                    # if not opendap accessible
                    elif set(urlpaths) == {""}:
                        logger.warning(
                            f"no opendap url for module: module uuid {dataset_id} for any of its layer_groups. Do not include entry in catalog."
                        )
                        continue

                    else:
                        urlpath = list(set(urlpaths))[0]
                        # use module uuid
                        lines += self.write_catalog_layer_group_entry(
                            dataset, dataset_id, urlpath, layer_groups)

            f.write(lines)
            f.close()
Example #10
def test_http_read_netcdf(data_server):
    url = f'{data_server}/example_1.nc'
    source = intake.open_netcdf(url)
    ds = source.read()
    assert ds['rh'].isel(lat=0, lon=0, time=0).values.dtype == 'float32'
    assert ds['rh'].isel(lat=0, lon=0, time=0).values == 0.5
Example #11
def test_http_open_netcdf(data_server):
    url = f'{data_server}/example_1.nc'
    source = intake.open_netcdf(url)
    ds = source.to_dask()
    assert isinstance(ds, xr.core.dataset.Dataset)
    assert isinstance(ds.temp.data, numpy.ndarray)
Example #12
import intake
from intake_xarray.netcdf import NetCDFSource

print(list(intake.registry))

cat_source: NetCDFSource = intake.open_netcdf(
    '/Users/tpmaxwel/Dropbox/Tom/Data/MERRA/MERRA2/6hr/*.nc4',
    concat_dim="time")
cat_source.discover()

with open("./catalog_local.yaml", 'w') as f:
    f.write(cat_source.yaml())
Example #13
import intake
files_path = "/Users/tpmaxwel/Dropbox/Tom/Data/MERRA/DAILY/2005/JAN/*.nc"
datasource = intake.open_netcdf(files_path)
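Continuing from the snippet above, a sketch of reading the same multi-file glob lazily as a single dataset; the kwargs mirror the other examples on this page and are an assumption about how these particular files combine:

datasource = intake.open_netcdf(
    files_path,
    concat_dim='time',
    chunks={},                                   # request lazy dask arrays
    xarray_kwargs={'combine': 'nested'},
)
ds = datasource.to_dask()                        # xarray.Dataset backed by dask arrays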
Example #14
    def write_catalog(self):
        """Write catalog file."""

        # if the catalog already exists, don't do this
        if os.path.exists(self.catalog_name):
            return

        else:
            lines = "sources:\n"

            for filename in self.filenames:

                if "csv" in filename:
                    file_intake = intake.open_csv(filename)
                    data = file_intake.read()
                    #                     # Remove skiprows entry and input header entry that we want
                    #                     file_intake._csv_kwargs.pop("skiprows")
                    #                     file_intake._csv_kwargs.update({"header": [0, 1]})
                    metadata = {
                        "variables": list(data.columns.values),
                        "geospatial_lon_min": float(data["longitude"].min()),
                        "geospatial_lat_min": float(data["latitude"].min()),
                        "geospatial_lon_max": float(data["longitude"].max()),
                        "geospatial_lat_max": float(data["latitude"].max()),
                        "time_coverage_start": data["time"].min(),
                        "time_coverage_end": data["time"].max(),
                    }
                    file_intake.metadata = metadata

                elif "nc" in filename:
                    file_intake = intake.open_netcdf(filename)
                    data = file_intake.read()
                    coords = list(data.coords.keys())
                    if "T" in data.cf.get_valid_keys():
                        time_coverage_start = str(data.cf["T"].min().values)
                        time_coverage_end = str(data.cf["T"].max().values)
                    else:
                        time_coverage_start = ""
                        time_coverage_end = ""
                    if "longitude" in data.cf.get_valid_keys():
                        geospatial_lon_min = float(data.cf["longitude"].min())
                        geospatial_lon_max = float(data.cf["longitude"].max())
                    else:
                        geospatial_lon_min = ""
                        geospatial_lon_max = ""
                    if "latitude" in data.cf.get_valid_keys():
                        geospatial_lat_min = float(data.cf["latitude"].min())
                        geospatial_lat_max = float(data.cf["latitude"].max())
                    else:
                        geospatial_lat_min = ""
                        geospatial_lat_max = ""
                    metadata = {
                        "coords": coords,
                        "variables": list(data.data_vars.keys()),
                        "time_variable": data.cf["T"].name,
                        "lon_variable": data.cf["longitude"].name,
                        "lat_variable": data.cf["latitude"].name,
                        "geospatial_lon_min": geospatial_lon_min,
                        "geospatial_lon_max": geospatial_lon_max,
                        "geospatial_lat_min": geospatial_lat_min,
                        "geospatial_lat_max": geospatial_lat_max,
                        "time_coverage_start": time_coverage_start,
                        "time_coverage_end": time_coverage_end,
                    }
                    file_intake.metadata = metadata

                file_intake.name = filename.split("/")[-1]
                lines += file_intake.yaml().strip("sources:")

            f = open(self.catalog_name, "w")
            f.write(lines)
            f.close()
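The netcdf branch of write_catalog above leans on the cf_xarray accessor: importing cf_xarray registers .cf on xarray objects, and get_valid_keys() is an older cf_xarray spelling of what recent releases expose as .cf.keys(). A small sketch of that dependency (the file name is a placeholder):

import cf_xarray  # noqa: F401  -- registers the .cf accessor on xarray objects
import xarray as xr

ds = xr.open_dataset('example_1.nc')             # hypothetical local file
print(ds.cf.keys())                              # names the accessor can resolve, e.g. 'T', 'longitude', 'latitude'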
    def getDataSource(self, **kwargs) -> DataSource:
        cdim = kwargs.get("concat_dim", "time")
        datasource = intake.open_netcdf(self.files, concat_dim=cdim)
        datasource.discover()
        return datasource