def test_urlopen_requests_kwargs():
    """Test that urlopen can pass kwargs to requests."""
    base_url = "https://standards.sensors.ioos.us/erddap/tabledap/"
    timeout_seconds = 1  # request timeout in seconds
    slowwly_milliseconds = (timeout_seconds + 1) * 1000
    slowwly_url = f"https://flash-the-slow-api.herokuapp.com/delay/{slowwly_milliseconds}/url/{base_url}"
    with pytest.raises(httpx.ReadTimeout):
        urlopen(slowwly_url, timeout=timeout_seconds)

def test_urlopen_requests_kwargs():
    """Test that urlopen can pass kwargs to requests."""
    base_url = "http://erddap.sensors.ioos.us/erddap/tabledap/"
    timeout_seconds = 1  # request timeout in seconds
    slowwly_milliseconds = (timeout_seconds + 1) * 1000
    slowwly_url = (
        f"https://flash.siwalik.in/delay/{slowwly_milliseconds}/url/{base_url}"
    )
    with pytest.raises(ReadTimeout):
        urlopen(slowwly_url, timeout=timeout_seconds)

def get_dsinfo(e, stdname, cdm_data_type, min_time, max_time, skip_datasets):
    """Find all the datasets with a given standard_name in the specified
    time period and return them as a pandas DataFrame.
    """
    search_url = e.get_search_url(
        response="csv",
        cdm_data_type=cdm_data_type.lower(),
        items_per_page=100000,
        standard_name=stdname,
        min_time=min_time,
        max_time=max_time,
    )
    try:
        df = pd.read_csv(urlopen(search_url))
        for skip_dataset in skip_datasets:
            try:
                row = df.loc[df["Dataset ID"] == skip_dataset].index[0]
                df.drop(row, inplace=True)
            except IndexError:
                pass
    except HTTPError:
        df = pd.DataFrame([])
    return df

def get_timeseries(e, dataset=None, stdname=None, constraints=None):
    """Return the specified dataset time series values as a pandas DataFrame."""
    var = e.get_var_by_attr(
        dataset_id=dataset,
        standard_name=lambda v: str(v).lower() == stdname.lower(),
    )
    if var:
        var = var[0]
    else:
        raise ValueError(f"Cannot get data for {stdname}.")
    download_url = e.get_download_url(
        dataset_id=dataset,
        constraints=constraints,
        variables=["time", var],
        response="csv",
    )
    df = pd.read_csv(
        urlopen(download_url),
        index_col="time",
    )
    # The first data row of a .csv response holds the units.
    unit = df.iloc[0, 0]
    df = df.drop(labels=df.index[0])
    df.index = pd.to_datetime(
        df.index,
        utc=True,
    )
    df[var] = df[var].astype(float)
    return df, var, unit

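# A minimal usage sketch for get_timeseries, assuming an erddapy ERDDAP
# instance `e` as used above; the dataset id and constraints are hypothetical.
example_constraints = {
    "time>=": "2017-01-01T00:00:00Z",
    "time<=": "2017-02-01T00:00:00Z",
}
df, var, unit = get_timeseries(
    e,
    dataset="example-dataset-id",  # hypothetical dataset id
    stdname="sea_water_temperature",
    constraints=example_constraints,
)
df[var].plot(ylabel=unit)  # the units row was extracted before parsing
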
def get_timeseries(e, dataset=None, standard_name=None, constraints=None):
    """Return the specified dataset time series values as a pandas DataFrame."""
    var = e.get_var_by_attr(
        dataset_id=dataset,
        standard_name=lambda v: str(v).lower() == standard_name.lower(),
    )
    if var:
        var = var[0]
    else:
        raise ValueError(f"Cannot get data for {standard_name}.")
    # We should filter out only valid standard_names for each dataset!
    # df = pd.read_csv(e.get_info_url(response="csv"))
    # df.loc[df["Attribute Name"] == "standard_name"]["Value"].values
    download_url = e.get_download_url(
        dataset_id=dataset,
        constraints=constraints,
        variables=["time", var],
        response="csv",
    )
    df = pd.read_csv(
        urlopen(download_url),
        index_col="time",
        parse_dates=True,
        skiprows=[1],  # skip the units row
    )
    return df, var

def _multi_urlopen(url: str) -> BinaryIO:
    """Simpler url open to work with multiprocessing."""
    try:
        data = urlopen(url)
    except (httpx.HTTPError, httpx.ConnectError):
        return None
    return data

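# A sketch of why _multi_urlopen avoids kwargs: multiprocessing.Pool needs a
# picklable, module-level callable. The URL list passed in is hypothetical.
import multiprocessing


def fetch_all(urls):
    """Fetch several URLs in parallel, dropping any that failed."""
    with multiprocessing.Pool(processes=4) as pool:
        results = pool.map(_multi_urlopen, urls)
    return [data for data in results if data is not None]
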
def get_valid_stdnames(server_name):
    """Find all the `standard_name` attributes that exist on this ERDDAP
    endpoint, using ERDDAP's "categorize" service
    (http://www.neracoos.org/erddap/categorize/index.html).
    """
    server = servers[server_name]
    server_url = server.get("url")
    # global e
    e = ERDDAP(server=server_url, protocol="tabledap")
    url_standard_names = f"{server_url}/categorize/standard_name/index.csv"
    df = pd.read_csv(urlopen(url_standard_names), skiprows=[1, 2])
    standard_names = list(df["Category"].values)
    standard_names = remove_qcstdnames(standard_names)
    valid_standard_names = []
    count = 0
    print(
        "Checking the variables available for this server. "
        "This might take up to a couple of minutes...\n",
    )
    for standard_name in standard_names:
        count += 1
        if count == np.floor(len(standard_names) / 2):
            print("Halfway there...\n")
        elif count == np.floor((len(standard_names) / 4) * 3):
            print("Almost done...\n")
        elif count == len(standard_names):
            print("Done!")
        features, datasets = stdname2geojson(
            e,
            standard_name,
            server.get("cdm_data_type"),
            server.get("min_time"),
            server.get("max_time"),
            server.get("skip_datasets"),
        )
        if len(datasets) > 0:  # if there is at least one dataset with this data
            var = e.get_var_by_attr(
                dataset_id=datasets[0],
                standard_name=lambda v: str(v).lower() == standard_name.lower(),
            )
            if var != []:
                valid_standard_names.append(standard_name)
        del features, datasets
    return valid_standard_names, server, e

def test__tempnc():
    url = "https://data.ioos.us/gliders/erddap/tabledap/cp_336-20170116T1254.nc"
    data = urlopen(url)
    with _tempnc(data) as tmp:
        # Check that the file exists.
        assert os.path.exists(tmp)
        # Confirm that it is a netCDF file.
        assert tmp.endswith("nc")
    # Check that the file was removed.
    assert not os.path.exists(tmp)

def test__tempnc():
    """Test temporary netCDF file."""
    url = "https://podaac-opendap.jpl.nasa.gov/opendap/allData/modis/L3/aqua/11um/v2019.0/4km/daily/2017/365/AQUA_MODIS.20171231.L3m.DAY.NSST.sst.4km.nc"  # noqa
    data = urlopen(url)
    with _tempnc(data) as tmp:
        # Check that the file exists.
        assert os.path.exists(tmp)
        # Confirm that it is a netCDF file.
        assert tmp.endswith("nc")
    # Check that the file was removed.
    assert not os.path.exists(tmp)

def _griddap_get_constraints(
    dataset_url: str,
    step: int,
) -> Tuple[Dict, List, List]:
    """Fetch metadata of a griddap dataset and set initial constraints.

    The step size is applied to all dimensions.
    """
    dds_url = f"{dataset_url}.dds"
    url = urlopen(dds_url)
    data = url.read().decode()
    dims, *variables = data.split("GRID")
    dim_list = dims.split("[")[:-1]
    dim_names, variable_names = [], []
    for dim in dim_list:
        dim_name = dim.split(" ")[-1]
        dim_names.append(dim_name)
    for var in variables:
        phrase, *__ = var.split("[")
        var_name = phrase.split(" ")[-1]
        variable_names.append(var_name)
    # Build the rows first; DataFrame.append was removed in pandas 2.0.
    rows = []
    for dim in dim_names:
        url = f"{dataset_url}.csvp?{dim}"
        data = pd.read_csv(url).values
        if dim == "time":
            # For time, default the start of the range to the most recent value.
            data_start = data[-1][0]
        else:
            data_start = data[0][0]
        rows.append(
            {
                "dimension name": dim,
                "min": data_start,
                "max": data[-1][0],
                "length": len(data),
            },
        )
    table = pd.DataFrame(rows).set_index("dimension name")
    constraints_dict = {}
    for dim, data in table.iterrows():
        constraints_dict[f"{dim}>="] = data["min"]
        constraints_dict[f"{dim}<="] = data["max"]
        constraints_dict[f"{dim}_step"] = step
    return constraints_dict, dim_names, variable_names

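# A usage sketch for _griddap_get_constraints; the griddap URL is hypothetical
# and the server must serve .dds metadata and .csvp dimension values.
constraints, dim_names, variable_names = _griddap_get_constraints(
    "https://example.com/erddap/griddap/my_dataset",  # hypothetical dataset URL
    step=1,
)
# constraints now maps, e.g., "time>=", "time<=", and "time_step" to the
# initial bounds and step for each dimension.
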
def _nc_dataset(url, auth, **requests_kwargs: Dict):
    """Return a netCDF4-python Dataset from memory and fall back to disk if that fails."""
    from netCDF4 import Dataset

    data = urlopen(url=url, auth=auth, **requests_kwargs)
    try:
        return Dataset(Path(urlparse(url).path).name, memory=data.read())
    except OSError:
        # If libnetcdf is not compiled with in-memory support, fall back to a local tmp file.
        with _tempnc(data) as _nc:
            return Dataset(_nc)

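# A minimal sketch of calling _nc_dataset, assuming a public (auth=None)
# server; the URL is hypothetical. The returned object is a netCDF4.Dataset
# whether it was opened from memory or via the temporary-file fallback.
nc = _nc_dataset("https://example.com/erddap/tabledap/dataset.nc", auth=None)
print(nc.variables.keys())
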
def all_datasets_locations(e, cdm_data_type, min_time, max_time):
    """Return the lon,lat values from all datasets."""
    url_dset = (
        f"{e.server}"
        "/tabledap/allDatasets.csv?"
        "datasetID,minLongitude,minLatitude&"
        f'cdm_data_type="{cdm_data_type}"'
        f"&minTime<={max_time.to_datetime_string()}"
        f"&maxTime>={min_time.to_datetime_string()}"
    )
    url_dataset = quote(url_dset, safe=":/?&= ")
    del url_dset
    df = pd.read_csv(urlopen(url_dataset), skiprows=[1])
    return df

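# A usage sketch for all_datasets_locations; the .to_datetime_string() calls
# suggest pendulum datetimes, so this sketch assumes pendulum is available.
import pendulum

df_locations = all_datasets_locations(
    e,  # an ERDDAP instance as above
    cdm_data_type="TimeSeries",
    min_time=pendulum.parse("2017-01-01T00:00:00Z"),
    max_time=pendulum.parse("2017-02-01T00:00:00Z"),
)
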
def to_pandas(self, **kw):
    """Save a data request to a pandas.DataFrame.

    Accepts any `pandas.read_csv` keyword arguments.

    This method uses the .csvp [1] response as the default for simplicity,
    please check ERDDAP's documentation for the other csv options available.

    [1] Download a ISO-8859-1 .csv file with line 1: name (units).
        Times are ISO 8601 strings.
    """
    response = kw.pop("response", "csvp")
    url = self.get_download_url(response=response, **kw)
    data = urlopen(url, auth=self.auth, **self.requests_kwargs)
    return pd.read_csv(data, **kw)

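# A usage sketch for to_pandas, assuming the erddapy ERDDAP class and an
# illustrative dataset id; extra kwargs are forwarded to pandas.read_csv.
from erddapy import ERDDAP

e = ERDDAP(server="https://standards.sensors.ioos.us/erddap", protocol="tabledap")
e.dataset_id = "example-dataset-id"  # hypothetical dataset id
df = e.to_pandas()
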
def get_valid_stdnames(server_name):
    """Find all the `standard_name` attributes that exist on this ERDDAP
    endpoint, using ERDDAP's "categorize" service
    (http://www.neracoos.org/erddap/categorize/index.html).
    """
    server = servers[server_name]
    server_url = server.get("url")
    e = ERDDAP(server=server_url, protocol="tabledap")
    url_stdnames = f"{server_url}/categorize/standard_name/index.csv"
    df = pd.read_csv(urlopen(url_stdnames), skiprows=[1, 2])
    stdnames = list(df["Category"].values)
    stdnames = remove_qcstdnames(stdnames)
    valid_stdnames = []
    count = 0
    display(pn.Column(pn.panel(progressbar.name), progressbar))
    for stdname in stdnames:
        count += 1
        progressbar.value = int(count / len(stdnames) * 100)
        df_stdname = get_datasets(
            e,
            stdname,
            server.get("cdm_data_type"),
            server.get("min_time"),
            server.get("max_time"),
            server.get("skip_datasets"),
        )
        if not df_stdname.empty:
            var = e.get_var_by_attr(
                dataset_id=df_stdname.datasetID.values[0],
                standard_name=lambda v: str(v).lower() == stdname.lower(),
            )
            if var != []:
                valid_stdnames.append(stdname)
    return valid_stdnames, server, e

def to_iris(self, **kw):
    """Load the data request into an iris.CubeList.

    Accepts any `iris.load_raw` keyword arguments.
    """
    import iris

    response = "nc" if self.protocol == "griddap" else "ncCF"
    url = self.get_download_url(response=response, **kw)
    data = urlopen(url, auth=self.auth, **self.requests_kwargs)
    with _tempnc(data) as tmp:
        cubes = iris.load_raw(tmp, **kw)
        try:
            cubes.realise_data()
        except ValueError:
            # Fallback: touching cube.data forces the lazy data to load
            # before the temporary file is removed.
            iris.cube.CubeList([cube.data for cube in cubes])
        return cubes

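# A usage sketch for to_iris; iris must be installed and the dataset id is
# illustrative. The data are realized so the cubes outlive the temporary file.
e.dataset_id = "example-dataset-id"  # hypothetical dataset id
cubes = e.to_iris()
print([cube.name() for cube in cubes])
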
def get_dslocation(e, cdm_data_type, min_time, max_time):
    """Return the lon,lat values from all datasets."""
    max_time_str = max_time.strftime("%Y-%m-%d %H:%M:%S")
    min_time_str = min_time.strftime("%Y-%m-%d %H:%M:%S")
    url_dset = (
        f"{e.server}"
        "/tabledap/allDatasets.csv?"
        "datasetID,minLongitude,minLatitude&"
        f'cdm_data_type="{cdm_data_type}"'
        f"&minTime<={max_time_str}"
        f"&maxTime>={min_time_str}"
    )
    url_dataset = quote(url_dset, safe=":/?&= ")
    del url_dset
    df = pd.read_csv(urlopen(url_dataset), skiprows=[1])
    return df

def _get_variables(self, dataset_id: OptionalStr = None) -> Dict:
    if not dataset_id:
        dataset_id = self.dataset_id
    if dataset_id is None:
        raise ValueError(f"You must specify a valid dataset_id, got {dataset_id}")
    url = self.get_info_url(dataset_id=dataset_id, response="csv")
    variables = {}
    data = urlopen(url, auth=self.auth, **self.requests_kwargs)
    _df = pd.read_csv(data)
    self._dataset_id = dataset_id
    for variable in set(_df["Variable Name"]):
        attributes = (
            _df.loc[
                _df["Variable Name"] == variable,
                ["Attribute Name", "Value"],
            ]
            .set_index("Attribute Name")
            .to_dict()["Value"]
        )
        variables.update({variable: attributes})
    return variables

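# A sketch of the mapping _get_variables returns, assuming an instance with
# auth and requests_kwargs set; the dataset id is hypothetical. The result
# maps variable names to their attribute dictionaries, e.g.
#   {"temperature": {"standard_name": "sea_water_temperature",
#                    "units": "degree_C", ...}, ...}
variables = e._get_variables(dataset_id="example-dataset-id")
for name, attrs in variables.items():
    print(name, attrs.get("standard_name"), attrs.get("units"))
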
def test_urlopen_raise():
    """Assure that urlopen will raise for bad URLs."""
    url = "https://developer.mozilla.org/en-US/404"
    with pytest.raises(httpx.HTTPError):
        urlopen(url)

def test_urlopen():
    """Assure that urlopen is always a BytesIO object."""
    url = "https://standards.sensors.ioos.us/erddap/tabledap/"
    ret = urlopen(url)
    assert isinstance(ret, io.BytesIO)