print(e.get_download_url())

If we change the response to `html`, we can visualize the page.

def show_iframe(src):
    from IPython.display import HTML
    iframe = '<iframe src="{src}" width="100%" height="950"></iframe>'.format
    return HTML(iframe(src=src))

show_iframe(e.get_download_url(response="html"))

Additionally, the object has `.get_info_url()` and `.get_search_url()` methods that can be used to obtain the info and search URLs, respectively.

show_iframe(e.get_info_url(response="html"))

show_iframe(e.get_search_url(response="html"))

`erddapy` also provides simple methods to download the data in common formats, like `pandas.DataFrame` and `xarray.Dataset`.

df = e.to_pandas(index_col="time (UTC)", parse_dates=True).dropna()
df.head()

ds = e.to_xarray(decode_times=False)
ds["temperature"]

Here is a simple plot using the data from `xarray`.
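A minimal sketch of such a plot, assuming the dataset loaded above exposes `time`, `depth`, and `temperature` variables (these names follow the preceding cells and should be treated as assumptions for other datasets):

import matplotlib.pyplot as plt

# Scatter temperature against time, colored by depth.
# `ds` is the xarray.Dataset returned by e.to_xarray() above; because
# decode_times=False was used, the time axis is numeric rather than datetime.
fig, ax = plt.subplots(figsize=(11, 4))
sc = ax.scatter(ds["time"], ds["temperature"], c=ds["depth"], s=10, cmap="viridis_r")
ax.set_xlabel("time")
ax.set_ylabel("temperature")
fig.colorbar(sc, ax=ax, label="depth")
plt.show()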
class ErddapReader(Reader): """ This class searches ERDDAP servers. There are 2 known_servers but others can be input too. Attributes ---------- parallel: boolean If True, run with simple parallelization using `multiprocessing`. If False, run serially. known_server: string Two ERDDAP servers are built in to be known to this reader: "ioos" and "coastwatch". e: ERDDAP server instance e.protocol: string * "tabledap" (pandas, appropriate for reading as csv) * "griddap" (xarray, appropriate for reading as netcdf) e.server: string Return the server name columns: list Metadata columns name: string "erddap_ioos", "erddap_coastwatch", or a constructed string if the user inputs a new protocol and server. reader: string reader is defined as "ErddapReader". """ def __init__(self, known_server="ioos", protocol=None, server=None, parallel=True): """ Parameters ---------- known_server: string, optional Two ERDDAP servers are built in to be known to this reader: "ioos" and "coastwatch". protocol, server: string, optional For a user-defined ERDDAP server, input the protocol as one of the following: * "tabledap" (pandas, appropriate for reading as csv) * "griddap" (xarray, appropriate for reading as netcdf) and the server address (such as "http://erddap.sensors.ioos.us/erddap" or "http://coastwatch.pfeg.noaa.gov/erddap"). parallel: boolean If True, run with simple parallelization using `multiprocessing`. If False, run serially. """ self.parallel = parallel # hard wire this for now filetype = "netcdf" # either select a known server or input protocol and server string if known_server == "ioos": protocol = "tabledap" server = "http://erddap.sensors.ioos.us/erddap" filetype = "netcdf" # other option: "csv" elif known_server == "coastwatch": protocol = "griddap" server = "http://coastwatch.pfeg.noaa.gov/erddap" filetype = "netcdf" # other option: "csv" elif known_server is not None: statement = ( "either select a known server or input protocol and server string" ) assert (protocol is not None) & (server is not None), statement else: known_server = urllib.parse.urlparse(server).netloc # known_server = server.strip("/erddap").strip("http://").replace(".", "_") statement = ( "either select a known server or input protocol and server string" ) assert (protocol is not None) & (server is not None), statement self.known_server = known_server self.e = ERDDAP(server=server) self.e.protocol = protocol self.e.server = server self.filetype = filetype # columns for metadata self.columns = [ "geospatial_lat_min", "geospatial_lat_max", "geospatial_lon_min", "geospatial_lon_max", "time_coverage_start", "time_coverage_end", "defaultDataQuery", "subsetVariables", # first works for timeseries sensors, 2nd for gliders "keywords", # for hf radar "id", "infoUrl", "institution", "featureType", "source", "sourceUrl", ] # name self.name = f"erddap_{known_server}" self.reader = "ErddapReader" self.store = dict() def __getitem__(self, key): """Redefinition of dict-like behavior. This enables user to use syntax `reader[dataset_id]` to read in and save dataset into the object. Parameters ---------- key: str dataset_id for a dataset that is available in the search/reader object. Returns ------- xarray Dataset of the data associated with key """ returned_data = self.data_by_dataset(key) # returned_data = self._return_data(key) self.__setitem__(key, returned_data) return returned_data def find_dataset_id_from_station(self, station): """Find dataset_id from station name. 
Parameters ---------- station: string Station name for which to search for dataset_id """ if station is None: return None # for station in self._stations: # if station has more than one word, AND will be put between # to search for multiple terms together. url = self.e.get_search_url(response="csv", items_per_page=5, search_for=station) try: df = pd.read_csv(url) except Exception as e: logger.exception(e) logger.warning( f"search url {url} did not work for station {station}.") return # first try for exact station match try: # Special case for TABS when don't split the id name if "tabs" in station: # don't split dataset_id = [ dataset_id for dataset_id in df["Dataset ID"] if station.lower() == dataset_id.lower() ][0] else: # first try as dataset_id then do as station name dataset_id = [ dataset_id for dataset_id in df["Dataset ID"] if station.lower() in [dataset_id.lower()] + dataset_id.lower().split("_") ][0] except Exception as e: logger.exception(e) logger.warning( "When searching for a dataset id to match station name %s, the first attempt to match the id did not work." % (station)) # If that doesn't work, return None for dataset_id dataset_id = None # # if that doesn't work, trying for more general match and just take first returned option # dataset_id = df.iloc[0]["Dataset ID"] return dataset_id @property def dataset_ids(self): """Find dataset_ids for server. Notes ----- The dataset_ids are found by querying the metadata through the ERDDAP server. The number of dataset_ids can change if a variable is removed from the list of variables and this is rerun. """ if not hasattr(self, "_dataset_ids") or ( self.variables and (len(self.variables) != self.num_variables)): # This should be a region search if self.approach == "region": # find all the dataset ids which we will use to get the data # This limits the search to our keyword arguments in kw which should # have min/max lon/lat/time values dataset_ids = [] if self.variables is not None: for variable in self.variables: # find and save all dataset_ids associated with variable search_url = self.e.get_search_url( response="csv", **self.kw, variableName=variable, items_per_page=10000, ) try: search = pd.read_csv(search_url) dataset_ids.extend(search["Dataset ID"]) except Exception as e: logger.exception(e) logger.warning( f"variable {variable} was not found in the search" ) logger.warning(f"search_url: {search_url}") else: # find and save all dataset_ids associated with variable search_url = self.e.get_search_url(response="csv", **self.kw, items_per_page=10000) try: search = pd.read_csv(search_url) dataset_ids.extend(search["Dataset ID"]) except Exception as e: logger.exception(e) logger.warning("nothing found in the search") logger.warning(f"search_url: {search_url}") # only need a dataset id once since we will check them each for all standard_names self._dataset_ids = list(set(dataset_ids)) # This should be a search for the station names elif self.approach == "stations": # search by station name for each of stations if self.parallel: # get metadata for datasets # run in parallel to save time num_cores = multiprocessing.cpu_count() dataset_ids = Parallel(n_jobs=num_cores)( delayed(self.find_dataset_id_from_station)(station) for station in self._stations) else: dataset_ids = [] for station in self._stations: dataset_ids.append( self.find_dataset_id_from_station(station)) # remove None from list dataset_ids = [i for i in dataset_ids if i] # In this case return all dataset_ids so they match 1-1 with # the input station list. 
self._dataset_ids = dataset_ids else: logger.warning( "Neither stations nor region approach were used in function dataset_ids." ) # update number of variables if self.variables: self.num_variables = len(self.variables) return self._dataset_ids def meta_by_dataset(self, dataset_id): """Return the catalog metadata for a single dataset_id.""" info_url = self.e.get_info_url(response="csv", dataset_id=dataset_id) try: info = pd.read_csv(info_url) except Exception as e: logger.exception(e) logger.warning(f"Could not read info from {info_url}") return {dataset_id: []} items = [] for col in self.columns: try: item = info[info["Attribute Name"] == col]["Value"].values[0] dtype = info[info["Attribute Name"] == col]["Data Type"].values[0] except: if col == "featureType": # this column is not present in HF Radar metadata but want it to # map to data_type, so input 'grid' in that case. item = "grid" else: item = "NA" if dtype == "String": pass elif dtype == "double": item = float(item) elif dtype == "int": item = int(item) items.append(item) # include download link ## self.e.dataset_id = dataset_id if self.e.protocol == "tabledap": # set the same time restraints as before self.e.constraints = { "time<=": self.kw["max_time"], "time>=": self.kw["min_time"], } if self.filetype == "csv": download_url = self.e.get_download_url(response="csvp") elif self.filetype == "netcdf": download_url = self.e.get_download_url(response="ncCf") elif self.e.protocol == "griddap": # the search terms that can be input for tabledap do not work for griddap # in erddapy currently. Instead, put together an opendap link and then # narrow the dataset with xarray. # get opendap link download_url = self.e.get_download_url(response="opendap") # check if "prediction" is present in metadata, esp in case of NOAA # model predictions is_prediction = "Prediction" in " ".join( list(info["Value"].replace(np.nan, None).values)) # add erddap server name return { dataset_id: [self.e.server, download_url, info_url, is_prediction] + items + [self.variables] } @property def meta(self): """Rearrange the individual metadata into a dataframe. Notes ----- This should exclude duplicate entries. """ if not hasattr(self, "_meta"): if self.parallel: # get metadata for datasets # run in parallel to save time num_cores = multiprocessing.cpu_count() downloads = Parallel(n_jobs=num_cores)( delayed(self.meta_by_dataset)(dataset_id) for dataset_id in self.dataset_ids) else: downloads = [] for dataset_id in self.dataset_ids: downloads.append(self.meta_by_dataset(dataset_id)) # make dict from individual dicts from collections import ChainMap meta = dict(ChainMap(*downloads)) # Make dataframe of metadata # variable names are the column names for the dataframe self._meta = pd.DataFrame.from_dict( meta, orient="index", columns=[ "database", "download_url", "info_url", "is_prediction" ] + self.columns + ["variable names"], ) return self._meta def data_by_dataset(self, dataset_id): """Return the data for a single dataset_id. Returns ------- A tuple of (dataset_id, data), where data type is a pandas DataFrame Notes ----- Data is read into memory. 
""" if self.filetype == "csv": # if self.e.protocol == "tabledap": try: # fetch metadata if not already present # found download_url from metadata and use self.e.dataset_id = dataset_id # dataset_vars gives a list of the variables in the dataset dataset_vars = (self.meta.loc[dataset_id] ["defaultDataQuery"].split("&")[0].split(",")) # vars_present gives the variables in self.variables # that are actually in the dataset vars_present = [] for selfvariable in self.variables: vp = [var for var in dataset_vars if var == selfvariable] if len(vp) > 0: vars_present.append(vp[0]) # If any variables are not present, this doesn't work. if self.variables is not None: self.e.variables = [ "time", "longitude", "latitude", "station", ] + vars_present dd = self.e.to_pandas(response="csvp", index_col=0, parse_dates=True) # dd = self.e.to_pandas(response='csv', header=[0, 1], # index_col=0, parse_dates=True) # dd = pd.read_csv( # download_url, header=[0, 1], index_col=0, parse_dates=True # ) # Drop cols and rows that are only NaNs. dd = dd.dropna(axis="index", how="all").dropna(axis="columns", how="all") if self.variables is not None: # check to see if there is any actual data # this is a bit convoluted because the column names are the variable names # plus units so can't match 1 to 1. datacols = ( 0 # number of columns that represent data instead of metadata ) for col in dd.columns: datacols += [ varname in col for varname in self.variables ].count(True) # if no datacols, we can skip this one. if datacols == 0: dd = None except Exception as e: logger.exception(e) logger.warning("no data to be read in for %s" % dataset_id) dd = None elif self.filetype == "netcdf": # elif self.e.protocol == "griddap": if self.e.protocol == "tabledap": try: # assume I don't need to narrow in space since time series (tabledap) self.e.dataset_id = dataset_id dd = self.e.to_xarray() # dd = xr.open_dataset(download_url, chunks="auto") dd = dd.swap_dims({"obs": dd.cf["time"].name}) dd = dd.sortby(dd.cf["time"], ascending=True) dd = dd.cf.sel( T=slice(self.kw["min_time"], self.kw["max_time"])) # dd = dd.set_coords( # [dd.cf["longitude"].name, dd.cf["latitude"].name] # ) # use variable names to drop other variables (should. Ido this?) if self.variables is not None: # I don't think this is true with new approach # # ERDDAP prepends variables with 's.' in netcdf files, # # so include those with variables # erd_vars = [f's.{var}' for var in self.variables] # var_list = set(dd.data_vars) - (set(self.variables) | set(erd_vars)) var_list = set(dd.data_vars) - set(self.variables) dd = dd.drop_vars(var_list) # the lon/lat are on the 'timeseries' singleton dimension # but the data_var variable was not, which messed up # cf-xarray. When longitude and latitude are not on a # dimension shared with a variable, the variable can't be # called with cf-xarray. e.g. dd.cf['ssh'] won't work. 
if "timeseries" in dd.dims: for data_var in dd.data_vars: if "timeseries" not in dd[data_var].dims: dd[data_var] = dd[data_var].expand_dims( dim="timeseries", axis=1) except Exception as e: logger.exception(e) logger.warning("no data to be read in for %s" % dataset_id) dd = None elif self.e.protocol == "griddap": try: # this makes it read in the whole file which might be large self.e.dataset_id = dataset_id # dd = self.e.to_xarray(chunks="auto").sel( # time=slice(self.kw["min_time"], self.kw["max_time"]) # ) download_url = self.e.get_download_url(response="opendap") dd = xr.open_dataset(download_url, chunks="auto").sel( time=slice(self.kw["min_time"], self.kw["max_time"])) if ("min_lat" in self.kw) and ("max_lat" in self.kw): dd = dd.sel(latitude=slice(self.kw["min_lat"], self.kw["max_lat"])) if ("min_lon" in self.kw) and ("max_lon" in self.kw): dd = dd.sel(longitude=slice(self.kw["min_lon"], self.kw["max_lon"])) # use variable names to drop other variables (should. Ido this?) if self.variables is not None: vars_list = set(dd.data_vars) - set(self.variables) dd = dd.drop_vars(vars_list) except Exception as e: logger.exception(e) logger.warning("no data to be read in for %s" % dataset_id) dd = None # return (dataset_id, dd) return dd # @property def data(self, dataset_ids=None): """Read in data for some or all dataset_ids. NOT USED CURRENTLY Once data is read in for a dataset_ids, it is remembered. See full documentation in `utils.load_data()`. """ output = odg.utils.load_data(self, dataset_ids) return output
# In[9]: search_url = e.get_search_url(response='csv', **kw) search = pd.read_csv(search_url) gliders = search['Dataset ID'].values msg = 'Found {} Glider Datasets:\n\n{}'.format print(msg(len(gliders), '\n'.join(gliders))) # With the Dataset IDs we can explore the metadata with the *get_info_url* # In[10]: print(gliders[0]) info_url = e.get_info_url(dataset_id=gliders[0], response='csv') info = pd.read_csv(info_url) info.head() # In[11]: cdm_profile_variables = info.loc[info['Attribute Name'] == 'cdm_profile_variables', 'Value'] print(''.join(cdm_profile_variables)) # # Selecting variables by attributes # In[12]:
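# As a hedged illustration of selecting variables by attributes (the body of
# this cell is not shown here, and the standard names below are assumptions),
# erddapy's `get_var_by_attr` maps an attribute value back to the dataset's
# variable names:

e.dataset_id = gliders[0]
temp_salt_vars = e.get_var_by_attr(
    standard_name=lambda v: v in ('sea_water_temperature', 'sea_water_practical_salinity')
)
print(temp_salt_vars)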
def get_coordinates(df, **kw): ''' Example ERDDAP TableDAP URL: dataset_url = '%s/tabledap/%s.csvp?latitude,longitude,time&longitude>=-72.0&longitude<=-69&latitude>=38&latitude<=41&time>=1278720000.0&time<=1470787200.0&distinct()' % (all_datasets['server'].iloc[int(i)],all_datasets['Dataset ID'].iloc[int(i)]) ''' df_coords = pd.DataFrame() # alternate approach to above is iterate the original DataFrame passed (df), stopping either # at final_dataset_limit (10 currently) or the max # of rows in df (conclusion of for loop) # previous enclosing while loop is unnecessary as a result final_dataset_limit = 10 datasets_found = 0 if df.shape[0] < final_dataset_limit: final_dataset_limit = df.shape[0] index_random = random.sample(range(0, df.shape[0]), df.shape[0]) print("index_random: {}".format(index_random)) #for i in range(subset_datasets.shape[0]): for i in index_random: server_url = df['server'].iloc[int(i)] dataset_id = df['Dataset ID'].iloc[int(i)] institution = df['Institution'].iloc[int(i)] # skip some difficult datasets for now: if "ROMS" in dataset_id or "DOP" in dataset_id: # skip ROMS model output #print("Skipping %s" % server_url + dataset_id) continue e = ERDDAP(server=server_url, protocol='tabledap', response='csv') try: print("datasets_found: {}".format(datasets_found)) # former config for query, replaced with new code below: #e.variables=["latitude","longitude"]#,"time"] #e.dataset_id = all_datasets['Dataset ID'].iloc[int(i)] #e.constraints = { # "time>=": kw['min_time'], # "time<=": kw['max_time'], # "longitude>=": kw['min_lon'], # "longitude<=": kw['max_lon'], # "latitude>=": kw['min_lat'], # "latitude<=": kw['max_lat'], # "distinct" : () #} # Generate a download URL via e.get_download_url and pass to Pandas DataFrame via read_csv # we need to use e.constraints here rather than in e.get_download_url to allow appending '>=' '<=' to the contstraints keys to match ERDDAP's API # (parameter signature differs from the search API used above) # also add a 'distinct = ()' param, generate a download url, and submit a csv dataset download request to ERDDAP #kw["distinct"] = "()" e.constraints = { "time>=": kw['min_time'], "time<=": kw['max_time'], "longitude>=": kw['min_lon'], "longitude<=": kw['max_lon'], "latitude>=": kw['min_lat'], "latitude<=": kw['max_lat'], "distinct": () } url = e.get_download_url( #constraints=kw, response="csvp", dataset_id=df['Dataset ID'].iloc[int(i)], variables=["latitude", "longitude"]) print("Download URL: {}".format(url)) #coords = pd.read_csv(url, headers=headers) coords = pd.read_csv(url) coords['dataset_count'] = i coords['dataset_download_url'] = url coords['Dataset ID'] = dataset_id coords['Institution'] = institution metadata_url = e.get_info_url( dataset_id=df['Dataset ID'].iloc[int(i)], response='csv') metadata = pd.read_csv(metadata_url) coords['cdm_data_type'] = "".join( metadata.loc[metadata["Attribute Name"] == "cdm_data_type", "Value"]) #get_var_by_attr example (ToDo): #e.get_var_by_attr(dataset_id, standard_name='northward_sea_water_velocity') print(coords.head()) df_coords = pd.concat([df_coords, coords]) # reaching this point in the query means the dataset query was successful, increment # we need to break out of for loop here however if we reach final_dataset_limit to not go over: datasets_found += 1 print("new dataset acquired; datasets_found: {}".format( datasets_found)) if datasets_found == final_dataset_limit: break except Exception as ex: # can happen if the dataset does not have any features within the query window, just log it here: 
if type(ex).__name__ in ["HTTPError"]: print(ex) #raise pass return df_coords
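# --- Usage sketch (illustrative; not part of the original script) ------------
# `get_coordinates` expects a DataFrame with 'server', 'Dataset ID', and
# 'Institution' columns (e.g. assembled from one or more ERDDAP searches)
# plus min/max lon/lat/time keywords. The bounds below mirror the example URL
# in the function's docstring; `all_datasets` is an assumed search-result frame.

kw = {
    "min_time": "2010-07-10T00:00:00Z", "max_time": "2016-08-10T00:00:00Z",
    "min_lon": -72.0, "max_lon": -69.0,
    "min_lat": 38.0, "max_lat": 41.0,
}
# df_coords = get_coordinates(all_datasets, **kw)
# print(df_coords[['Dataset ID', 'Institution', 'dataset_count']].head())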
# * BUOY (surface buoy)
# * MFN (multifunction node - on the bottom of the ocean)
# * NSIF (near-surface instrument frame - located at 7 m depth)
#
# First, let's try the CTDBP on the NSIF:

url = erd.get_search_url(search_for='"CP01CNSM NSIF CTDBP"', response='csv')
datasets = to_df(url)['Dataset ID']
datasets

erd.dataset_id = datasets[0]

# Check what variables are available on the dataset:

info_url = erd.get_info_url(response='html')
show_iframe(info_url)

info_url = erd.get_info_url(response='csv')
info_df = to_df(info_url)
info_df

info_df[info_df['Row Type'] == 'variable']

# Take a look at the variables with standard names:

variables = erd.get_var_by_attr(standard_name=lambda v: v is not None)
variables

# These are the standard variables for the CTDBP instrument - specifically for the
# CP01CNSM-NSIF-CTDBP. Next, let's query the server for _all_ available data from
# the CP01CNSM-NSIF-CTDBP.
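# A hedged sketch of that request (the index column and use of the
# standard-name variables are assumptions, not necessarily the notebook's
# next cell):

erd.variables = variables            # the standard-name variables found above
data_df = erd.to_pandas(index_col='time (UTC)', parse_dates=True)
data_df.head()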
class ErddapReader: def __init__(self, known_server='ioos', protocol=None, server=None, parallel=True): # # run checks for KW # self.kw = kw self.parallel = parallel # either select a known server or input protocol and server string if known_server == 'ioos': protocol = 'tabledap' server = 'http://erddap.sensors.ioos.us/erddap' elif known_server == 'coastwatch': protocol = 'griddap' server = 'http://coastwatch.pfeg.noaa.gov/erddap' elif known_server is not None: statement = 'either select a known server or input protocol and server string' assert (protocol is not None) & (server is not None), statement else: known_server = server.strip('/erddap').strip('http://').replace('.','_') statement = 'either select a known server or input protocol and server string' assert (protocol is not None) & (server is not None), statement self.known_server = known_server self.e = ERDDAP(server=server) self.e.protocol = protocol self.e.server = server # columns for metadata self.columns = ['geospatial_lat_min', 'geospatial_lat_max', 'geospatial_lon_min', 'geospatial_lon_max', 'time_coverage_start', 'time_coverage_end', 'defaultDataQuery', 'subsetVariables', # first works for timeseries sensors, 2nd for gliders 'keywords', # for hf radar 'id', 'infoUrl', 'institution', 'featureType', 'source', 'sourceUrl'] # name self.name = f'erddap_{known_server}' self.reader = 'ErddapReader' # # self.data_type = data_type # self.standard_names = standard_names # # DOESN'T CURRENTLY LIMIT WHICH VARIABLES WILL BE FOUND ON EACH SERVER @property def dataset_ids(self): '''Find dataset_ids for server.''' if not hasattr(self, '_dataset_ids'): # This should be a region search if self.approach == 'region': # find all the dataset ids which we will use to get the data # This limits the search to our keyword arguments in kw which should # have min/max lon/lat/time values dataset_ids = [] if self.variables is not None: for variable in self.variables: # find and save all dataset_ids associated with variable search_url = self.e.get_search_url(response="csv", **self.kw, variableName=variable, items_per_page=10000) try: search = pd.read_csv(search_url) dataset_ids.extend(search["Dataset ID"]) except Exception as e: logger_erd.exception(e) logger_erd.warning(f"variable {variable} was not found in the search") logger_erd.warning(f'search_url: {search_url}') else: # find and save all dataset_ids associated with variable search_url = self.e.get_search_url(response="csv", **self.kw, items_per_page=10000) try: search = pd.read_csv(search_url) dataset_ids.extend(search["Dataset ID"]) except Exception as e: logger_erd.exception(e) logger_erd.warning(f"nothing found in the search") logger_erd.warning(f'search_url: {search_url}') # only need a dataset id once since we will check them each for all standard_names self._dataset_ids = list(set(dataset_ids)) # This should be a search for the station names elif self.approach == 'stations': # elif self._stations is not None: # search by station name for each of stations dataset_ids = [] for station in self._stations: # if station has more than one word, AND will be put between to search for multiple # terms together url = self.e.get_search_url(response="csv", items_per_page=5, search_for=station) try: df = pd.read_csv(url) except Exception as e: logger_erd.exception(e) logger_erd.warning(f'search url {url} did not work for station {station}.') continue # first try for exact station match try: dataset_id = [dataset_id for dataset_id in df['Dataset ID'] if station.lower() in dataset_id.lower().split('_')][0] 
# if that doesn't work, trying for more general match and just take first returned option except Exception as e: logger_erd.exception(e) logger_erd.warning('When searching for a dataset id to match station name %s, the first attempt to match the id did not work.' % (station)) dataset_id = df.iloc[0]['Dataset ID'] # if 'tabs' in org_id: # don't split # axiom_id = [axiom_id for axiom_id in df['Dataset ID'] if org_id.lower() == axiom_id.lower()] # else: # axiom_id = [axiom_id for axiom_id in df['Dataset ID'] if org_id.lower() in axiom_id.lower().split('_')][0] # except: # dataset_id = None dataset_ids.append(dataset_id) self._dataset_ids = list(set(dataset_ids)) else: logger_erd.warning('Neither stations nor region approach were used in function dataset_ids.') return self._dataset_ids def meta_by_dataset(self, dataset_id): info_url = self.e.get_info_url(response="csv", dataset_id=dataset_id) info = pd.read_csv(info_url) items = [] for col in self.columns: try: item = info[info['Attribute Name'] == col]['Value'].values[0] dtype = info[info['Attribute Name'] == col]['Data Type'].values[0] except: if col == 'featureType': # this column is not present in HF Radar metadata but want it to # map to data_type, so input 'grid' in that case. item = 'grid' else: item = 'NA' if dtype == 'String': pass elif dtype == 'double': item = float(item) elif dtype == 'int': item = int(item) items.append(item) # if self.standard_names is not None: # # In case the variable is named differently from the standard names, # # we back out the variable names here for each dataset. This also only # # returns those names for which there is data in the dataset. # varnames = self.e.get_var_by_attr( # dataset_id=dataset_id, # standard_name=lambda v: v in self.standard_names # ) # else: # varnames = None ## include download link ## self.e.dataset_id = dataset_id if self.e.protocol == 'tabledap': if self.variables is not None: self.e.variables = ["time","longitude", "latitude", "station"] + self.variables # set the same time restraints as before self.e.constraints = {'time<=': self.kw['max_time'], 'time>=': self.kw['min_time'],} download_url = self.e.get_download_url(response='csvp') elif self.e.protocol == 'griddap': # the search terms that can be input for tabledap do not work for griddap # in erddapy currently. Instead, put together an opendap link and then # narrow the dataset with xarray. 
# get opendap link download_url = self.e.get_download_url(response='opendap') # add erddap server name return {dataset_id: [self.e.server, download_url] + items + [self.variables]} @property def meta(self): if not hasattr(self, '_meta'): if self.parallel: # get metadata for datasets # run in parallel to save time num_cores = multiprocessing.cpu_count() downloads = Parallel(n_jobs=num_cores)( delayed(self.meta_by_dataset)(dataset_id) for dataset_id in self.dataset_ids ) else: downloads = [] for dataset_id in self.dataset_ids: downloads.append(self.meta_by_dataset(dataset_id)) # make dict from individual dicts from collections import ChainMap meta = dict(ChainMap(*downloads)) # Make dataframe of metadata # variable names are the column names for the dataframe self._meta = pd.DataFrame.from_dict(meta, orient='index', columns=['database','download_url'] \ + self.columns + ['variable names']) return self._meta def data_by_dataset(self, dataset_id): download_url = self.meta.loc[dataset_id, 'download_url'] # data variables in ds that are not the variables we searched for # varnames = self.meta.loc[dataset_id, 'variable names'] if self.e.protocol == 'tabledap': try: # fetch metadata if not already present # found download_url from metadata and use dd = pd.read_csv(download_url, index_col=0, parse_dates=True) # Drop cols and rows that are only NaNs. dd = dd.dropna(axis='index', how='all').dropna(axis='columns', how='all') if self.variables is not None: # check to see if there is any actual data # this is a bit convoluted because the column names are the variable names # plus units so can't match 1 to 1. datacols = 0 # number of columns that represent data instead of metadata for col in dd.columns: datacols += [varname in col for varname in self.variables].count(True) # if no datacols, we can skip this one. if datacols == 0: dd = None except Exception as e: logger_erd.exception(e) logger_erd.warning('no data to be read in for %s' % dataset_id) dd = None elif self.e.protocol == 'griddap': try: dd = xr.open_dataset(download_url, chunks='auto').sel(time=slice(self.kw['min_time'],self.kw['max_time'])) if ('min_lat' in self.kw) and ('max_lat' in self.kw): dd = dd.sel(latitude=slice(self.kw['min_lat'],self.kw['max_lat'])) if ('min_lon' in self.kw) and ('max_lon' in self.kw): dd = dd.sel(longitude=slice(self.kw['min_lon'],self.kw['max_lon'])) # use variable names to drop other variables (should. Ido this?) 
if self.variables is not None: l = set(dd.data_vars) - set(self.variables) dd = dd.drop_vars(l) except Exception as e: logger_erd.exception(e) logger_erd.warning('no data to be read in for %s' % dataset_id) dd = None return (dataset_id, dd) @property def data(self): if not hasattr(self, '_data'): if self.parallel: num_cores = multiprocessing.cpu_count() downloads = Parallel(n_jobs=num_cores)( delayed(self.data_by_dataset)(dataset_id) for dataset_id in self.dataset_ids ) else: downloads = [] for dataset_id in self.dataset_ids: downloads.append(self.data_by_dataset(dataset_id)) # if downloads is not None: dds = {dataset_id: dd for (dataset_id, dd) in downloads} # else: # dds = None self._data = dds return self._data def count(self,url): try: return len(pd.read_csv(url)) except: return np.nan def all_variables(self): '''Return a list of all possible variables.''' file_name_counts = f'erddap_variable_list_{self.known_server}.csv' if os.path.exists(file_name_counts): return pd.read_csv(file_name_counts, index_col='variable') else: # This took 10 min running in parallel for ioos # 2 min for coastwatch url = f'{self.e.server}/categorize/variableName/index.csv?page=1&itemsPerPage=100000' df = pd.read_csv(url) # counts = [] # for url in df.URL: # counts.append(self.count(url)) num_cores = multiprocessing.cpu_count() counts = Parallel(n_jobs=num_cores)( delayed(self.count)(url) for url in df.URL ) dfnew = pd.DataFrame() dfnew['variable'] = df['Category'] dfnew['count'] = counts dfnew = dfnew.set_index('variable') # remove nans if (dfnew.isnull().sum() > 0).values: dfnew = dfnew[~dfnew.isnull().values].astype(int) dfnew.to_csv(file_name_counts) return dfnew def search_variables(self, variables): '''Find valid variables names to use. Call with `search_variables()` to return the list of possible names. Call with `search_variables('salinity')` to return relevant names. ''' if not isinstance(variables, list): variables = [variables] # set up search for input variables search = f"(?i)" for variable in variables: search += f".*{variable}|" search = search.strip('|') r = re.compile(search) # just get the variable names df = self.all_variables() parameters = df.index matches = list(filter(r.match, parameters)) # return parameters that match input variable strings return df.loc[matches].sort_values('count', ascending=False) def check_variables(self, variables, verbose=False): if not isinstance(variables, list): variables = [variables] # parameters = list(self.all_variables().keys()) parameters = list(self.all_variables().index) # for a variable to exactly match a parameter # this should equal 1 count = [] for variable in variables: count += [parameters.count(variable)] condition = np.allclose(count,1) assertion = f'The input variables are not exact matches to ok variables for known_server {self.known_server}. \ \nCheck all parameter group values with `ErddapReader().all_variables()` \ \nor search parameter group values with `ErddapReader().search_variables({variables})`.\ \n\n Try some of the following variables:\n{str(self.search_variables(variables))}'# \ # \nor run `ErddapReader().check_variables("{variables}")' assert condition, assertion if condition and verbose: print('all variables are matches!')
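# --- Usage sketch (illustrative only; not part of the original module) -------
# As above, the search attributes (`kw`, `approach`, `variables`) are assumed
# to be set elsewhere before dataset_ids/meta/data are accessed. The variable
# helpers can be used on their own; note that building the cached variable
# list the first time can take several minutes (see all_variables above).

# reader = ErddapReader(known_server='ioos', parallel=False)
# print(reader.search_variables('salinity').head())   # ranked matching names
# reader.check_variables('salinity', verbose=True)    # raises AssertionError
#                                                     # if not an exact match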
class NDBC(): def __init__(self, station_id, deploy_id, WMO, currentTime, startTime, data_map, name_map): self.station_id = station_id self.deploy_id = deploy_id self.WMO = WMO self.now = currentTime self.startTime = startTime self.data_map = data_map self.name_map = name_map def adjust_pressure_to_sea_level(self, pres, temp, height): """Adjust barometric presure to sea-level.""" temp = temp + 273.15 slp = pres / np.exp(-height / (temp * 29.263)) return slp def calculate_wind_speed(self, eastward, northward): """Calculate absolute wind speed from component wind vector.""" u = np.square(eastward) v = np.square(northward) wind_speed = np.sqrt(u + v) return wind_speed def calculate_wind_direction(self, eastward, northward): """Calculate met wind direction from component wind vectors.""" u = eastward v = northward wind_direction = 180 / np.pi * np.arctan2(-u, -v) return wind_direction def _connect_erddap(self, server="http://ooivm1.whoi.net/erddap", protocol="tabledap"): """Connect to the erddap server.""" self._erddap = ERDDAP(server=server, protocol=protocol) def list_datasets(self): """Get the available datasets for the ERDDAP server.""" # First, make the connection self._connect_erddap() # Next, get the datasets datasets = pd.read_csv( self._erddap.get_search_url(search_for=self.station_id, response='csv'))['Dataset ID'] return datasets def get_dataset(self, dataset): """Get the data for specified datasets.""" # First, have to re-establish the erddap connection self._connect_erddap() # Next, get the data for a dataset self._erddap.dataset_id = dataset # Only want the variables with standard names variables = self._erddap.get_var_by_attr( standard_name=lambda v: v is not None) self._erddap.variables = variables # Limit the data request to the current deployment self._erddap.constraints = { 'deploy_id=': self.deploy_id, 'time>=': self.startTime.strftime('%Y-%m-%dT%H:%M:%SZ') } try: # Download the data data = self._erddap.to_pandas(index_col='time (UTC)', parse_dates=True) # Sometimes it just returns an empty dataframe instead of an error if data.size == 0: data = self._create_empty_dataset() except: # If there is no available data in the requested time window, need # to create an empty dataframe of the data data = self._create_empty_dataset() # Return the dataset data return data def process_METBK_data(self, df, freq='10T'): """Process the METBK into the correct format and values for NDBC.""" # Resample the data df_binned = df.resample(freq).mean() # Check that barometric pressure if 'barometric_pressure (mbar)' in df_binned.columns: # Adjust the barometric pressure to sea-level df_binned[ 'sea_level_pressure (hPa)'] = self.adjust_pressure_to_sea_level( df_binned['barometric_pressure (mbar)'], df_binned['air_temperature (degree_Celsius)'], 4.05) else: df_binned['sea_level_pressure (hPa)'] = np.nan # Check that the wind vector components are in the dataframe if 'eastward_wind_velocity (m s-1)' in df_binned.columns: # Calculate the wind speed df_binned['wind speed (m/s)'] = self.calculate_wind_speed( df_binned['eastward_wind_velocity (m s-1)'], df_binned['northward_wind_velocity (m s-1)']) # Calculate the wind direction df_binned['wind direction'] = self.calculate_wind_direction( df_binned['eastward_wind_velocity (m s-1)'], df_binned['northward_wind_velocity (m s-1)']) df_binned['wind direction'] = df_binned["wind direction"].apply( lambda x: x + 360 if x < 0 else x) # Don't need cardinal direction -> want direction in degrees # df_binned["wind direction"] = df_binned["wind direction"].apply( 
# lambda x: self.get_cardinal_direction(np.round(x, decimals=2))) else: df_binned['wind speed (m/s)'] = np.nan df_binned['wind direction'] = np.nan # Return the processed data return df_binned def process_WAVSS_data(self, df, freq='10T'): """Much simpler function for processing the WAVSS data.""" # Resample the data df_binned = df.resample(freq).mean() # Return the data return df_binned def _create_empty_dataset(self): """ Create a dataset of all nans if there is no data available for the requested dataset in the given time period. """ # Get the units for the corresponding variables info_url = self._erddap.get_info_url( dataset_id=self._erddap.dataset_id, response='csv') info = pd.read_csv(info_url) units = info[info['Attribute Name'] == 'units'] # Now, add the units to the variable names columns = [] for var in self._erddap.variables: unit = units[units['Variable Name'] == var]['Value'].values if len(unit) == 0: columns.append(f'{var}') elif var == 'time': pass else: columns.append(f'{var} ({unit[0]})') # Create an array of nans to fill out the empty dataframe empty_array = np.empty((2, len(columns))) empty_array[:] = np.nan # Put the empty array into a dataframe empty_df = pd.DataFrame(data=empty_array, columns=columns, index=[self.startTime, self.now]) empty_df.index.name = 'time (UTC)' return empty_df def process_datasets(self, datasets): """Process the data for individual datasets.""" self.datasets = datasets # Get the data for the individual datasets for dset in self.datasets.keys(): self.datasets.update({dset: self.get_dataset(dset)}) # Process the data for dset in self.datasets.keys(): if 'METBK' in dset: self.datasets[dset] = self.process_METBK_data( self.datasets[dset]) else: self.datasets[dset] = self.process_WAVSS_data( self.datasets[dset]) # Add a header to the data in the datasets for key in self.datasets.keys(): header = key.split('-', 2)[-1] for col in self.datasets.get(key).columns: self.datasets.get(key).rename( columns={col: ' '.join((header, col))}, inplace=True) def parse_data_to_xml(self, data): """ Function which takes in the 10-minute average buoy data, the station name, and two dictionaries which map the buoy column names to the xml tags, and outputs an xml file in the NDBC format. 
Returns: xml - a properly constructed xml file in the NDBC format for the given buoy data """ # Start the xml file xml = ['<?xml version="1.0" encoding="ISO-8859-1"?>'] # Iterate through the data for index in data.index: # Get the data associated with a row in the dataframe row = data.loc[index] # Reset a dictionary of the data xml_data = {} for key in self.data_map.keys(): xml_data.update({key: self.data_map.get(key)}) # Parse the data into the data dictionary for key in xml_data.keys(): # Get the column name which corresponds to the ndbc tag column = self.name_map.get(key) # Check that the column was returned from the ERDDAP server if column in row.index: value = row[column] # If a nan, just leave it the default -9999 if str(value) == 'nan': pass else: xml_data[key] = value # If no data, leave it as default -9999 else: pass # Write the parsed data to the xml file # Start the message xml.append('<message>') # Add in the station id xml.append(f' <station>{self.WMO}</station>') # Get the time index time = row.name.strftime('%m/%d/%Y %H:%M:%S') xml.append(f' <date>{time}</date>') # Missing fill value missing = str(-9999) xml.append(f' <missing>{missing}</missing>') # Roundtime xml.append(' <roundtime>no</roundtime>') # Start of the data xml.append(' <met>') # Add in each data piece for tag in xml_data.keys(): # Get the value value = xml_data.get(tag) value = str(value) # Add the data to the xml file xml.append(f' <{tag}>{value}</{tag}>') # Finish off the message xml.append(' </met>') xml.append('</message>') # Return the results return xml
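# --- Usage sketch (illustrative; not from the original module) ---------------
# `data_map` (NDBC tag -> -9999 default) and `name_map` (NDBC tag -> ERDDAP
# column name) are placeholders here, as are the station/deployment/WMO ids.

# import pandas as pd
#
# buoy = NDBC(station_id='CP01CNSM', deploy_id='D00014', WMO='44076',
#             currentTime=pd.Timestamp.utcnow(),
#             startTime=pd.Timestamp.utcnow() - pd.Timedelta('6H'),
#             data_map=data_map, name_map=name_map)
# buoy.process_datasets({dataset: None for dataset in buoy.list_datasets()})
# for name, df in buoy.datasets.items():
#     with open(f'{name}.xml', 'w') as f:
#         f.write('\n'.join(buoy.parse_data_to_xml(df)))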
def get_standard_variables_and_metadata(server_link, standard_variable_list): # Get access to the server and find datasets associated with standard_name variable listed e = ERDDAP(server=server_link, protocol='tabledap', response='csv') # Define Filter for which datasets to look into kw = { 'standard_name': ','.join(standard_variable_list), 'min_lon': -180.0, 'max_lon': 180.0, 'min_lat': -90.0, 'max_lat': 90.0, 'min_time': '', 'max_time': '', 'cdm_data_type': '' } variable_to_groupby = [('latitude', 'degrees_north'), ('longitude', 'degrees_east')] # Get available datasets from that server search_url = e.get_search_url(response='csv', **kw) datasets = pd.read_csv(search_url) # Print results print(e.server) print( str(len(datasets)) + " datasets contains " + ', '.join(standard_variable_list)) # Loop through different data sets and create a metadata dataFrame df = pd.DataFrame(columns=['Dataset ID']) for index, row in datasets.iterrows(): # Get Info from dataset (mostly min/max lat/long) print(row['Dataset ID']) info_url = e.get_info_url(dataset_id=row['Dataset ID'], response='csv') info = pd.read_csv(info_url) attribute_table = info.set_index( ['Row Type', 'Variable Name', 'Attribute Name']).transpose()['attribute'] # Try to get the distinct lat/long and time and depth range for that dataset, if it fails rely on the # ERDDAP metadata try: # If dataset is spread out geographically find distinct locations (may not work well for trajectory data) latlong_url = e.get_download_url( dataset_id=row['Dataset ID'], protocol='tabledap', variables=['latitude', 'longitude', 'time']) # Get add to the url commands to get distinct values and ordered with min and max time for each lat/long distinctMinMaxTime_url = latlong_url + '&distinct()&orderByMinMax(%22latitude%2Clongitude%2Ctime%22)' # Get lat/long and min/max depth for this dataset data = pd.read_csv(distinctMinMaxTime_url, header=[0, 1]) # Group data by latitude/longitude and get min max values data_reduced = data.groupby(by=variable_to_groupby).agg( ['min', 'max']).reset_index() if info[(info['Variable Name'] == 'depth')].size > 0: latlongdepth_url = e.get_download_url( dataset_id=row['Dataset ID'], protocol='tabledap', variables=['latitude', 'longitude', 'depth']) # Get add to the url commands to get distinct values and ordered with min and max depth for # each lat/long distinctMinMaxDepth_url = latlongdepth_url + \ '&distinct()&orderByMinMax(%22latitude%2Clongitude%2Cdepth%22)' # Get lat/long and min/max depth for this dataset data_depth = pd.read_csv(distinctMinMaxDepth_url, header=[0, 1]) # Group depth data by lat/long and get min max values data_depth_reduced = data_depth.groupby( by=variable_to_groupby).agg(['min', 'max']).reset_index() # Merge depth values with time data_reduced = data_reduced.merge(data_depth_reduced, on=variable_to_groupby, how='left') # Merge multi index column names data_reduced.columns = data_reduced.columns.map( ' '.join).str.strip(' ') except Exception as exception_error: print('Failed to read: ' + str(exception_error)) # If there's only one location, it could get the range from metadata # Find lat/long range of this dataset, if it's point we don't need to look into it min_latitude = float(attribute_table['NC_GLOBAL', 'geospatial_lat_min'].Value) max_latitude = float(attribute_table['NC_GLOBAL', 'geospatial_lat_max'].Value) min_longitude = float(attribute_table['NC_GLOBAL', 'geospatial_lon_min'].Value) max_longitude = float(attribute_table['NC_GLOBAL', 'geospatial_lon_max'].Value) # If min/max lat/long are the same 
don't go in the dataset if (min_latitude == max_latitude) & (min_longitude == max_longitude): data_reduced = pd.DataFrame(columns=['Dataset ID']) data_reduced = {} data_reduced['latitude degrees_north'] = min_latitude data_reduced['longitude degrees_east'] = min_longitude if 'depth' in attribute_table.columns and 'actual_range' in attribute_table[ 'depth'] and ('m' == attribute_table['depth', 'units']['Value']): depth_range = np.array( str.split( attribute_table['depth', 'actual_range']['Value'], ',')).astype(np.float) data_reduced['depth m min'] = depth_range[0] data_reduced['depth m max'] = depth_range[1] # Convert to DataFrame data_reduced = pd.DataFrame(data_reduced, index=[0]) print('Retrieved metadata') else: # Won't handle data with multiple location that it can't retrieve the data continue # Add Standard Name Variable Name to table info['Attribute Name'] == 'geospatial_lat_min' for var in standard_variable_list: data_reduced[var] = ','.join( e.get_var_by_attr(dataset_id=row['Dataset ID'], standard_name=var)) # Add cdm_data_type to table data_reduced['cdm_data_type'] = ','.join( info[info['Attribute Name'] == 'cdm_data_type']['Value'].values) # Add Dataset id to the table data_reduced['Dataset ID'] = row['Dataset ID'] # Merge that dataset ID with previously downloaded data df = df.append(data_reduced) # Add server to dataFrame df['server'] = e.server # Save resulting dataframe to a CSV, file name is based on the server address file_name = re.sub('https*://', '', e.server) file_name = re.sub("[\./]", '_', file_name) file_name = 'Server_List_' + file_name + '.csv' print('Save result to ' + file_name) df.to_csv(file_name) return df
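# --- Usage sketch (illustrative; the server URL and standard names are assumptions) ---
# Note: this loops over every matching dataset on the server, so it can take a while.

if __name__ == '__main__':
    standard_variables = ['sea_water_temperature', 'sea_water_practical_salinity']
    metadata_df = get_standard_variables_and_metadata(
        'http://erddap.sensors.ioos.us/erddap', standard_variables)
    print(metadata_df.head())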