def get_valid_stdnames(server_name): """Find all the `standard_name` attributes that exist on this ERDDAP endpoint, using [ERDDAP's "categorize" service] (http://www.neracoos.org/erddap/categorize/index.html)""" server = servers[server_name] server_url = server.get("url") # global e e = ERDDAP(server=server_url, protocol="tabledap") url_standard_names = f"{server_url}/categorize/standard_name/index.csv" df = pd.read_csv(urlopen(url_standard_names), skiprows=[1, 2]) standard_names = list(df["Category"].values) standard_names = remove_qcstdnames(standard_names) valid_standard_names = [] count = 0 print( "Checking the variables available for this server. This might take up to a couple of minutes...\n", ) for standard_name in standard_names: count += 1 if count == np.floor(len(standard_names) / 2): print("Halfway there...\n") elif count == np.floor((len(standard_names) / 4) * 3): print("Almost done...\n") elif count == (len(standard_names)): print("Done!") features, datasets = stdname2geojson( e, standard_name, server.get("cdm_data_type"), server.get("min_time"), server.get("max_time"), server.get("skip_datasets"), ) if len(datasets ) > 0: # if there is at least one dataset with this data var = e.get_var_by_attr( dataset_id=datasets[0], standard_name=lambda v: str(v).lower() == standard_name.lower( ), ) if var != []: valid_standard_names.append(standard_name) del features, datasets return valid_standard_names, server, e
def get_valid_stdnames(server_name): """Find all the `standard_name` attributes that exist on this ERDDAP endpoint, using [ERDDAP's "categorize" service] (http://www.neracoos.org/erddap/categorize/index.html)""" server = servers[server_name] server_url = server.get("url") e = ERDDAP(server=server_url, protocol="tabledap") url_stdnames = f"{server_url}/categorize/standard_name/index.csv" df = pd.read_csv(urlopen(url_stdnames), skiprows=[1, 2]) stdnames = list(df["Category"].values) stdnames = remove_qcstdnames(stdnames) valid_stdnames = [] count = 0 display(pn.Column(pn.panel(progressbar.name), progressbar)) for stdname in stdnames: count += 1 progressbar.value = int(count / (len(stdnames)) * 100) df_stdname = get_datasets( e, stdname, server.get("cdm_data_type"), server.get("min_time"), server.get("max_time"), server.get("skip_datasets"), ) if not df_stdname.empty: var = e.get_var_by_attr( dataset_id=df_stdname.datasetID.values[0], standard_name=lambda v: str(v).lower() == stdname.lower(), ) if var != []: valid_stdnames.append(stdname) return valid_stdnames, server, e
info_url = e.get_info_url(dataset_id=gliders[0], response="csv")
info = pd.read_csv(info_url)
info.head()

With the info URL we can filter the data using attributes.

cdm_profile_variables = info.loc[
    info["Attribute Name"] == "cdm_profile_variables", "Value"
]
print("".join(cdm_profile_variables))

In fact, that is such a common operation that `erddapy` brings its own method for filtering variables by their attributes. The next three cells request the variable names that have a `cdm_profile_variables` attribute, a `standard_name` of `sea_water_temperature`, and an axis, respectively.

e.get_var_by_attr(
    dataset_id=gliders[0],
    cdm_profile_variables=lambda v: v is not None,
)

e.get_var_by_attr(
    dataset_id="whoi_406-20160902T1700",
    standard_name="sea_water_temperature",
)

axis = e.get_var_by_attr(
    dataset_id="whoi_406-20160902T1700",
    axis=lambda v: v in ["X", "Y", "Z", "T"],
)
axis

With this method one can, for example, request data from multiple datasets using the standard_name.

def get_cf_vars(
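A minimal sketch of what such a helper could look like, assuming it only needs to map each dataset ID to the variable name(s) carrying a requested `standard_name`; the signature and body are guesses, not the original implementation.

def get_cf_vars(e, dataset_ids, standard_name):
    """Hypothetical helper: map each dataset ID to the variable name(s)
    whose standard_name matches the one requested."""
    return {
        dataset_id: e.get_var_by_attr(
            dataset_id=dataset_id,
            standard_name=standard_name,
        )
        for dataset_id in dataset_ids
    }

# Example, assuming `gliders` is the list of glider dataset IDs used above:
# get_cf_vars(e, gliders, "sea_water_temperature")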
info.head()

# In[11]:

cdm_profile_variables = info.loc[
    info['Attribute Name'] == 'cdm_profile_variables', 'Value'
]
print(''.join(cdm_profile_variables))

# # Selecting variables by attributes

# In[12]:

e.get_var_by_attr(
    dataset_id='CP02PMCI-WFP01-03-CTDPFK000-ctdpf_ckl_wfp_instrument-telemetered-deployment0008-tabledap',
    standard_name='sea_water_temperature',
)

# # Easy to use CF conventions standards

# In[13]:

t_vars = [
    e.get_var_by_attr(dataset_id=glider, standard_name='sea_water_temperature')[0]
    for glider in gliders
]
t_vars

# In[14]:
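A hedged sketch of one way to continue from here: pair each glider with its temperature variable name and download a small dataframe per dataset. The time constraint below is illustrative, not from the original notebook.

dfs = {}
for glider, t_var in zip(gliders, t_vars):
    e.dataset_id = glider
    e.variables = ['time', 'latitude', 'longitude', t_var]
    e.constraints = {'time>=': '2016-09-03T00:00:00Z'}  # illustrative constraint
    dfs[glider] = e.to_pandas(index_col='time (UTC)', parse_dates=True)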
# Check what variables are available on the dataset:
info_url = erd.get_info_url(response='html')
show_iframe(info_url)

info_url = erd.get_info_url(response='csv')
info_df = to_df(info_url)
info_df

info_df[info_df['Row Type'] == 'variable']

# Take a look at the variables with standard names:
variables = erd.get_var_by_attr(standard_name=lambda v: v is not None)
variables

# These are the standard variables for the CTDBP instrument, specifically for the
# CP01CNSM-NSIF-CTDBP. Next, let's query the server for _all_ available data from
# the CP01CNSM-NSIF-CTDBP.
erd.variables = variables
erd.get_download_url()

# Put it all into a dataframe:
data = erd.to_pandas()

# +
# Plot a basic time-series of the conductivity
import matplotlib.pyplot as plt
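A minimal sketch of the conductivity time series, assuming the downloaded dataframe keeps a 'time (UTC)' column and that the conductivity column can be located by name; the exact column label depends on the dataset.

import pandas as pd

data = data.set_index(pd.to_datetime(data['time (UTC)']))
# The column name is a guess; inspect data.columns for the exact label.
conductivity_col = [c for c in data.columns if 'conductivity' in c.lower()][0]

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(data.index, data[conductivity_col])
ax.set_xlabel('Time (UTC)')
ax.set_ylabel(conductivity_col)
ax.set_title('CP01CNSM-NSIF-CTDBP conductivity')
plt.show()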
class NDBC():
    def __init__(self, station_id, deploy_id, WMO, currentTime, startTime,
                 data_map, name_map):
        self.station_id = station_id
        self.deploy_id = deploy_id
        self.WMO = WMO
        self.now = currentTime
        self.startTime = startTime
        self.data_map = data_map
        self.name_map = name_map

    def adjust_pressure_to_sea_level(self, pres, temp, height):
        """Adjust barometric pressure to sea level."""
        temp = temp + 273.15
        slp = pres / np.exp(-height / (temp * 29.263))
        return slp

    def calculate_wind_speed(self, eastward, northward):
        """Calculate absolute wind speed from component wind vectors."""
        u = np.square(eastward)
        v = np.square(northward)
        wind_speed = np.sqrt(u + v)
        return wind_speed

    def calculate_wind_direction(self, eastward, northward):
        """Calculate meteorological wind direction from component wind vectors."""
        u = eastward
        v = northward
        wind_direction = 180 / np.pi * np.arctan2(-u, -v)
        return wind_direction

    def _connect_erddap(self, server="http://ooivm1.whoi.net/erddap",
                        protocol="tabledap"):
        """Connect to the ERDDAP server."""
        self._erddap = ERDDAP(server=server, protocol=protocol)

    def list_datasets(self):
        """Get the available datasets for the ERDDAP server."""
        # First, make the connection
        self._connect_erddap()
        # Next, get the datasets
        datasets = pd.read_csv(
            self._erddap.get_search_url(search_for=self.station_id,
                                        response='csv'))['Dataset ID']
        return datasets

    def get_dataset(self, dataset):
        """Get the data for specified datasets."""
        # First, have to re-establish the erddap connection
        self._connect_erddap()
        # Next, get the data for a dataset
        self._erddap.dataset_id = dataset
        # Only want the variables with standard names
        variables = self._erddap.get_var_by_attr(
            standard_name=lambda v: v is not None)
        self._erddap.variables = variables
        # Limit the data request to the current deployment
        self._erddap.constraints = {
            'deploy_id=': self.deploy_id,
            'time>=': self.startTime.strftime('%Y-%m-%dT%H:%M:%SZ')
        }
        try:
            # Download the data
            data = self._erddap.to_pandas(index_col='time (UTC)',
                                          parse_dates=True)
            # Sometimes it just returns an empty dataframe instead of an error
            if data.size == 0:
                data = self._create_empty_dataset()
        except Exception:
            # If there is no data available in the requested time window,
            # create an empty dataframe instead
            data = self._create_empty_dataset()
        # Return the dataset data
        return data

    def process_METBK_data(self, df, freq='10T'):
        """Process the METBK data into the correct format and values for NDBC."""
        # Resample the data
        df_binned = df.resample(freq).mean()

        # Check for barometric pressure
        if 'barometric_pressure (mbar)' in df_binned.columns:
            # Adjust the barometric pressure to sea level
            df_binned['sea_level_pressure (hPa)'] = self.adjust_pressure_to_sea_level(
                df_binned['barometric_pressure (mbar)'],
                df_binned['air_temperature (degree_Celsius)'],
                4.05)
        else:
            df_binned['sea_level_pressure (hPa)'] = np.nan

        # Check that the wind vector components are in the dataframe
        if 'eastward_wind_velocity (m s-1)' in df_binned.columns:
            # Calculate the wind speed
            df_binned['wind speed (m/s)'] = self.calculate_wind_speed(
                df_binned['eastward_wind_velocity (m s-1)'],
                df_binned['northward_wind_velocity (m s-1)'])
            # Calculate the wind direction
            df_binned['wind direction'] = self.calculate_wind_direction(
                df_binned['eastward_wind_velocity (m s-1)'],
                df_binned['northward_wind_velocity (m s-1)'])
            df_binned['wind direction'] = df_binned['wind direction'].apply(
                lambda x: x + 360 if x < 0 else x)
            # Don't need cardinal direction -> want direction in degrees
            # df_binned["wind direction"] = df_binned["wind direction"].apply(
            #     lambda x: self.get_cardinal_direction(np.round(x, decimals=2)))
        else:
            df_binned['wind speed (m/s)'] = np.nan
            df_binned['wind direction'] = np.nan

        # Return the processed data
        return df_binned

    def process_WAVSS_data(self, df, freq='10T'):
        """Much simpler function for processing the WAVSS data."""
        # Resample the data
        df_binned = df.resample(freq).mean()
        # Return the data
        return df_binned

    def _create_empty_dataset(self):
        """
        Create a dataset of all NaNs if there is no data available for the
        requested dataset in the given time period.
        """
        # Get the units for the corresponding variables
        info_url = self._erddap.get_info_url(
            dataset_id=self._erddap.dataset_id, response='csv')
        info = pd.read_csv(info_url)
        units = info[info['Attribute Name'] == 'units']

        # Now, add the units to the variable names
        columns = []
        for var in self._erddap.variables:
            unit = units[units['Variable Name'] == var]['Value'].values
            if len(unit) == 0:
                columns.append(f'{var}')
            elif var == 'time':
                pass
            else:
                columns.append(f'{var} ({unit[0]})')

        # Create an array of NaNs to fill out the empty dataframe
        empty_array = np.empty((2, len(columns)))
        empty_array[:] = np.nan

        # Put the empty array into a dataframe
        empty_df = pd.DataFrame(data=empty_array, columns=columns,
                                index=[self.startTime, self.now])
        empty_df.index.name = 'time (UTC)'

        return empty_df

    def process_datasets(self, datasets):
        """Process the data for individual datasets."""
        self.datasets = datasets

        # Get the data for the individual datasets
        for dset in self.datasets.keys():
            self.datasets.update({dset: self.get_dataset(dset)})

        # Process the data
        for dset in self.datasets.keys():
            if 'METBK' in dset:
                self.datasets[dset] = self.process_METBK_data(self.datasets[dset])
            else:
                self.datasets[dset] = self.process_WAVSS_data(self.datasets[dset])

        # Add a header to the data in the datasets
        for key in self.datasets.keys():
            header = key.split('-', 2)[-1]
            for col in self.datasets.get(key).columns:
                self.datasets.get(key).rename(
                    columns={col: ' '.join((header, col))}, inplace=True)

    def parse_data_to_xml(self, data):
        """
        Take the 10-minute average buoy data, the station name, and the two
        dictionaries which map the buoy column names to the XML tags, and
        output an XML file in the NDBC format.
        Returns:
            xml - a properly constructed XML file in the NDBC format for the
            given buoy data
        """
        # Start the xml file
        xml = ['<?xml version="1.0" encoding="ISO-8859-1"?>']

        # Iterate through the data
        for index in data.index:
            # Get the data associated with a row in the dataframe
            row = data.loc[index]

            # Reset a dictionary of the data
            xml_data = {}
            for key in self.data_map.keys():
                xml_data.update({key: self.data_map.get(key)})

            # Parse the data into the data dictionary
            for key in xml_data.keys():
                # Get the column name which corresponds to the NDBC tag
                column = self.name_map.get(key)
                # Check that the column was returned from the ERDDAP server
                if column in row.index:
                    value = row[column]
                    # If it is a NaN, just leave the default -9999
                    if str(value) == 'nan':
                        pass
                    else:
                        xml_data[key] = value
                # If there is no data, leave the default -9999
                else:
                    pass

            # Write the parsed data to the xml file
            # Start the message
            xml.append('<message>')
            # Add in the station id
            xml.append(f'  <station>{self.WMO}</station>')
            # Get the time index
            time = row.name.strftime('%m/%d/%Y %H:%M:%S')
            xml.append(f'  <date>{time}</date>')
            # Missing fill value
            missing = str(-9999)
            xml.append(f'  <missing>{missing}</missing>')
            # Roundtime
            xml.append('  <roundtime>no</roundtime>')
            # Start of the data
            xml.append('  <met>')
            # Add in each data piece
            for tag in xml_data.keys():
                # Get the value
                value = xml_data.get(tag)
                value = str(value)
                # Add the data to the xml file
                xml.append(f'    <{tag}>{value}</{tag}>')
            # Finish off the message
            xml.append('  </met>')
            xml.append('</message>')

        # Return the results
        return xml
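A minimal end-to-end sketch of driving the class above. The station ID, deployment, WMO number, and tag/column maps are placeholders, not values from the original script.

import datetime

# Placeholder NDBC tag defaults and the dataframe columns they map to;
# the real script supplies its own mappings.
data_map = {'baro1': -9999, 'wspd1': -9999, 'wdir1': -9999}
name_map = {
    'baro1': 'METBK-A sea_level_pressure (hPa)',
    'wspd1': 'METBK-A wind speed (m/s)',
    'wdir1': 'METBK-A wind direction',
}

now = datetime.datetime.utcnow()
start = now - datetime.timedelta(days=1)

buoy = NDBC(station_id='CP01CNSM', deploy_id='D00015', WMO='44076',
            currentTime=now, startTime=start,
            data_map=data_map, name_map=name_map)

# Fetch, bin, and label every dataset found for the station, then write
# one NDBC-format XML message file from the merged 10-minute data.
datasets = {dset: None for dset in buoy.list_datasets()}
buoy.process_datasets(datasets)
merged = pd.concat(list(buoy.datasets.values()), axis=1)
xml_lines = buoy.parse_data_to_xml(merged)
with open('ndbc_message.xml', 'w') as f:
    f.write('\n'.join(xml_lines))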
import re

import numpy as np
import pandas as pd
from erddapy import ERDDAP


def get_standard_variables_and_metadata(server_link, standard_variable_list):

    # Get access to the server and find datasets associated with the
    # standard_name variables listed
    e = ERDDAP(server=server_link, protocol='tabledap', response='csv')

    # Define the filter for which datasets to look into
    kw = {
        'standard_name': ','.join(standard_variable_list),
        'min_lon': -180.0,
        'max_lon': 180.0,
        'min_lat': -90.0,
        'max_lat': 90.0,
        'min_time': '',
        'max_time': '',
        'cdm_data_type': ''
    }

    variable_to_groupby = [('latitude', 'degrees_north'),
                           ('longitude', 'degrees_east')]

    # Get available datasets from that server
    search_url = e.get_search_url(response='csv', **kw)
    datasets = pd.read_csv(search_url)

    # Print results
    print(e.server)
    print(str(len(datasets)) + ' datasets contain ' +
          ', '.join(standard_variable_list))

    # Loop through the datasets and build a metadata dataframe
    df = pd.DataFrame(columns=['Dataset ID'])

    for index, row in datasets.iterrows():
        # Get info from the dataset (mostly min/max lat/long)
        print(row['Dataset ID'])
        info_url = e.get_info_url(dataset_id=row['Dataset ID'], response='csv')
        info = pd.read_csv(info_url)
        attribute_table = info.set_index(
            ['Row Type', 'Variable Name', 'Attribute Name']).transpose()['attribute']

        # Try to get the distinct lat/long and the time and depth ranges for this
        # dataset; if that fails, rely on the ERDDAP metadata instead
        try:
            # If the dataset is spread out geographically, find the distinct
            # locations (may not work well for trajectory data)
            latlong_url = e.get_download_url(
                dataset_id=row['Dataset ID'],
                protocol='tabledap',
                variables=['latitude', 'longitude', 'time'])

            # Append URL commands to get distinct values, ordered with min and
            # max time for each lat/long
            distinctMinMaxTime_url = latlong_url + \
                '&distinct()&orderByMinMax(%22latitude%2Clongitude%2Ctime%22)'

            # Get lat/long and min/max time for this dataset
            data = pd.read_csv(distinctMinMaxTime_url, header=[0, 1])

            # Group data by latitude/longitude and get min/max values
            data_reduced = data.groupby(by=variable_to_groupby).agg(
                ['min', 'max']).reset_index()

            if info[(info['Variable Name'] == 'depth')].size > 0:
                latlongdepth_url = e.get_download_url(
                    dataset_id=row['Dataset ID'],
                    protocol='tabledap',
                    variables=['latitude', 'longitude', 'depth'])

                # Append URL commands to get distinct values, ordered with min
                # and max depth for each lat/long
                distinctMinMaxDepth_url = latlongdepth_url + \
                    '&distinct()&orderByMinMax(%22latitude%2Clongitude%2Cdepth%22)'

                # Get lat/long and min/max depth for this dataset
                data_depth = pd.read_csv(distinctMinMaxDepth_url, header=[0, 1])

                # Group depth data by lat/long and get min/max values
                data_depth_reduced = data_depth.groupby(
                    by=variable_to_groupby).agg(['min', 'max']).reset_index()

                # Merge depth values with time
                data_reduced = data_reduced.merge(data_depth_reduced,
                                                  on=variable_to_groupby,
                                                  how='left')

            # Merge the multi-index column names
            data_reduced.columns = data_reduced.columns.map(
                ' '.join).str.strip(' ')

        except Exception as exception_error:
            print('Failed to read: ' + str(exception_error))

            # If there is only one location, the range can come from the metadata.
            # Find the lat/long range of this dataset; if it is a single point we
            # do not need to query the data itself.
            min_latitude = float(attribute_table['NC_GLOBAL', 'geospatial_lat_min'].Value)
            max_latitude = float(attribute_table['NC_GLOBAL', 'geospatial_lat_max'].Value)
            min_longitude = float(attribute_table['NC_GLOBAL', 'geospatial_lon_min'].Value)
            max_longitude = float(attribute_table['NC_GLOBAL', 'geospatial_lon_max'].Value)

            # If min/max lat/long are the same,
            # use the metadata location instead of querying the dataset
            if (min_latitude == max_latitude) & (min_longitude == max_longitude):
                data_reduced = {}
                data_reduced['latitude degrees_north'] = min_latitude
                data_reduced['longitude degrees_east'] = min_longitude

                if ('depth' in attribute_table.columns
                        and 'actual_range' in attribute_table['depth']
                        and 'm' == attribute_table['depth', 'units']['Value']):
                    depth_range = np.array(
                        str.split(attribute_table['depth', 'actual_range']['Value'],
                                  ',')).astype(float)
                    data_reduced['depth m min'] = depth_range[0]
                    data_reduced['depth m max'] = depth_range[1]

                # Convert to DataFrame
                data_reduced = pd.DataFrame(data_reduced, index=[0])
                print('Retrieved metadata')
            else:
                # Can't handle datasets with multiple locations when the data
                # itself can't be retrieved
                continue

        # Add the variable name matching each standard_name to the table
        for var in standard_variable_list:
            data_reduced[var] = ','.join(
                e.get_var_by_attr(dataset_id=row['Dataset ID'],
                                  standard_name=var))

        # Add cdm_data_type to the table
        data_reduced['cdm_data_type'] = ','.join(
            info[info['Attribute Name'] == 'cdm_data_type']['Value'].values)

        # Add the dataset ID to the table
        data_reduced['Dataset ID'] = row['Dataset ID']

        # Merge that dataset's metadata with the previously collected rows
        df = pd.concat([df, data_reduced], ignore_index=True)

    # Add the server to the dataframe
    df['server'] = e.server

    # Save the resulting dataframe to a CSV; the file name is based on the
    # server address
    file_name = re.sub('https*://', '', e.server)
    file_name = re.sub(r'[\./]', '_', file_name)
    file_name = 'Server_List_' + file_name + '.csv'

    print('Save result to ' + file_name)
    df.to_csv(file_name)

    return df
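A minimal usage sketch of the function above; the server URL and standard_name list are only examples.

# Any tabledap ERDDAP endpoint and list of CF standard_names can be used here.
df_meta = get_standard_variables_and_metadata(
    'http://www.neracoos.org/erddap',
    ['sea_water_temperature', 'sea_water_practical_salinity'],
)
print(df_meta[['Dataset ID', 'cdm_data_type']].head())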