def load_data(self, year='2019'): self.dfs = {} for index, row in self.df.iterrows(): if (self.glider_id in row['Dataset ID']) and (year in row['Dataset ID']): print(row['Dataset ID']) try: e = ERDDAP( server=self.server_url, protocol='tabledap', response='csv', ) e.dataset_id = row['Dataset ID'] e.constraints = self.constraints e.variables = self.variables[row['Dataset ID']] except HTTPError: print('Failed to generate url {}'.format( row['Dataset ID'])) continue self.dfs.update({ row['Dataset ID']: e.to_pandas( index_col='time (UTC)', parse_dates=True, skiprows=(1, ) # units information can be dropped. ) }) return (self.dfs)
def active_drifters(bbox=None, time_start=None, time_end=None): bbox = bbox or [-100, -40, 18, 60] time_end = time_end or dt.date.today() time_start = time_start or (time_end - dt.timedelta(days=1)) t0 = time_start.strftime('%Y-%m-%dT%H:%M:%SZ') t1 = time_end.strftime('%Y-%m-%dT%H:%M:%SZ') e = ERDDAP(server='OSMC', protocol="tabledap") e.dataset_id = "gdp_interpolated_drifter" # Setting constraints e.constraints = { "time>=": t0, "time<=": t1, 'longitude>=': bbox[0], 'longitude<=': bbox[1], 'latitude>=': bbox[2], 'latitude<=': bbox[3], } # e.variables = [ # "WMO", # "latitude", # "longitude", # "time", # ] try: df = e.to_pandas() except ValueError: return pd.DataFrame() return df
def retrieve_variable_names_erddap_server(url_erddap, dataset_id):
    """
    Created on Tue Nov 3 11:26:05 2020
    @author: aristizabal

    This function retrieves the variable names from the IOOS and Rutgers
    ERDDAP glider servers.

    Inputs:
    url_erddap: url address of the erddap server
                Example: 'https://data.ioos.us/gliders/erddap'
    dataset_id: Example: 'ng231-20190901T0000'

    Outputs:
    variable_names: list of variables for the requested dataset_id
    """
    from erddapy import ERDDAP

    e = ERDDAP(server=url_erddap, protocol='tabledap', response='nc')
    e.dataset_id = dataset_id

    df = e.to_pandas()
    variable_names = [var for var in df.columns]
    print('List of available variables ')
    print(variable_names)

    return variable_names
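# Minimal usage sketch of retrieve_variable_names_erddap_server; the server
# URL and dataset ID below are the example values from the docstring, not
# part of the original code.
url_erddap = 'https://data.ioos.us/gliders/erddap'
dataset_id = 'ng231-20190901T0000'

variable_names = retrieve_variable_names_erddap_server(url_erddap, dataset_id)
# variable_names is a plain Python list, e.g. ['depth (m)', 'latitude (degrees_north)', ...]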
def active_argo_floats(bbox=None, time_start=None, time_end=None, floats=None):
    """
    :param bbox: list of [westernmost longitude, easternmost longitude,
                 southernmost latitude, northernmost latitude]
    :param time_start: time to start looking for floats
    :param time_end: time to end looking for floats
    :param floats: optional platform_number(s) to restrict the search to
    :return: pandas DataFrame of active Argo float observations
    """
    bbox = bbox or [-100, -45, 5, 46]
    time_end = time_end or dt.date.today()
    time_start = time_start or (time_end - dt.timedelta(days=1))
    floats = floats or False

    constraints = {
        'time>=': str(time_start),
        'time<=': str(time_end),
    }

    if bbox:
        constraints['longitude>='] = bbox[0]
        constraints['longitude<='] = bbox[1]
        constraints['latitude>='] = bbox[2]
        constraints['latitude<='] = bbox[3]

    if floats:
        constraints['platform_number='] = floats

    variables = [
        'platform_number',
        'time',
        'pres',
        'longitude',
        'latitude',
        'temp',
        'psal',
    ]

    e = ERDDAP(
        server='IFREMER',
        protocol='tabledap',
        response='nc'
    )

    e.dataset_id = 'ArgoFloats'
    e.constraints = constraints
    e.variables = variables

    try:
        df = e.to_pandas(
            parse_dates=['time (UTC)'],
            skiprows=(1,)  # units information can be dropped.
        ).dropna()
    except HTTPError:
        df = pd.DataFrame()

    return df
def get_erddap_dataset(ds_id, variables=None, constraints=None, filetype=None): """ Returns a netcdf dataset for a specified dataset ID (or dataframe if dataset cannot be converted to xarray) :param ds_id: dataset ID e.g. ng314-20200806T2040 :param variables: optional list of variables :param constraints: optional list of constraints :param filetype: optional filetype to return, 'nc' (default) or 'dataframe' :return: netcdf dataset """ variables = variables or None constraints = constraints or None filetype = filetype or 'nc' e = ERDDAP(server='NGDAC', protocol='tabledap', response='nc') e.dataset_id = ds_id if constraints: e.constraints = constraints if variables: e.variables = variables if filetype == 'nc': try: ds = e.to_xarray() ds = ds.sortby(ds.time) except OSError: print('No dataset available for specified constraints: {}'.format( ds_id)) ds = [] except TypeError: print('Cannot convert to xarray, providing dataframe: {}'.format( ds_id)) ds = e.to_pandas().dropna() elif filetype == 'dataframe': ds = e.to_pandas().dropna() else: print('Unrecognized filetype: {}. Needs to be "nc" or "dataframe"'. format(filetype)) return ds
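# Minimal usage sketch of get_erddap_dataset; the dataset ID is the example
# from the docstring, and the constraints and variable list are illustrative
# values only.
constraints = {'time>=': '2020-08-06T00:00:00Z', 'time<=': '2020-08-10T00:00:00Z'}
variables = ['time', 'latitude', 'longitude', 'temperature', 'salinity']

ds = get_erddap_dataset('ng314-20200806T2040', variables=variables,
                        constraints=constraints, filetype='nc')
# ds is an xarray Dataset sorted by time, or a pandas DataFrame / empty list
# when conversion to xarray fails.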
def check_dataset_empty(url_erddap, dataset_id, date_ini, date_end, lon_lim, lat_lim):
    from erddapy import ERDDAP

    constraints = {
        'time>=': date_ini,
        'time<=': date_end,
        'latitude>=': lat_lim[0],
        'latitude<=': lat_lim[1],
        'longitude>=': lon_lim[0],
        'longitude<=': lon_lim[1],
    }

    variable_names = [
        'depth',
        'latitude',
        'longitude',
        'time',
        'temperature',
        'salinity'
    ]

    e = ERDDAP(
        server=url_erddap,
        protocol='tabledap',
        response='nc'
    )

    e.dataset_id = dataset_id
    e.constraints = constraints
    e.variables = variable_names

    # Convert the glider data to a data frame and check that it has data
    df = e.to_pandas()
    if len(df) < 4:
        empty_dataset = True
    else:
        empty_dataset = False

    return empty_dataset
def get_erddap_data(dataset_id): ''' :param dataset_id: the deployment name example:'ce_311-20200708T1723' :return: pandas DataFrame with deployment variable values ''' e = ERDDAP( server='https://gliders.ioos.us/erddap', protocol='tabledap', ) e.response = 'csv' e.dataset_id = dataset_id e.variables = [ 'depth', 'latitude', 'longitude', 'salinity', 'temperature', 'conductivity', 'density', 'time', ] df = e.to_pandas() return df
def get_ndbc(bbox=None, time_start=None, time_end=None, buoy=None): bbox = bbox or [-100, -45, 5, 46] time_end = time_end or dt.date.today() time_start = time_start or (time_end - dt.timedelta(days=1)) buoy = buoy or False time_formatter = '%Y-%m-%dT%H:%M:%SZ' e = ERDDAP( server='CSWC', protocol='tabledap', response='csv' ) e.dataset_id = 'cwwcNDBCMet' e.constraints = { 'time>=': time_start.strftime(time_formatter), 'time<=': time_end.strftime(time_formatter), } if bbox: e.constraints['longitude>='] = bbox[0] e.constraints['longitude<='] = bbox[1] e.constraints['latitude>='] = bbox[2] e.constraints['latitude<='] = bbox[3] e.variables = [ "station", "latitude", "longitude", "time" ] if buoy: e.constraints['station='] = buoy df = e.to_pandas( parse_dates=['time (UTC)'], skiprows=(1,) # units information can be dropped. ).dropna() stations = df.station.unique() # e.variables = [ # "station", # "latitude", # "longitude", # "wd", # "wspd", # "gst", # "wvht", # "dpd", # "apd", # "mwd", # "bar", # "atmp", # "wtmp", # "dewp", # # "vis", # # "ptdy", # # "tide", # "wspu", # "wspv", # "time", # ] try: df = e.to_pandas( parse_dates=['time (UTC)'], skiprows=(1,) # units information can be dropped. ).dropna() except HTTPError: df = pd.DataFrame() return df
class GliderDataFetcher(object): """ Args: server: a glider ERDDAP server URL Attributes: dataset_id: a dataset unique id. constraints: download constraints, default None (opendap-like url) """ def __init__(self, server=_server): self.fetcher = ERDDAP( server=server, protocol="tabledap", ) if "ifremer" in self.fetcher.server: self.fetcher.variables = ifremer_vars else: self.fetcher.variables = [ "depth", "latitude", "longitude", "salinity", "temperature", "time", ] self.fetcher.dataset_id: OptionalStr = None def to_pandas(self): """ Fetches data from the server and reads into a pandas dataframe :return: pandas dataframe with datetime UTC as index """ return self.fetcher.to_pandas( index_col="time (UTC)", parse_dates=True, ) def query(self, min_lat, max_lat, min_lon, max_lon, start_time, end_time): """ Takes user supplied geographical and time constraints and adds them to the query :param min_lat: southernmost lat :param max_lat: northermost lat :param min_lon: westernmost lon (-180 to +180) :param max_lon: easternmost lon (-180 to +180) :param start_time: start time, can be datetime object or string :param end_time: end time, can be datetime object or string :return: search query with argument constraints applied """ self.fetcher.constraints = { "time>=": start_time, "time<=": end_time, "latitude>=": min_lat, "latitude<=": max_lat, "longitude>=": min_lon, "longitude<=": max_lon, } return self def platform(self, platform): """ :param platform: platform and deployment id from ifremer :return: search query with platform constraint applied """ self.fetcher.constraints["platform_deployment="] = platform return self
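# Minimal usage sketch of GliderDataFetcher, assuming the module-level
# _server default points at a glider ERDDAP server; the dataset ID, bounding
# box, and time window below are illustrative values, not part of the class.
fetcher = GliderDataFetcher()
fetcher.fetcher.dataset_id = 'whoi_406-20160902T1700'

# chain the geographic/temporal query, then download into a dataframe
df = fetcher.query(
    min_lat=35, max_lat=40,
    min_lon=-72, max_lon=-69,
    start_time='2016-09-02T17:00:00Z',
    end_time='2016-09-10T00:00:00Z',
).to_pandas()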
class WireWallMonitor: """A class to handle retrieval and plotting of WireWall data.""" window_time_column = "time (UTC)" event_time_column = "event time (UTC)" series_column = "wireID (Dmnless)" datetime_fields = ["time (UTC)", "gpsTime (UTC)", "timestamp (UTC)"] def __init__(self, erddap_server, constraints=None, protocol="tabledap", response="csv"): """Initialise based on given ERDDAP instance.""" self._erddap = ERDDAP( server=erddap_server, protocol=protocol, response=response, ) self._erddap.constraints = constraints or [] def _add_event_columns(self, df): """Add a new columns which apply to events.""" # calculate the event height with the baseline removed df["event depth preferred (cm)"] = df["elMEAN (cm)"] - df[ "MEDelMEAN (cm)"] df["event depth fallback (cm)"] = df["elPTILE_6 (cm)"] - df[ "MEDelPTILE_2 (cm)"] df[self.event_time_column] = df[self.window_time_column].copy() time_delta = df["sampleNUM (Dmnless)"] - df["sampleNUM10 (Dmnless)"] # events occur at ~400Hz time_delta /= 400 # we need this as an actual timedelta time_delta = time_delta.apply(pd.to_timedelta, unit="S") # check all events occur in the interval [0, 10] mins from the window start time if time_delta.max() > pd.to_timedelta("10m"): warn( "Data has an event that occurs after the 10min sample window.", UserWarning, ) if time_delta.min() < pd.to_timedelta("0m"): warn( "Data has an event that occurs before the 10min sample window.", UserWarning, ) df[self.event_time_column] += time_delta def _get_dataframe(self, dataset_id): """Retrieve a dataframe for the given dataset_id.""" self._erddap.dataset_id = dataset_id df = self._erddap.to_pandas(parse_dates=self.datetime_fields) df[self.series_column] = df[self.series_column].astype(str) self._add_event_columns(df) self._erddap.dataset_id = None return df def _plot_dataframe(self, df, x, y): """Plot the given columns of the dataframe.""" fig = px.scatter(df, x=x, y=y, color=self.series_column) fig.update_layout( yaxis_title=y, xaxis=_XAXIS_FORMAT, margin=_MARGIN_FORMAT, ) return fig def _plot_window_variables(self, df, column_names, column_names_secondary): """Plot variables that are constant over a window timespan.""" # since these variables are constant over a given window, for a given wire # we can remove any rows which are duplicated df = df.drop_duplicates([self.window_time_column, self.series_column], keep="first") figs = [None] * len(column_names) # use a loop so we can update each fig for i, (name, name_secondary) in enumerate( zip(column_names, column_names_secondary)): subfig1 = self._plot_dataframe( df, x=self.window_time_column, y=name, ) yaxis_title = name if name_secondary is None: fig = subfig1 else: fig = make_subplots() # get the units from the columns units = {s.split(" ")[1] for s in [name, name_secondary]} yaxis_title = "value " + " or ".join(units) subfig2 = self._plot_dataframe( df, x=self.window_time_column, y=name_secondary, ) # since this plot now has two series, rename them both subfig2.for_each_trace(lambda trace: trace.update( name=f"Wire {trace.name} {name_secondary}")) subfig1.for_each_trace(lambda trace: trace.update( name=f"Wire {trace.name} {name}")) # distinguish the second trace from the first subfig2.update_traces( marker_symbol="square", line_dash="dot", ) # combine the traces into one figure fig.add_traces(subfig1.data + subfig2.data) fig.update_traces(mode="lines+markers", selector=dict(type="scatter")) fig.update_layout( yaxis_title=yaxis_title, xaxis_title=self.window_time_column, xaxis=_XAXIS_FORMAT, margin=_MARGIN_FORMAT, ) 
figs[i] = fig return figs def _plot_event_variables(self, df, column_names): """Plot event variables.""" # some windows don't have any events and so there may be rows without any # sample num value. We are only interested in actual events, so remove them df = df.dropna(subset=[self.event_time_column]).copy() figs = [None] * len(column_names) # use a loop so we can update each fig for i, name in enumerate(column_names): figs[i] = self._plot_dataframe(df, x=self.event_time_column, y=name) return figs def plot_variables( self, dataset_id, window_variables=None, window_variables_secondary=None, event_variables=None, ): """Plot all the window and event variables for a given dataset. Args: dataset_id (str): the string identifier for the ERDDAP dataset. window_variables (list): a list of variable names (including units) which are constant over each window, to be plotted. event_variables (list): a list of variable names (including units) which are specific to each event, to be plotted. Returns: a list of figures generated, and calls .show() on all of them. """ window_variables = window_variables or [] window_variables_secondary = window_variables_secondary or [ None ] * len(window_variables) event_variables = event_variables or [] df = self._get_dataframe(dataset_id) window_figs = self._plot_window_variables(df, window_variables, window_variables_secondary) event_figs = self._plot_event_variables(df, event_variables) figs = [*window_figs, *event_figs] for fig in figs: fig.show() return figs
protocol='tabledap', response='mat', ) print(e.get_download_url()) # # Obtaining the data # # There are a few methods to obtain the data with *to_pandas()* and *to_xarray()*: # In[3]: df = e.to_pandas( index_col='time', parse_dates=True, skiprows=(1,) # units information can be dropped. ).dropna() # In[4]: df.head() # # Let's plot the data # # Exploring an ERDDAP server # In[5]:
'WindSpeed', 'WaveHeight', 'WavePeriod', 'MeanWaveDirection', # 'Hmax', # 'AirTemperature', 'SeaTemperature' ] url = e.get_download_url() print(url) df = e.to_pandas( index_col='time (UTC)', parse_dates=True).dropna() df.shape df.columns cols = ['AtmosphericPressure', 'WindDirection', 'WindSpeed', 'WaveHeight', 'WavePeriod', 'MeanWaveDirection', 'SeaTemperature'] # rename columns df.columns = cols df['Year'] = df.index.year df['Month'] = df.index.month df['Day'] = df.index.day
iframe = '<iframe src="{src}" width="100%" height="950"></iframe>'.format return HTML(iframe(src=src)) show_iframe(e.get_download_url(response="html")) Additionally, the object has `.get_info_url()` and `.get_search_url()` that can be used to obtain the info and search URLs respectively show_iframe(e.get_info_url(response="html")) show_iframe(e.get_search_url(response="html")) `erddapy` also brings some simple methods to download the data in some common data formats, like `pandas.DataFrame` and `xarray.Dataset`. df = e.to_pandas(index_col="time (UTC)", parse_dates=True,).dropna() df.head() ds = e.to_xarray(decode_times=False) ds["temperature"] Here is a simple plot using the data from `xarray`. %matplotlib inline import matplotlib.dates as mdates import matplotlib.pyplot as plt
e = ERDDAP( server="http://erddap.aoos.org/erddap/", protocol="tabledap" ) e.dataset_id = "kotzebue-alaska-water-level" e.constraints = { "time>=": "2018-09-05T21:00:00Z", "time<=": "2019-07-10T19:00:00Z", } e.variables = [ variable_name, "time", "z", ] data = e.to_pandas( index_col="time (UTC)", parse_dates=True, ) data["timestamp"] = data.index.astype("int64") // 1e9 data.to_csv(fname) data.head() from ioos_qc.config import QcConfig qc = QcConfig(qc_config) qc_results = qc.run( inp=data["sea_surface_height_above_sea_level_geoid_mhhw (m)"], tinp=data["timestamp"], zinp=data["z (m)"],
def read_glider_variables_erddap_server(url_erddap, dataset_id,
                                        lat_lim, lon_lim,
                                        variable_names=['time'],
                                        **kwargs):
    """
    Created on Tue Nov 3 11:26:05 2020
    @author: aristizabal

    This function reads glider variables from the IOOS and Rutgers ERDDAP
    glider servers.

    Inputs:
    url_erddap: url address of the erddap server
                Example: 'https://data.ioos.us/gliders/erddap'
    dataset_id: Example: 'ng231-20190901T0000'
    variable_names: list of variable names.
                    Example:
                        variable_names = ['depth',
                                          'latitude',
                                          'longitude',
                                          'time',
                                          'temperature',
                                          'salinity']
                    The default value is variable_names=['time']
    lat_lim: latitude limits for the search.
             Example: lat_lim = [38.0, 40.0]
    lon_lim: longitude limits for the search.
             Example: lon_lim = [-75.0, -72.0]
    date_ini: initial date of the time window. This function accepts the
              date formats '%Y-%m-%dT%H:%M:%SZ' and '%Y/%m/%d/%H'.
              Example: date_ini = '2018-08-02T00:00:00Z' or '2018/08/02/00'
    date_end: final date of the time window. Same formats as date_ini.
              Example: date_end = '2018-08-10T00:00:00Z' or '2018/08/10/00'

    Outputs:
    df: Pandas data frame with all the requested variables as vectors
    """
    from erddapy import ERDDAP
    import numpy as np

    date_ini = kwargs.get('date_ini', None)
    date_end = kwargs.get('date_end', None)

    # Find time window of interest
    if np.logical_or(date_ini == None, date_end == None):
        constraints = {
            'latitude>=': lat_lim[0],
            'latitude<=': lat_lim[1],
            'longitude>=': lon_lim[0],
            'longitude<=': lon_lim[1],
        }
    else:
        constraints = {
            'time>=': date_ini,
            'time<=': date_end,
            'latitude>=': lat_lim[0],
            'latitude<=': lat_lim[1],
            'longitude>=': lon_lim[0],
            'longitude<=': lon_lim[1],
        }

    e = ERDDAP(server=url_erddap, protocol='tabledap', response='nc')
    e.dataset_id = dataset_id
    e.constraints = constraints
    e.variables = variable_names

    # Convert the glider data to a data frame and check that it has data
    df = e.to_pandas()
    if len(df) > 3:
        df = e.to_pandas(parse_dates=True)

    return df
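# Minimal usage sketch of read_glider_variables_erddap_server, built from the
# example values given in the docstring above; they are illustrative only.
df = read_glider_variables_erddap_server(
    'https://data.ioos.us/gliders/erddap',
    'ng231-20190901T0000',
    [38.0, 40.0],
    [-75.0, -72.0],
    variable_names=['depth', 'latitude', 'longitude', 'time',
                    'temperature', 'salinity'],
    date_ini='2018-08-02T00:00:00Z',
    date_end='2018-08-10T00:00:00Z',
)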
'longitude>=': lon_lim[0], 'longitude<=': lon_lim[-1], } variables = ['time', 'latitude', 'longitude'] #%% e = ERDDAP(server=server, protocol='tabledap', response='nc') for id in gliders: e.dataset_id = id e.constraints = constraints e.variables = variables df = e.to_pandas(parse_dates=True) print(id, df.index[-1]) #%% Reading bathymetry data ncbath = xr.open_dataset(bath_file) bath_lat = ncbath.variables['lat'][:] bath_lon = ncbath.variables['lon'][:] bath_elev = ncbath.variables['elevation'][:] oklatbath = np.logical_and(bath_lat >= lat_lim[0], bath_lat <= lat_lim[-1]) oklonbath = np.logical_and(bath_lon >= lon_lim[0], bath_lon <= lon_lim[-1]) bath_latsub = bath_lat[oklatbath] bath_lonsub = bath_lon[oklonbath]
    'time>=': str(x.tm_year) + "-" + str(x.tm_mon).zfill(2) + "-" + str(x.tm_mday).zfill(2) + "T" + str(x.tm_hour).zfill(2) + ":00:00Z",
    'longitude>=': -80.0,
    'longitude<=': 80.0,
    'platform_type=': "DRIFTING BUOYS (GENERIC)",
    'platform_code=': str(wmo_mb[i]),
}

e.variables = [
    'platform_code',
    'time',
    'latitude',
    'longitude',
    'sst',
    'slp',
]

try:
    df = e.to_pandas()
except:
    print("No data for WMO " + str(wmo_mb[i]))

try:
    df.columns = ['id', 'tempo', 'lat', 'lon', 'sst', 'pres']
    df.id = sat[i]
    # pd.datetime was removed from pandas; parse the timestamps directly
    df['tempo'] = pd.to_datetime(df['tempo'], format='%Y-%m-%dT%H:%M:%SZ')
    df = df.set_index('tempo')
    df['lat'] = df.lat.round(4)
class NDBC(): def __init__(self, station_id, deploy_id, WMO, currentTime, startTime, data_map, name_map): self.station_id = station_id self.deploy_id = deploy_id self.WMO = WMO self.now = currentTime self.startTime = startTime self.data_map = data_map self.name_map = name_map def adjust_pressure_to_sea_level(self, pres, temp, height): """Adjust barometric presure to sea-level.""" temp = temp + 273.15 slp = pres / np.exp(-height / (temp * 29.263)) return slp def calculate_wind_speed(self, eastward, northward): """Calculate absolute wind speed from component wind vector.""" u = np.square(eastward) v = np.square(northward) wind_speed = np.sqrt(u + v) return wind_speed def calculate_wind_direction(self, eastward, northward): """Calculate met wind direction from component wind vectors.""" u = eastward v = northward wind_direction = 180 / np.pi * np.arctan2(-u, -v) return wind_direction def _connect_erddap(self, server="http://ooivm1.whoi.net/erddap", protocol="tabledap"): """Connect to the erddap server.""" self._erddap = ERDDAP(server=server, protocol=protocol) def list_datasets(self): """Get the available datasets for the ERDDAP server.""" # First, make the connection self._connect_erddap() # Next, get the datasets datasets = pd.read_csv( self._erddap.get_search_url(search_for=self.station_id, response='csv'))['Dataset ID'] return datasets def get_dataset(self, dataset): """Get the data for specified datasets.""" # First, have to re-establish the erddap connection self._connect_erddap() # Next, get the data for a dataset self._erddap.dataset_id = dataset # Only want the variables with standard names variables = self._erddap.get_var_by_attr( standard_name=lambda v: v is not None) self._erddap.variables = variables # Limit the data request to the current deployment self._erddap.constraints = { 'deploy_id=': self.deploy_id, 'time>=': self.startTime.strftime('%Y-%m-%dT%H:%M:%SZ') } try: # Download the data data = self._erddap.to_pandas(index_col='time (UTC)', parse_dates=True) # Sometimes it just returns an empty dataframe instead of an error if data.size == 0: data = self._create_empty_dataset() except: # If there is no available data in the requested time window, need # to create an empty dataframe of the data data = self._create_empty_dataset() # Return the dataset data return data def process_METBK_data(self, df, freq='10T'): """Process the METBK into the correct format and values for NDBC.""" # Resample the data df_binned = df.resample(freq).mean() # Check that barometric pressure if 'barometric_pressure (mbar)' in df_binned.columns: # Adjust the barometric pressure to sea-level df_binned[ 'sea_level_pressure (hPa)'] = self.adjust_pressure_to_sea_level( df_binned['barometric_pressure (mbar)'], df_binned['air_temperature (degree_Celsius)'], 4.05) else: df_binned['sea_level_pressure (hPa)'] = np.nan # Check that the wind vector components are in the dataframe if 'eastward_wind_velocity (m s-1)' in df_binned.columns: # Calculate the wind speed df_binned['wind speed (m/s)'] = self.calculate_wind_speed( df_binned['eastward_wind_velocity (m s-1)'], df_binned['northward_wind_velocity (m s-1)']) # Calculate the wind direction df_binned['wind direction'] = self.calculate_wind_direction( df_binned['eastward_wind_velocity (m s-1)'], df_binned['northward_wind_velocity (m s-1)']) df_binned['wind direction'] = df_binned["wind direction"].apply( lambda x: x + 360 if x < 0 else x) # Don't need cardinal direction -> want direction in degrees # df_binned["wind direction"] = df_binned["wind direction"].apply( 
# lambda x: self.get_cardinal_direction(np.round(x, decimals=2))) else: df_binned['wind speed (m/s)'] = np.nan df_binned['wind direction'] = np.nan # Return the processed data return df_binned def process_WAVSS_data(self, df, freq='10T'): """Much simpler function for processing the WAVSS data.""" # Resample the data df_binned = df.resample(freq).mean() # Return the data return df_binned def _create_empty_dataset(self): """ Create a dataset of all nans if there is no data available for the requested dataset in the given time period. """ # Get the units for the corresponding variables info_url = self._erddap.get_info_url( dataset_id=self._erddap.dataset_id, response='csv') info = pd.read_csv(info_url) units = info[info['Attribute Name'] == 'units'] # Now, add the units to the variable names columns = [] for var in self._erddap.variables: unit = units[units['Variable Name'] == var]['Value'].values if len(unit) == 0: columns.append(f'{var}') elif var == 'time': pass else: columns.append(f'{var} ({unit[0]})') # Create an array of nans to fill out the empty dataframe empty_array = np.empty((2, len(columns))) empty_array[:] = np.nan # Put the empty array into a dataframe empty_df = pd.DataFrame(data=empty_array, columns=columns, index=[self.startTime, self.now]) empty_df.index.name = 'time (UTC)' return empty_df def process_datasets(self, datasets): """Process the data for individual datasets.""" self.datasets = datasets # Get the data for the individual datasets for dset in self.datasets.keys(): self.datasets.update({dset: self.get_dataset(dset)}) # Process the data for dset in self.datasets.keys(): if 'METBK' in dset: self.datasets[dset] = self.process_METBK_data( self.datasets[dset]) else: self.datasets[dset] = self.process_WAVSS_data( self.datasets[dset]) # Add a header to the data in the datasets for key in self.datasets.keys(): header = key.split('-', 2)[-1] for col in self.datasets.get(key).columns: self.datasets.get(key).rename( columns={col: ' '.join((header, col))}, inplace=True) def parse_data_to_xml(self, data): """ Function which takes in the 10-minute average buoy data, the station name, and two dictionaries which map the buoy column names to the xml tags, and outputs an xml file in the NDBC format. 
Returns: xml - a properly constructed xml file in the NDBC format for the given buoy data """ # Start the xml file xml = ['<?xml version="1.0" encoding="ISO-8859-1"?>'] # Iterate through the data for index in data.index: # Get the data associated with a row in the dataframe row = data.loc[index] # Reset a dictionary of the data xml_data = {} for key in self.data_map.keys(): xml_data.update({key: self.data_map.get(key)}) # Parse the data into the data dictionary for key in xml_data.keys(): # Get the column name which corresponds to the ndbc tag column = self.name_map.get(key) # Check that the column was returned from the ERDDAP server if column in row.index: value = row[column] # If a nan, just leave it the default -9999 if str(value) == 'nan': pass else: xml_data[key] = value # If no data, leave it as default -9999 else: pass # Write the parsed data to the xml file # Start the message xml.append('<message>') # Add in the station id xml.append(f' <station>{self.WMO}</station>') # Get the time index time = row.name.strftime('%m/%d/%Y %H:%M:%S') xml.append(f' <date>{time}</date>') # Missing fill value missing = str(-9999) xml.append(f' <missing>{missing}</missing>') # Roundtime xml.append(' <roundtime>no</roundtime>') # Start of the data xml.append(' <met>') # Add in each data piece for tag in xml_data.keys(): # Get the value value = xml_data.get(tag) value = str(value) # Add the data to the xml file xml.append(f' <{tag}>{value}</{tag}>') # Finish off the message xml.append(' </met>') xml.append('</message>') # Return the results return xml
from datetime import date

from erddapy import ERDDAP

server = "http://osmc.noaa.gov/erddap"
e = ERDDAP(server=server, protocol="tabledap")

e.dataset_id = "ioos_obs_counts"
e.variables = ["time", "locationID", "region", "sponsor", "met", "wave"]
e.constraints = {
    "time>=": "2019-09",
    "time<": "2020-11",
}

df = e.to_pandas(parse_dates=True)

df["locationID"] = df["locationID"].str.lower()

df.tail()

The table holds all of the ingested data for the requested time window. We can now explore it by grouping the data by IOOS Regional Association (RA).

groups = df.groupby("region")

ax = groups.sum().plot(kind="bar", figsize=(11, 3.75))
ax.yaxis.get_major_formatter().set_scientific(False)
ax.set_ylabel("# observations");

Let us check the monthly sums of the released data, both for the individual met and wave counts and for the totals; a minimal sketch of that grouping follows.
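# A sketch of the monthly met/wave sums, assuming pandas is imported as pd
# and that the ERDDAP CSV columns come back as "time (UTC)", "met" and
# "wave"; the exact labels depend on the dataset's unit suffixes, so adjust
# as needed.
monthly = (
    df.assign(month=pd.to_datetime(df["time (UTC)"]).dt.to_period("M"))
      .groupby("month")[["met", "wave"]]
      .sum()
)
monthly["total"] = monthly.sum(axis=1)
monthly.plot(kind="bar", figsize=(11, 3.75));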
def active_gliders(bbox=None, time_start=None, time_end=dt.date.today(), glider_id=None): bbox = bbox or [-100, -40, 18, 60] time_start = time_start or (time_end - dt.timedelta(days=1)) t0 = time_start.strftime('%Y-%m-%dT%H:%M:%SZ') t1 = time_end.strftime('%Y-%m-%dT%H:%M:%SZ') glider_id = glider_id or None e = ERDDAP(server='NGDAC') # Grab every dataset available # datasets = pd.read_csv(e.get_search_url(response='csv', search_for='all')) # Search constraints kw = dict() kw['min_time'] = t0 kw['max_time'] = t1 if bbox: kw['min_lon'] = bbox[0] kw['max_lon'] = bbox[1] kw['min_lat'] = bbox[2] kw['max_lat'] = bbox[3] if glider_id: search = glider_id else: search = None search_url = e.get_search_url(search_for=search, response='csv', **kw) try: # Grab the results search = pd.read_csv(search_url) except: # return empty dataframe if there are no results return pd.DataFrame() # Extract the IDs gliders = search['Dataset ID'].values msg = 'Found {} Glider Datasets:\n\n{}'.format print(msg(len(gliders), '\n'.join(gliders))) # Setting constraints constraints = { 'time>=': t0, 'time<=': t1, 'longitude>=': bbox[0], 'longitude<=': bbox[1], 'latitude>=': bbox[2], 'latitude<=': bbox[3], } variables = [ 'depth', 'latitude', 'longitude', 'time', 'temperature', 'salinity', ] e = ERDDAP( server='NGDAC', protocol='tabledap', response='nc' ) glider_dfs = [] for id in gliders: # print('Reading ' + id) e.dataset_id = id e.constraints = constraints e.variables = variables # checking data frame is not empty try: df = e.to_pandas( index_col='time (UTC)', parse_dates=True, skiprows=(1,) # units information can be dropped. ).dropna() except: continue df = df.reset_index() df['dataset_id'] = id df = df.set_index(['dataset_id', 'time (UTC)']) glider_dfs.append(df) try: ndf = pd.concat(glider_dfs) except ValueError: return pd.DataFrame() return ndf
def GOFS_RTOFS_vs_Argo_floats(lon_forec_track, lat_forec_track, lon_forec_cone, lat_forec_cone, lon_best_track, lat_best_track, lon_lim, lat_lim, folder_fig): #%% User input #GOFS3.1 output model location url_GOFS_ts = 'http://tds.hycom.org/thredds/dodsC/GLBy0.08/expt_93.0/ts3z' # RTOFS files folder_RTOFS = '/home/coolgroup/RTOFS/forecasts/domains/hurricanes/RTOFS_6hourly_North_Atlantic/' nc_files_RTOFS = ['rtofs_glo_3dz_f006_6hrly_hvr_US_east.nc',\ 'rtofs_glo_3dz_f012_6hrly_hvr_US_east.nc',\ 'rtofs_glo_3dz_f018_6hrly_hvr_US_east.nc',\ 'rtofs_glo_3dz_f024_6hrly_hvr_US_east.nc'] # COPERNICUS MARINE ENVIRONMENT MONITORING SERVICE (CMEMS) url_cmems = 'http://nrt.cmems-du.eu/motu-web/Motu' service_id = 'GLOBAL_ANALYSIS_FORECAST_PHY_001_024-TDS' product_id = 'global-analysis-forecast-phy-001-024' depth_min = '0.493' out_dir = '/home/aristizabal/crontab_jobs' # Bathymetry file #bath_file = '/Users/aristizabal/Desktop/MARACOOS_project/Maria_scripts/nc_files/GEBCO_2014_2D_-100.0_0.0_-60.0_45.0.nc' bath_file = '/home/aristizabal/bathymetry_files/GEBCO_2014_2D_-100.0_0.0_-10.0_50.0.nc' # Argo floats url_Argo = 'http://www.ifremer.fr/erddap' #%% from matplotlib import pyplot as plt import numpy as np import xarray as xr import netCDF4 from datetime import datetime, timedelta import cmocean import matplotlib.dates as mdates from erddapy import ERDDAP import pandas as pd import os # Do not produce figures on screen plt.switch_backend('agg') # Increase fontsize of labels globally plt.rc('xtick', labelsize=14) plt.rc('ytick', labelsize=14) plt.rc('legend', fontsize=14) #%% Reading bathymetry data ncbath = xr.open_dataset(bath_file) bath_lat = ncbath.variables['lat'][:] bath_lon = ncbath.variables['lon'][:] bath_elev = ncbath.variables['elevation'][:] oklatbath = np.logical_and(bath_lat >= lat_lim[0], bath_lat <= lat_lim[-1]) oklonbath = np.logical_and(bath_lon >= lon_lim[0], bath_lon <= lon_lim[-1]) bath_latsub = bath_lat[oklatbath] bath_lonsub = bath_lon[oklonbath] bath_elevs = bath_elev[oklatbath, :] bath_elevsub = bath_elevs[:, oklonbath] #%% Get time bounds for current day #ti = datetime.today() ti = datetime.today() - timedelta(1) - timedelta(hours=6) tini = datetime(ti.year, ti.month, ti.day) te = ti + timedelta(2) tend = datetime(te.year, te.month, te.day) #%% Look for Argo datasets e = ERDDAP(server=url_Argo) # Grab every dataset available #datasets = pd.read_csv(e.get_search_url(response='csv', search_for='all')) kw = { 'min_lon': lon_lim[0], 'max_lon': lon_lim[1], 'min_lat': lat_lim[0], 'max_lat': lat_lim[1], 'min_time': str(tini), 'max_time': str(tend), } search_url = e.get_search_url(response='csv', **kw) # Grab the results search = pd.read_csv(search_url) # Extract the IDs dataset = search['Dataset ID'].values msg = 'Found {} Datasets:\n\n{}'.format print(msg(len(dataset), '\n'.join(dataset))) dataset_type = dataset[0] constraints = { 'time>=': str(tini), 'time<=': str(tend), 'latitude>=': lat_lim[0], 'latitude<=': lat_lim[1], 'longitude>=': lon_lim[0], 'longitude<=': lon_lim[1], } variables = [ 'platform_number', 'time', 'pres', 'longitude', 'latitude', 'temp', 'psal', ] e = ERDDAP(server=url_Argo, protocol='tabledap', response='nc') e.dataset_id = dataset_type e.constraints = constraints e.variables = variables print(e.get_download_url()) df = e.to_pandas( parse_dates=True, skiprows=(1, ) # units information can be dropped. 
).dropna() argo_ids = np.asarray(df['platform_number']) argo_times = np.asarray(df['time (UTC)']) argo_press = np.asarray(df['pres (decibar)']) argo_lons = np.asarray(df['longitude (degrees_east)']) argo_lats = np.asarray(df['latitude (degrees_north)']) argo_temps = np.asarray(df['temp (degree_Celsius)']) argo_salts = np.asarray(df['psal (PSU)']) #%% GOGF 3.1 try: GOFS_ts = xr.open_dataset(url_GOFS_ts, decode_times=False) lt_GOFS = np.asarray(GOFS_ts['lat'][:]) ln_GOFS = np.asarray(GOFS_ts['lon'][:]) tt = GOFS_ts['time'] t_GOFS = netCDF4.num2date(tt[:], tt.units) depth_GOFS = np.asarray(GOFS_ts['depth'][:]) except Exception as err: print(err) GOFS_ts = np.nan lt_GOFS = np.nan ln_GOFS = np.nan depth_GOFS = np.nan t_GOFS = ti #%% Map Argo floats lev = np.arange(-9000, 9100, 100) plt.figure() plt.contourf(bath_lonsub, bath_latsub, bath_elevsub, lev, cmap=cmocean.cm.topo) plt.plot(lon_forec_track, lat_forec_track, '.-', color='gold') plt.plot(lon_forec_cone, lat_forec_cone, '.-b', markersize=1) plt.plot(lon_best_track, lat_best_track, 'or', markersize=3) argo_idd = np.unique(argo_ids) for i, id in enumerate(argo_idd): okind = np.where(argo_ids == id)[0] plt.plot(np.unique(argo_lons[okind]), np.unique(argo_lats[okind]), 's', color='darkorange', markersize=5, markeredgecolor='k') plt.title('Argo Floats ' + str(tini)[0:13] + '-' + str(tend)[0:13], fontsize=16) plt.axis('scaled') plt.xlim(lon_lim[0], lon_lim[1]) plt.ylim(lat_lim[0], lat_lim[1]) file = folder_fig + 'ARGO_lat_lon' #file = folder_fig + 'ARGO_lat_lon_' + str(np.unique(argo_times)[0])[0:10] plt.savefig(file, bbox_inches='tight', pad_inches=0.1) #%% Figure argo float vs GOFS and vs RTOFS argo_idd = np.unique(argo_ids) for i, id in enumerate(argo_idd): print(id) okind = np.where(argo_ids == id)[0] argo_time = np.asarray([ datetime.strptime(t, '%Y-%m-%dT%H:%M:%SZ') for t in argo_times[okind] ]) argo_lon = argo_lons[okind] argo_lat = argo_lats[okind] argo_pres = argo_press[okind] argo_temp = argo_temps[okind] argo_salt = argo_salts[okind] # GOFS print('Retrieving variables from GOFS') if isinstance(GOFS_ts, float): temp_GOFS = np.nan salt_GOFS = np.nan else: #oktt_GOFS = np.where(t_GOFS >= argo_time[0])[0][0] ttGOFS = np.asarray([ datetime(t_GOFS[i].year, t_GOFS[i].month, t_GOFS[i].day, t_GOFS[i].hour) for i in np.arange(len(t_GOFS)) ]) tstamp_GOFS = [ mdates.date2num(ttGOFS[i]) for i in np.arange(len(ttGOFS)) ] oktt_GOFS = np.unique( np.round( np.interp(mdates.date2num(argo_time[0]), tstamp_GOFS, np.arange(len(tstamp_GOFS)))).astype(int))[0] oklat_GOFS = np.where(lt_GOFS >= argo_lat[0])[0][0] oklon_GOFS = np.where(ln_GOFS >= argo_lon[0] + 360)[0][0] temp_GOFS = np.asarray(GOFS_ts['water_temp'][oktt_GOFS, :, oklat_GOFS, oklon_GOFS]) salt_GOFS = np.asarray(GOFS_ts['salinity'][oktt_GOFS, :, oklat_GOFS, oklon_GOFS]) # RTOFS #Time window year = int(argo_time[0].year) month = int(argo_time[0].month) day = int(argo_time[0].day) tini = datetime(year, month, day) tend = tini + timedelta(days=1) # Read RTOFS grid and time print('Retrieving coordinates from RTOFS') if tini.month < 10: if tini.day < 10: fol = 'rtofs.' + str(tini.year) + '0' + str( tini.month) + '0' + str(tini.day) else: fol = 'rtofs.' + str(tini.year) + '0' + str(tini.month) + str( tini.day) else: if tini.day < 10: fol = 'rtofs.' + str(tini.year) + str(tini.month) + '0' + str( tini.day) else: fol = 'rtofs.' 
+ str(tini.year) + str(tini.month) + str( tini.day) ncRTOFS = xr.open_dataset(folder_RTOFS + fol + '/' + nc_files_RTOFS[0]) latRTOFS = np.asarray(ncRTOFS.Latitude[:]) lonRTOFS = np.asarray(ncRTOFS.Longitude[:]) depth_RTOFS = np.asarray(ncRTOFS.Depth[:]) tRTOFS = [] for t in np.arange(len(nc_files_RTOFS)): ncRTOFS = xr.open_dataset(folder_RTOFS + fol + '/' + nc_files_RTOFS[t]) tRTOFS.append(np.asarray(ncRTOFS.MT[:])[0]) tRTOFS = np.asarray([mdates.num2date(mdates.date2num(tRTOFS[t])) \ for t in np.arange(len(nc_files_RTOFS))]) oktt_RTOFS = np.where( mdates.date2num(tRTOFS) >= mdates.date2num(argo_time[0]))[0][0] oklat_RTOFS = np.where(latRTOFS[:, 0] >= argo_lat[0])[0][0] oklon_RTOFS = np.where(lonRTOFS[0, :] >= argo_lon[0])[0][0] nc_file = folder_RTOFS + fol + '/' + nc_files_RTOFS[oktt_RTOFS] ncRTOFS = xr.open_dataset(nc_file) #time_RTOFS = tRTOFS[oktt_RTOFS] temp_RTOFS = np.asarray(ncRTOFS.variables['temperature'][0, :, oklat_RTOFS, oklon_RTOFS]) salt_RTOFS = np.asarray(ncRTOFS.variables['salinity'][0, :, oklat_RTOFS, oklon_RTOFS]) #lon_RTOFS = lonRTOFS[0,oklon_RTOFS] #lat_RTOFS = latRTOFS[oklat_RTOFS,0] # Downloading and reading Copernicus output motuc = 'python -m motuclient --motu ' + url_cmems + \ ' --service-id ' + service_id + \ ' --product-id ' + product_id + \ ' --longitude-min ' + str(argo_lon[0]-2/12) + \ ' --longitude-max ' + str(argo_lon[0]+2/12) + \ ' --latitude-min ' + str(argo_lat[0]-2/12) + \ ' --latitude-max ' + str(argo_lat[0]+2/12) + \ ' --date-min ' + '"' + str(tini-timedelta(0.5)) + '"' + \ ' --date-max ' + '"' + str(tend+timedelta(0.5)) + '"' + \ ' --depth-min ' + depth_min + \ ' --depth-max ' + str(np.nanmax(argo_pres)+1000) + \ ' --variable ' + 'thetao' + ' ' + \ ' --variable ' + 'so' + ' ' + \ ' --out-dir ' + out_dir + \ ' --out-name ' + str(id) + '.nc' + ' ' + \ ' --user ' + 'maristizabalvar' + ' ' + \ ' --pwd ' + 'MariaCMEMS2018' os.system(motuc) # Check if file was downloaded COP_file = out_dir + '/' + str(id) + '.nc' # Check if file was downloaded resp = os.system('ls ' + out_dir + '/' + str(id) + '.nc') if resp == 0: COP = xr.open_dataset(COP_file) latCOP = np.asarray(COP.latitude[:]) lonCOP = np.asarray(COP.longitude[:]) depth_COP = np.asarray(COP.depth[:]) tCOP = np.asarray(mdates.num2date(mdates.date2num(COP.time[:]))) else: latCOP = np.empty(1) latCOP[:] = np.nan lonCOP = np.empty(1) lonCOP[:] = np.nan tCOP = np.empty(1) tCOP[:] = np.nan oktimeCOP = np.where( mdates.date2num(tCOP) >= mdates.date2num(tini))[0][0] oklonCOP = np.where(lonCOP >= argo_lon[0])[0][0] oklatCOP = np.where(latCOP >= argo_lat[0])[0][0] temp_COP = np.asarray(COP.variables['thetao'][oktimeCOP, :, oklatCOP, oklonCOP]) salt_COP = np.asarray(COP.variables['so'][oktimeCOP, :, oklatCOP, oklonCOP]) # Figure temp plt.figure(figsize=(5, 6)) plt.plot(argo_temp, -argo_pres, '.-', linewidth=2, label='ARGO Float id ' + str(id)) plt.plot(temp_GOFS, -depth_GOFS, '.-', linewidth=2, label='GOFS 3.1', color='red') plt.plot(temp_RTOFS, -depth_RTOFS, '.-', linewidth=2, label='RTOFS', color='g') plt.plot(temp_COP, -depth_COP, '.-', linewidth=2, label='Copernicus', color='darkorchid') plt.ylim([-1000, 0]) plt.title('Temperature Profile on '+ str(argo_time[0])[0:13] + '\n [lon,lat] = [' \ + str(np.round(argo_lon[0],3)) +',' +\ str(np.round(argo_lat[0],3))+']',\ fontsize=16) plt.ylabel('Depth (m)', fontsize=14) plt.xlabel('$^oC$', fontsize=14) plt.legend(loc='lower right', fontsize=14) file = folder_fig + 'ARGO_vs_GOFS_RTOFS_COP_temp_' + str(id) plt.savefig(file, bbox_inches='tight', pad_inches=0.1) # Figure 
salt plt.figure(figsize=(5, 6)) plt.plot(argo_salt, -argo_pres, '.-', linewidth=2, label='ARGO Float id ' + str(id)) plt.plot(salt_GOFS, -depth_GOFS, '.-', linewidth=2, label='GOFS 3.1', color='red') plt.plot(salt_RTOFS, -depth_RTOFS, '.-', linewidth=2, label='RTOFS', color='g') plt.plot(salt_COP, -depth_COP, '.-', linewidth=2, label='Copernicus', color='darkorchid') plt.ylim([-1000, 0]) plt.title('Salinity Profile on '+ str(argo_time[0])[0:13] + '\n [lon,lat] = [' \ + str(np.round(argo_lon[0],3)) +',' +\ str(np.round(argo_lat[0],3))+']',\ fontsize=16) plt.ylabel('Depth (m)', fontsize=14) plt.legend(loc='lower right', fontsize=14) file = folder_fig + 'ARGO_vs_GOFS_RTOFS_COP_salt_' + str(id) plt.savefig(file, bbox_inches='tight', pad_inches=0.1)
info_df[info_df['Row Type'] == 'variable'] # Take a look at the variables with standard names: variables = erd.get_var_by_attr(standard_name=lambda v: v is not None) variables # These are the standard variables for the CTDBP instrument - specifically for the CP01CNSM-NSIF-CTDBP. Next, lets query the server for _all_ available data from the CP01CNSM-NSIF-CTDBP. erd.variables = variables erd.get_download_url() # Put it all into a dataframe: data = erd.to_pandas() # + # Plot a basic time-series of the conductivity import matplotlib.pyplot as plt import seaborn as sns sns.set(style="darkgrid") # - data[data['time (UTC)'].isnull()] data['time (UTC)'] = data['time (UTC)'].apply(lambda x: pd.to_datetime(x)) data.set_index(keys='time (UTC)', inplace=True)
def read_glider_data_erddap_Rutgers_server(url_erddap,dataset_id,\ lat_lim,lon_lim,scatter_plot,**kwargs): from erddapy import ERDDAP import matplotlib.pyplot as plt import matplotlib.dates as mdates import cmocean import numpy as np date_ini = kwargs.get('date_ini', None) date_end = kwargs.get('date_end', None) # Find time window of interest if np.logical_or(date_ini == None, date_end == None): constraints = { 'latitude>=': lat_lim[0], 'latitude<=': lat_lim[1], 'longitude>=': lon_lim[0], 'longitude<=': lon_lim[1], } else: constraints = { 'time>=': date_ini, 'time<=': date_end, 'latitude>=': lat_lim[0], 'latitude<=': lat_lim[1], 'longitude>=': lon_lim[0], 'longitude<=': lon_lim[1], } variables = [ 'depth', 'latitude', 'longitude', 'time', 'temperature', 'salinity' ] e = ERDDAP(server=url_erddap, protocol='tabledap', response='nc') e.dataset_id = dataset_id e.constraints = constraints e.variables = variables # Converting glider data to data frame # Cheching that data frame has data df = e.to_pandas() if len(df) != 0: df = e.to_pandas( index_col='time (UTC)', parse_dates=True, skiprows=(1, ) # units information can be dropped. ).dropna() dg = df['depth (m)'].values tg = df.index.values vg1 = df[df.columns[3]].values vg2 = df[df.columns[4]].values upcast = np.where(np.diff(dg) < 0)[0] oku = np.where(np.diff(upcast) > 1)[0] end_upcast = upcast[oku] downcast = np.where(np.diff(dg) > 0)[0] okd = np.where(np.diff(downcast) > 1)[0] end_downcast = downcast[okd] ind = np.hstack( [0, np.unique(np.hstack([end_upcast, end_downcast])), len(dg)]) zn = np.max(np.diff(ind)) depthg = np.empty((zn, len(ind))) depthg[:] = np.nan timeg = np.empty((zn, len(ind))) timeg[:] = np.nan tempg = np.empty((zn, len(ind))) tempg[:] = np.nan saltg = np.empty((zn, len(ind))) saltg[:] = np.nan for i in np.arange(len(ind)): if i == 0: indd = np.argsort(dg[ind[i]:ind[i + 1] + 2]) depthg[0:len(dg[ind[i]:ind[i + 1] + 2]), i] = dg[ind[i]:ind[i + 1] + 2][indd] timeg[0:len(dg[ind[i]:ind[i + 1] + 2]), i] = mdates.date2num(tg[ind[i]:ind[i + 1] + 2][indd]) tempg[0:len(vg1[ind[i]:ind[i + 1] + 2]), i] = vg1[ind[i]:ind[i + 1] + 2][indd] saltg[0:len(vg2[ind[i]:ind[i + 1] + 2]), i] = vg2[ind[i]:ind[i + 1] + 2][indd] if i < len(ind) - 1: indd = np.argsort(dg[ind[i] + 1:ind[i + 1] + 2]) depthg[0:len(dg[ind[i] + 1:ind[i + 1] + 2]), i] = dg[ind[i] + 1:ind[i + 1] + 2][indd] timeg[0:len(dg[ind[i] + 1:ind[i + 1] + 2]), i] = mdates.date2num(tg[ind[i] + 1:ind[i + 1] + 2][indd]) tempg[0:len(vg1[ind[i] + 1:ind[i + 1] + 2]), i] = vg1[ind[i] + 1:ind[i + 1] + 2][indd] saltg[0:len(vg2[ind[i] + 1:ind[i + 1] + 2]), i] = vg2[ind[i] + 1:ind[i + 1] + 2][indd] else: indd = np.argsort(dg[ind[i] + 1:len(dg)]) depthg[0:len(dg[ind[i] + 1:len(dg)]), i] = dg[ind[i] + 1:len(dg)][indd] timeg[0:len(dg[ind[i] + 1:len(dg)]), i] = mdates.date2num(tg[ind[i] + 1:len(dg)][indd]) tempg[0:len(vg1[ind[i] + 1:len(vg1)]), i] = vg1[ind[i] + 1:len(vg1)][indd] saltg[0:len(vg2[ind[i] + 1:len(vg2)]), i] = vg2[ind[i] + 1:len(vg2)][indd] # Scatter plot if scatter_plot == 'yes': color_map = cmocean.cm.thermal varg = tempg #timeg_matrix = np.tile(timeg.T,(depthg.shape[0],1)) ttg = np.ravel(timeg) dg = np.ravel(depthg) teg = np.ravel(varg) kw = dict(c=teg, marker='*', edgecolor='none') fig, ax = plt.subplots(figsize=(10, 3)) cs = ax.scatter(ttg, -dg, cmap=color_map, **kw) #fig.colorbar(cs) ax.set_xlim(np.nanmin(ttg), np.nanmax(ttg)) ax.set_ylabel('Depth (m)', fontsize=14) cbar = plt.colorbar(cs) cbar.ax.set_ylabel('Temperature ($^oC$)', fontsize=14) ax.set_title(dataset_id, fontsize=16) 
xfmt = mdates.DateFormatter('%H:%Mh\n%d-%b') ax.xaxis.set_major_formatter(xfmt) plt.ylim([-np.nanmax(dg), 0]) color_map = cmocean.cm.haline varg = saltg #timeg_matrix = np.tile(timeg.T,(depthg.shape[0],1)) ttg = np.ravel(timeg) dg = np.ravel(depthg) teg = np.ravel(varg) kw = dict(c=teg, marker='*', edgecolor='none') fig, ax = plt.subplots(figsize=(10, 3)) cs = ax.scatter(ttg, -dg, cmap=color_map, **kw) #fig.colorbar(cs) ax.set_xlim(np.nanmin(ttg), np.nanmax(ttg)) ax.set_ylabel('Depth (m)', fontsize=14) cbar = plt.colorbar(cs) cbar.ax.set_ylabel('Salinity', fontsize=14) ax.set_title(dataset_id, fontsize=16) xfmt = mdates.DateFormatter('%H:%Mh\n%d-%b') ax.xaxis.set_major_formatter(xfmt) plt.ylim([-np.nanmax(dg), 0]) return tempg, saltg, timeg, latg, long, depthg
def read_glider_data_erddap_server(url_erddap,dataset_id,\ lat_lim,lon_lim,scatter_plot,**kwargs): """ Created on Tue Feb 5 10:05:37 2019 @author: aristizabal This function reads glider data from the IOOS Data Assembly Center (DAC). Inputs: url_erddap: url address of thredds server Example: 'https://data.ioos.us/gliders/erddap' dataset_id: this id is retrieved from the glider DAC using the function "retrieve_glider_id_erddap_server". Example: 'ru30-20180705T1825' lat_lim: latitude limits for the search. Example, lat_lim = [38.0,40.0] lon_lim: longitude limits for the search. Example, lon_lim = [-75.0,-72.0] date_ini: initial date of time window. This function accepts the data formats '%Y-%m-%d T %H:%M:%S Z' and '%Y/%m/%d/%H'. Examaple: date_ini = '2018-08-02T00:00:00Z' or '2018/08/02/00' date_end: initial date of time window. This function uses the data format '%Y-%m-%d T %H:%M:%S Z'. Examaple: date_ini = '2018-08-10T00:00:00Z' and '2018/08/10/00' scatter_plot: if equal to 'yes' then a scatter plot of the glider transect is plotted Outputs: tempg: all the glider profiles of temperature within the user defined time window saltg: all the glider profiles of salinity within the user defined time window latg: latitude within the user defined time window long: longitude within the user defined time window timeg: user defined time window depthg: depth vector for all profiles """ from erddapy import ERDDAP import matplotlib.pyplot as plt import matplotlib.dates as mdates import cmocean import numpy as np date_ini = kwargs.get('date_ini', None) date_end = kwargs.get('date_end', None) # Find time window of interest if np.logical_or(date_ini == None, date_end == None): constraints = { 'latitude>=': lat_lim[0], 'latitude<=': lat_lim[1], 'longitude>=': lon_lim[0], 'longitude<=': lon_lim[1], } else: constraints = { 'time>=': date_ini, 'time<=': date_end, 'latitude>=': lat_lim[0], 'latitude<=': lat_lim[1], 'longitude>=': lon_lim[0], 'longitude<=': lon_lim[1], } variables = [ 'depth', 'latitude', 'longitude', 'time', 'temperature', 'salinity' ] e = ERDDAP(server=url_erddap, protocol='tabledap', response='nc') e.dataset_id = dataset_id e.constraints = constraints e.variables = variables # Converting glider data to data frame # Cheching that data frame has data df = e.to_pandas() if len(df) > 3: df = e.to_pandas( index_col='time (UTC)', parse_dates=True, skiprows=(1, ) # units information can be dropped. 
).dropna() # Coverting glider vectors into arrays timeg, ind = np.unique(df.index.values, return_index=True) latg = df['latitude (degrees_north)'].values[ind] long = df['longitude (degrees_east)'].values[ind] dg = df['depth (m)'].values vg1 = df[df.columns[3]].values vg2 = df[df.columns[4]].values zn = np.int(np.max(np.diff(np.hstack([ind, len(dg)])))) depthg = np.empty((zn, len(timeg))) depthg[:] = np.nan tempg = np.empty((zn, len(timeg))) tempg[:] = np.nan saltg = np.empty((zn, len(timeg))) saltg[:] = np.nan for i, ii in enumerate(ind): if i < len(timeg) - 1: depthg[0:len(dg[ind[i]:ind[i + 1]]), i] = dg[ind[i]:ind[i + 1]] tempg[0:len(vg1[ind[i]:ind[i + 1]]), i] = vg1[ind[i]:ind[i + 1]] saltg[0:len(vg2[ind[i]:ind[i + 1]]), i] = vg2[ind[i]:ind[i + 1]] else: depthg[0:len(dg[ind[i]:len(dg)]), i] = dg[ind[i]:len(dg)] tempg[0:len(vg1[ind[i]:len(vg1)]), i] = vg1[ind[i]:len(vg1)] saltg[0:len(vg2[ind[i]:len(vg2)]), i] = vg2[ind[i]:len(vg2)] # Scatter plot if scatter_plot == 'yes': color_map = cmocean.cm.thermal varg = tempg timeg_matrix = np.tile(timeg.T, (depthg.shape[0], 1)) ttg = np.ravel(timeg_matrix) dg = np.ravel(depthg) teg = np.ravel(varg) kw = dict(c=teg, marker='*', edgecolor='none') fig, ax = plt.subplots(figsize=(10, 3)) cs = ax.scatter(ttg, -dg, cmap=color_map, **kw) #fig.colorbar(cs) ax.set_xlim(timeg[0], timeg[-1]) ax.set_ylabel('Depth (m)', fontsize=14) cbar = plt.colorbar(cs) cbar.ax.set_ylabel('Temperature ($^oC$)', fontsize=14) ax.set_title(dataset_id, fontsize=16) xfmt = mdates.DateFormatter('%H:%Mh\n%d-%b') ax.xaxis.set_major_formatter(xfmt) plt.ylim([-np.nanmax(dg), 0]) color_map = cmocean.cm.haline varg = saltg timeg_matrix = np.tile(timeg.T, (depthg.shape[0], 1)) ttg = np.ravel(timeg_matrix) dg = np.ravel(depthg) teg = np.ravel(varg) kw = dict(c=teg, marker='*', edgecolor='none') fig, ax = plt.subplots(figsize=(10, 3)) cs = ax.scatter(ttg, -dg, cmap=color_map, **kw) #fig.colorbar(cs) ax.set_xlim(timeg[0], timeg[-1]) ax.set_ylabel('Depth (m)', fontsize=14) cbar = plt.colorbar(cs) cbar.ax.set_ylabel('Salinity', fontsize=14) ax.set_title(dataset_id, fontsize=16) xfmt = mdates.DateFormatter('%H:%Mh\n%d-%b') ax.xaxis.set_major_formatter(xfmt) plt.ylim([-np.nanmax(dg), 0]) else: tempg = np.nan saltg = np.nan timeg = np.nan latg = np.nan long = np.nan depthg = np.nan return tempg, saltg, timeg, latg, long, depthg
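# Minimal usage sketch of read_glider_data_erddap_server, using the example
# values given in the docstring; passing scatter_plot='yes' also draws the
# temperature and salinity transects.
tempg, saltg, timeg, latg, long, depthg = read_glider_data_erddap_server(
    'https://data.ioos.us/gliders/erddap',
    'ru30-20180705T1825',
    [38.0, 40.0],
    [-75.0, -72.0],
    'no',
    date_ini='2018-08-02T00:00:00Z',
    date_end='2018-08-10T00:00:00Z',
)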
def grid_glider( dataset_id, varz2d=[ 'potential_temperature', 'salinity', 'cdom', 'chlorophyll_a', 'beta_700nm' ], zgrid=np.arange(0, 1000, 5), ): '''grid the glider data from RUCOOL Erddap. this needs work''' import xarray as xr import pandas as pd from erddapy import ERDDAP from scipy.signal import find_peaks from scipy import stats e = ERDDAP( server="http://slocum-data.marine.rutgers.edu/erddap", protocol="tabledap", response="nc", ) # get the science data: e.dataset_id = dataset_id # this connects to the data and load into an pandas dataframe ds = e.to_pandas() # remove the spaces from the column names ds.columns = ds.columns.str.split(' ').str[0] # get the time to be a datetime object ds['time'] = pd.to_datetime(ds['time']) # put the times in order ds = ds.sort_values(by=['time']) # fill nans in dpeth for the profile breakup interpd = ds.depth.interpolate() # find the top and bottom of each profile apogee, prop = find_peaks(interpd.values, threshold=None, distance=None, prominence=50) perogee, prop = find_peaks(-1 * interpd.values, threshold=None, distance=None, prominence=50) # stack the index of the turning points into one vector turns = np.sort(np.append(apogee, perogee)) # this is your depth grid, you can set: zgrd = zgrid # list of variables to grid in 2d: # you choose from the columns of the science data dataz = varz2d # this is a dict to hold our gridded stuff # until we make a dataset later d2 = {} # loop on the variables you want to bin for varz in dataz: values = ds[varz] # grab some data #this thing below bins the data ret = stats.binned_statistic_2d(ds.index.values, ds.depth, values, statistic='mean', bins=[turns, zgrd]) d2[varz] = ret.statistic.T # things to bin in the x direction oneDvars = ['latitude', 'longitude', 'time', 'u', 'v'] # NB: u, v only have one value per dive sequence, so only half the number profiles! # actually, its weirder than that... not sure there are more than half... # dict to hold our 1d bins d1 = {} # loop on 1d stuff: for thing in oneDvars: if thing == 'time': bin_means, bin_edges, binnumber = stats.binned_statistic( ds.index.values, ds[thing].astype(int), statistic='mean', bins=turns) bin_means = pd.to_datetime(bin_means) else: bin_means, bin_edges, binnumber = stats.binned_statistic( ds.index.values, ds[thing].values, statistic=np.nanmean, bins=turns) d1[thing] = bin_means # need the depth grid centers zgrd_ctr = zgrd[:-1] + np.diff(zgrd).mean() / 2 # create the dataset ds_gridded = xr.Dataset(coords={ 'date': d1['time'].values, 'depth': zgrd_ctr, 'lat': ('date', d1['latitude']), 'lon': ('date', d1['longitude']) }, data_vars={ 'u': ('date', d1['u']), 'v': ('date', d1['v']) }) # add the other data for varz in dataz: ds_gridded[varz] = (('depth', 'date'), d2[varz]) return ds_gridded
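# Minimal usage sketch of grid_glider against the RUCOOL ERDDAP server; the
# dataset ID below is illustrative only, and numpy is assumed to be imported
# as np at module level (it already is, since the signature defaults use it).
ds_gridded = grid_glider(
    'ru30-20210503T1929-profile-sci-rt',
    varz2d=['potential_temperature', 'salinity'],
    zgrid=np.arange(0, 500, 5),
)
# ds_gridded is an xarray Dataset with one column per profile and the chosen
# depth bins as the vertical coordinate.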
class ErddapReader(Reader): """ This class searches ERDDAP servers. There are 2 known_servers but others can be input too. Attributes ---------- parallel: boolean If True, run with simple parallelization using `multiprocessing`. If False, run serially. known_server: string Two ERDDAP servers are built in to be known to this reader: "ioos" and "coastwatch". e: ERDDAP server instance e.protocol: string * "tabledap" (pandas, appropriate for reading as csv) * "griddap" (xarray, appropriate for reading as netcdf) e.server: string Return the server name columns: list Metadata columns name: string "erddap_ioos", "erddap_coastwatch", or a constructed string if the user inputs a new protocol and server. reader: string reader is defined as "ErddapReader". """ def __init__(self, known_server="ioos", protocol=None, server=None, parallel=True): """ Parameters ---------- known_server: string, optional Two ERDDAP servers are built in to be known to this reader: "ioos" and "coastwatch". protocol, server: string, optional For a user-defined ERDDAP server, input the protocol as one of the following: * "tabledap" (pandas, appropriate for reading as csv) * "griddap" (xarray, appropriate for reading as netcdf) and the server address (such as "http://erddap.sensors.ioos.us/erddap" or "http://coastwatch.pfeg.noaa.gov/erddap"). parallel: boolean If True, run with simple parallelization using `multiprocessing`. If False, run serially. """ self.parallel = parallel # hard wire this for now filetype = "netcdf" # either select a known server or input protocol and server string if known_server == "ioos": protocol = "tabledap" server = "http://erddap.sensors.ioos.us/erddap" filetype = "netcdf" # other option: "csv" elif known_server == "coastwatch": protocol = "griddap" server = "http://coastwatch.pfeg.noaa.gov/erddap" filetype = "netcdf" # other option: "csv" elif known_server is not None: statement = ( "either select a known server or input protocol and server string" ) assert (protocol is not None) & (server is not None), statement else: known_server = urllib.parse.urlparse(server).netloc # known_server = server.strip("/erddap").strip("http://").replace(".", "_") statement = ( "either select a known server or input protocol and server string" ) assert (protocol is not None) & (server is not None), statement self.known_server = known_server self.e = ERDDAP(server=server) self.e.protocol = protocol self.e.server = server self.filetype = filetype # columns for metadata self.columns = [ "geospatial_lat_min", "geospatial_lat_max", "geospatial_lon_min", "geospatial_lon_max", "time_coverage_start", "time_coverage_end", "defaultDataQuery", "subsetVariables", # first works for timeseries sensors, 2nd for gliders "keywords", # for hf radar "id", "infoUrl", "institution", "featureType", "source", "sourceUrl", ] # name self.name = f"erddap_{known_server}" self.reader = "ErddapReader" self.store = dict() def __getitem__(self, key): """Redefinition of dict-like behavior. This enables user to use syntax `reader[dataset_id]` to read in and save dataset into the object. Parameters ---------- key: str dataset_id for a dataset that is available in the search/reader object. Returns ------- xarray Dataset of the data associated with key """ returned_data = self.data_by_dataset(key) # returned_data = self._return_data(key) self.__setitem__(key, returned_data) return returned_data def find_dataset_id_from_station(self, station): """Find dataset_id from station name. 
Parameters ---------- station: string Station name for which to search for dataset_id """ if station is None: return None # for station in self._stations: # if station has more than one word, AND will be put between # to search for multiple terms together. url = self.e.get_search_url(response="csv", items_per_page=5, search_for=station) try: df = pd.read_csv(url) except Exception as e: logger.exception(e) logger.warning( f"search url {url} did not work for station {station}.") return # first try for exact station match try: # Special case for TABS when don't split the id name if "tabs" in station: # don't split dataset_id = [ dataset_id for dataset_id in df["Dataset ID"] if station.lower() == dataset_id.lower() ][0] else: # first try as dataset_id then do as station name dataset_id = [ dataset_id for dataset_id in df["Dataset ID"] if station.lower() in [dataset_id.lower()] + dataset_id.lower().split("_") ][0] except Exception as e: logger.exception(e) logger.warning( "When searching for a dataset id to match station name %s, the first attempt to match the id did not work." % (station)) # If that doesn't work, return None for dataset_id dataset_id = None # # if that doesn't work, trying for more general match and just take first returned option # dataset_id = df.iloc[0]["Dataset ID"] return dataset_id @property def dataset_ids(self): """Find dataset_ids for server. Notes ----- The dataset_ids are found by querying the metadata through the ERDDAP server. The number of dataset_ids can change if a variable is removed from the list of variables and this is rerun. """ if not hasattr(self, "_dataset_ids") or ( self.variables and (len(self.variables) != self.num_variables)): # This should be a region search if self.approach == "region": # find all the dataset ids which we will use to get the data # This limits the search to our keyword arguments in kw which should # have min/max lon/lat/time values dataset_ids = [] if self.variables is not None: for variable in self.variables: # find and save all dataset_ids associated with variable search_url = self.e.get_search_url( response="csv", **self.kw, variableName=variable, items_per_page=10000, ) try: search = pd.read_csv(search_url) dataset_ids.extend(search["Dataset ID"]) except Exception as e: logger.exception(e) logger.warning( f"variable {variable} was not found in the search" ) logger.warning(f"search_url: {search_url}") else: # find and save all dataset_ids associated with variable search_url = self.e.get_search_url(response="csv", **self.kw, items_per_page=10000) try: search = pd.read_csv(search_url) dataset_ids.extend(search["Dataset ID"]) except Exception as e: logger.exception(e) logger.warning("nothing found in the search") logger.warning(f"search_url: {search_url}") # only need a dataset id once since we will check them each for all standard_names self._dataset_ids = list(set(dataset_ids)) # This should be a search for the station names elif self.approach == "stations": # search by station name for each of stations if self.parallel: # get metadata for datasets # run in parallel to save time num_cores = multiprocessing.cpu_count() dataset_ids = Parallel(n_jobs=num_cores)( delayed(self.find_dataset_id_from_station)(station) for station in self._stations) else: dataset_ids = [] for station in self._stations: dataset_ids.append( self.find_dataset_id_from_station(station)) # remove None from list dataset_ids = [i for i in dataset_ids if i] # In this case return all dataset_ids so they match 1-1 with # the input station list. 
                self._dataset_ids = dataset_ids

            else:
                logger.warning(
                    "Neither the stations nor the region approach was used in dataset_ids."
                )

            # update the number of variables
            if self.variables:
                self.num_variables = len(self.variables)

        return self._dataset_ids

    def meta_by_dataset(self, dataset_id):
        """Return the catalog metadata for a single dataset_id."""

        info_url = self.e.get_info_url(response="csv", dataset_id=dataset_id)
        try:
            info = pd.read_csv(info_url)
        except Exception as e:
            logger.exception(e)
            logger.warning(f"Could not read info from {info_url}")
            return {dataset_id: []}

        items = []
        for col in self.columns:
            try:
                item = info[info["Attribute Name"] == col]["Value"].values[0]
                dtype = info[info["Attribute Name"] == col]["Data Type"].values[0]
            except IndexError:
                # this attribute is not present for this dataset; reset dtype so
                # a stale value from the previous column is not reused below
                dtype = None
                if col == "featureType":
                    # this column is not present in HF Radar metadata but want it to
                    # map to data_type, so input 'grid' in that case.
                    item = "grid"
                else:
                    item = "NA"

            if dtype == "String":
                pass
            elif dtype == "double":
                item = float(item)
            elif dtype == "int":
                item = int(item)
            items.append(item)

        # include download link
        self.e.dataset_id = dataset_id
        if self.e.protocol == "tabledap":
            # set the same time constraints as before
            self.e.constraints = {
                "time<=": self.kw["max_time"],
                "time>=": self.kw["min_time"],
            }
            if self.filetype == "csv":
                download_url = self.e.get_download_url(response="csvp")
            elif self.filetype == "netcdf":
                download_url = self.e.get_download_url(response="ncCf")

        elif self.e.protocol == "griddap":
            # the search terms that can be input for tabledap do not work for griddap
            # in erddapy currently. Instead, put together an opendap link and then
            # narrow the dataset with xarray.
            download_url = self.e.get_download_url(response="opendap")

        # check whether "Prediction" is present in the metadata, especially in
        # the case of NOAA model predictions
        is_prediction = "Prediction" in " ".join(
            list(info["Value"].replace(np.nan, None).values))

        # add the erddap server name
        return {
            dataset_id: [self.e.server, download_url, info_url, is_prediction]
            + items
            + [self.variables]
        }

    @property
    def meta(self):
        """Rearrange the individual metadata into a dataframe.

        Notes
        -----
        This should exclude duplicate entries.
        """

        if not hasattr(self, "_meta"):
            if self.parallel:
                # get metadata for datasets
                # run in parallel to save time
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.meta_by_dataset)(dataset_id)
                    for dataset_id in self.dataset_ids)
            else:
                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.meta_by_dataset(dataset_id))

            # make one dict from the individual dicts
            from collections import ChainMap
            meta = dict(ChainMap(*downloads))

            # make a dataframe of the metadata;
            # variable names are the column names for the dataframe
            self._meta = pd.DataFrame.from_dict(
                meta,
                orient="index",
                columns=["database", "download_url", "info_url", "is_prediction"]
                + self.columns
                + ["variable names"],
            )

        return self._meta

    def data_by_dataset(self, dataset_id):
        """Return the data for a single dataset_id.

        Returns
        -------
        The data for dataset_id: a pandas DataFrame when ``filetype`` is "csv"
        and an xarray Dataset when ``filetype`` is "netcdf", or None if no data
        could be read in.

        Notes
        -----
        Data is read into memory.
""" if self.filetype == "csv": # if self.e.protocol == "tabledap": try: # fetch metadata if not already present # found download_url from metadata and use self.e.dataset_id = dataset_id # dataset_vars gives a list of the variables in the dataset dataset_vars = (self.meta.loc[dataset_id] ["defaultDataQuery"].split("&")[0].split(",")) # vars_present gives the variables in self.variables # that are actually in the dataset vars_present = [] for selfvariable in self.variables: vp = [var for var in dataset_vars if var == selfvariable] if len(vp) > 0: vars_present.append(vp[0]) # If any variables are not present, this doesn't work. if self.variables is not None: self.e.variables = [ "time", "longitude", "latitude", "station", ] + vars_present dd = self.e.to_pandas(response="csvp", index_col=0, parse_dates=True) # dd = self.e.to_pandas(response='csv', header=[0, 1], # index_col=0, parse_dates=True) # dd = pd.read_csv( # download_url, header=[0, 1], index_col=0, parse_dates=True # ) # Drop cols and rows that are only NaNs. dd = dd.dropna(axis="index", how="all").dropna(axis="columns", how="all") if self.variables is not None: # check to see if there is any actual data # this is a bit convoluted because the column names are the variable names # plus units so can't match 1 to 1. datacols = ( 0 # number of columns that represent data instead of metadata ) for col in dd.columns: datacols += [ varname in col for varname in self.variables ].count(True) # if no datacols, we can skip this one. if datacols == 0: dd = None except Exception as e: logger.exception(e) logger.warning("no data to be read in for %s" % dataset_id) dd = None elif self.filetype == "netcdf": # elif self.e.protocol == "griddap": if self.e.protocol == "tabledap": try: # assume I don't need to narrow in space since time series (tabledap) self.e.dataset_id = dataset_id dd = self.e.to_xarray() # dd = xr.open_dataset(download_url, chunks="auto") dd = dd.swap_dims({"obs": dd.cf["time"].name}) dd = dd.sortby(dd.cf["time"], ascending=True) dd = dd.cf.sel( T=slice(self.kw["min_time"], self.kw["max_time"])) # dd = dd.set_coords( # [dd.cf["longitude"].name, dd.cf["latitude"].name] # ) # use variable names to drop other variables (should. Ido this?) if self.variables is not None: # I don't think this is true with new approach # # ERDDAP prepends variables with 's.' in netcdf files, # # so include those with variables # erd_vars = [f's.{var}' for var in self.variables] # var_list = set(dd.data_vars) - (set(self.variables) | set(erd_vars)) var_list = set(dd.data_vars) - set(self.variables) dd = dd.drop_vars(var_list) # the lon/lat are on the 'timeseries' singleton dimension # but the data_var variable was not, which messed up # cf-xarray. When longitude and latitude are not on a # dimension shared with a variable, the variable can't be # called with cf-xarray. e.g. dd.cf['ssh'] won't work. 
if "timeseries" in dd.dims: for data_var in dd.data_vars: if "timeseries" not in dd[data_var].dims: dd[data_var] = dd[data_var].expand_dims( dim="timeseries", axis=1) except Exception as e: logger.exception(e) logger.warning("no data to be read in for %s" % dataset_id) dd = None elif self.e.protocol == "griddap": try: # this makes it read in the whole file which might be large self.e.dataset_id = dataset_id # dd = self.e.to_xarray(chunks="auto").sel( # time=slice(self.kw["min_time"], self.kw["max_time"]) # ) download_url = self.e.get_download_url(response="opendap") dd = xr.open_dataset(download_url, chunks="auto").sel( time=slice(self.kw["min_time"], self.kw["max_time"])) if ("min_lat" in self.kw) and ("max_lat" in self.kw): dd = dd.sel(latitude=slice(self.kw["min_lat"], self.kw["max_lat"])) if ("min_lon" in self.kw) and ("max_lon" in self.kw): dd = dd.sel(longitude=slice(self.kw["min_lon"], self.kw["max_lon"])) # use variable names to drop other variables (should. Ido this?) if self.variables is not None: vars_list = set(dd.data_vars) - set(self.variables) dd = dd.drop_vars(vars_list) except Exception as e: logger.exception(e) logger.warning("no data to be read in for %s" % dataset_id) dd = None # return (dataset_id, dd) return dd # @property def data(self, dataset_ids=None): """Read in data for some or all dataset_ids. NOT USED CURRENTLY Once data is read in for a dataset_ids, it is remembered. See full documentation in `utils.load_data()`. """ output = odg.utils.load_data(self, dataset_ids) return output
def load_data_from_erddap(config, station_id=None, station_data=None):
    mcf_template = yaml.load(open(config['static_data']['mcf_template'], 'r'),
                             Loader=yaml.FullLoader)

    es = ERDDAP(
        server=config['dynamic_data']['erddap_server'],
        protocol=config['dynamic_data']['erddap_protocol'],
    )

    if station_id is None:
        # load an MCF skeleton for every station
        stations = {}
        es.dataset_id = 'allDatasets'

        # filter out datasets that require a log in, since the vast majority of
        # their metadata is not accessible
        es.constraints = {'accessible=': 'public'}

        stations_df = es.to_pandas()
        # drop the 'allDatasets' row
        stations_df.drop(labels=0, axis='index', inplace=True)
        print(stations_df)

        for index_label, row_series in stations_df.iterrows():
            id = row_series['datasetID']

            # ensure each station has an independent copy of the MCF skeleton
            stations[id] = copy.deepcopy(mcf_template)

            dataset_url = row_series['tabledap'] if row_series[
                'dataStructure'] == 'table' else row_series['griddap']

            stations[id]['metadata']['identifier'] = id
            stations[id]['metadata']['dataseturi'] = dataset_url

            stations[id]['spatial']['datatype'] = 'textTable' if row_series[
                'dataStructure'] == 'table' else 'grid'
            stations[id]['spatial']['geomtype'] = row_series['cdm_data_type']
            stations[id]['spatial']['bbox'] = '%s,%s,%s,%s' % (
                row_series['minLongitude (degrees_east)'],
                row_series['minLatitude (degrees_north)'],
                row_series['maxLongitude (degrees_east)'],
                row_series['maxLatitude (degrees_north)'])

            stations[id]['identification']['title'] = row_series['title']
            stations[id]['identification']['dates']['creation'] = row_series[
                'minTime (UTC)']
            stations[id]['identification']['temporal_begin'] = row_series[
                'minTime (UTC)']
            stations[id]['identification']['temporal_end'] = row_series[
                'maxTime (UTC)']
            stations[id]['identification']['url'] = dataset_url
            stations[id]['identification']['abstract'] = row_series['summary']

            stations[id]['distribution']['erddap']['url'] = dataset_url
            stations[id]['distribution']['erddap']['name'] = row_series['title']

        print('Stations after ERDDAP call...')
        print(stations)

        return_value = stations

    else:
        # load a specific station's data into the MCF skeleton
        print('Loading ERDDAP metadata for station: %s' % (station_id))

        es.dataset_id = station_id

        metadata_url = es.get_download_url(dataset_id='%s/index' % (station_id),
                                           response='csv',
                                           protocol='info')
        metadata = pd.read_csv(filepath_or_buffer=metadata_url)
        print(metadata_url)
        print(metadata.head())

        # ERDDAP ISO XML provides a list of dataset field names (long & short),
        # data types, and units of measurement; in case this becomes useful for
        # the CIOOS metadata standard, we can extend the YAML skeleton to
        # include these and the template to export them.
        # Below, most variable attributes from ERDDAP are extracted and pivoted
        # to describe each field; the actual field data types are extracted
        # separately and merged into the pivoted dataframe for completeness.
        columns_pivot = metadata[(metadata['Variable Name'] != 'NC_GLOBAL')
                                 & (metadata['Row Type'] != 'variable')].pivot(
                                     index='Variable Name',
                                     columns='Attribute Name',
                                     values='Value')
        col_data_types = metadata[(metadata['Row Type'] == 'variable')][[
            'Variable Name', 'Data Type'
        ]]
        df_merge = pd.merge(columns_pivot, col_data_types, on='Variable Name')

        station_data['dataset'] = {}

        for index_label, field_series in df_merge.iterrows():
            field_name = field_series['Variable Name']
            station_data['dataset'][field_name] = {}
            station_data['dataset'][field_name]['long_name'] = field_series[
                'long_name']
            station_data['dataset'][field_name]['data_type'] = field_series[
                'Data Type']
            station_data['dataset'][field_name]['units'] = field_series['units']

        station_data['identification']['keywords']['default'][
            'keywords'] = metadata[
                (metadata['Variable Name'] == 'NC_GLOBAL')
                & (metadata['Attribute Name'] == 'keywords')]['Value'].values

        return_value = station_data

    return return_value
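# Illustrative call pattern for load_data_from_erddap (an assumption, not from
# the original script); the template path and server URL below are placeholders
# for whatever the real YAML configuration provides.
config = {
    'static_data': {'mcf_template': 'mcf_template.yml'},
    'dynamic_data': {
        'erddap_server': 'https://data.cioospacific.ca/erddap',
        'erddap_protocol': 'tabledap',
    },
}

# First pass: build an MCF skeleton for every public dataset on the server.
stations = load_data_from_erddap(config)

# Second pass: enrich each station's skeleton with its field-level metadata.
for station_id in stations:
    stations[station_id] = load_data_from_erddap(config, station_id,
                                                 stations[station_id])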
ncbath = xr.open_dataset(bath_file)
bath_lat = ncbath.variables['lat'][:]
bath_lon = ncbath.variables['lon'][:]
bath_elev = ncbath.variables['elevation'][:]

#%% Looping through all gliders found
for id in gliders:
    print('Reading ' + id)
    e.dataset_id = id
    e.constraints = constraints
    e.variables = variables

    # checking that the data frame is not empty
    df = e.to_pandas()
    if len(df.index) != 0:

        # Converting glider data to a data frame
        df = e.to_pandas(
            index_col='time (UTC)',
            parse_dates=True,
            skiprows=(1,)  # units information can be dropped.
        ).dropna()

        # Converting glider vectors into arrays
        timeg, ind = np.unique(df.index.values, return_index=True)
        latg = df['latitude (degrees_north)'].values[ind]
        long = df['longitude (degrees_east)'].values[ind]
        dg = df['depth (m)'].values
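# The glider loop above assumes `e`, `gliders`, `constraints`, and `variables`
# were built earlier in the script. A minimal sketch of that setup against the
# IOOS Glider DAC follows; the bounding box, time window, and variable list are
# placeholder assumptions, not values from the original script.
from erddapy import ERDDAP
import pandas as pd

url_glider = 'https://data.ioos.us/gliders/erddap'

kw = {
    'min_lon': -100.0, 'max_lon': -60.0,
    'min_lat': 15.0, 'max_lat': 45.0,
    'min_time': '2019-08-20T00:00:00Z', 'max_time': '2019-08-22T00:00:00Z',
}

e = ERDDAP(server=url_glider, protocol='tabledap', response='nc')

# dataset IDs of gliders reporting data inside the box and time window
search_url = e.get_search_url(response='csv', **kw)
gliders = pd.read_csv(search_url)['Dataset ID'].values

constraints = {
    'time>=': kw['min_time'],
    'time<=': kw['max_time'],
    'latitude>=': kw['min_lat'],
    'latitude<=': kw['max_lat'],
    'longitude>=': kw['min_lon'],
    'longitude<=': kw['max_lon'],
}

variables = ['depth', 'latitude', 'longitude', 'time', 'temperature', 'salinity']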
             'sea_water_velocity_to_direction',
             'sea_water_speed'
             ]

e = ERDDAP(
    server=url_buoy,
    protocol='tabledap',
    response='nc'
)

e.dataset_id = datasets[0]
e.constraints = constraints
e.variables = variables

df_vel = e.to_pandas(
    index_col='time (UTC)',
    parse_dates=True,
)

time_vel, ind = np.unique(df_vel.index, return_index=True)
depth_vel = df_vel['depth (m)'].values
water_speed = df_vel['sea_water_speed (cm/s)'].values

# Reshape velocity and depth into an array of depth x time
zn = ind[1]  # 34 vertical levels
depth_levels = depth_vel[0:zn]

water_speed_matrix = np.empty((zn, len(time_vel)))
water_speed_matrix[:] = np.nan
for i, ii in enumerate(ind):
    # slice the profile for this time step; the last profile runs to the end
    # of the record rather than being skipped
    if i < len(time_vel) - 1:
        vel_profile = water_speed[ind[i]:ind[i + 1]]
    else:
        vel_profile = water_speed[ind[i]:]
    water_speed_matrix[0:len(vel_profile), i] = vel_profile
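# A quick look at the reshaped speed field; purely illustrative (figure size,
# labels, and shading choice are assumptions, not from the original script).
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 4))
cs = ax.pcolormesh(time_vel, depth_levels, water_speed_matrix, shading='auto')
fig.colorbar(cs, ax=ax, label='sea_water_speed (cm/s)')
ax.invert_yaxis()                  # plot depth increasing downward
ax.set_xlabel('time (UTC)')
ax.set_ylabel('depth (m)')
plt.show()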