import pandas as pd
from erddapy import ERDDAP


def query(url, **kw):
    # rstrip the trailing slash to prevent a '//' in the generated URL:
    url = url.rstrip("/")
    e = ERDDAP(server=url, protocol='tabledap', response='csv')
    # submit the query:
    try:
        print("Testing ERDDAP {}".format(url))
        df = pd.read_csv(e.get_search_url(**kw))
        print("ERDDAP {} returned results from URL: {}".format(
            url, e.get_search_url(**kw)))
        df['server'] = url
        df.dropna(subset=['tabledap'], inplace=True)
        return df[['server', 'Dataset ID', 'tabledap', 'Institution', 'Summary']]
    except Exception as ex:
        # can happen if the dataset does not have any features within the
        # query window; just log it here:
        if type(ex).__name__ in ["HTTPError"]:
            print(ex)
        return None
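A minimal usage sketch; the server URL and time window below are illustrative placeholders, not values from the original:

```python
results = query(
    "https://gliders.ioos.us/erddap/",
    search_for="glider",
    response="csv",
    min_time="2019-01-01T00:00:00Z",
    max_time="2019-02-01T00:00:00Z",
)
if results is not None:
    print(results.head())
```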
class DatasetList:
    """Search servers for glider dataset ids. Defaults to the string "glider".

    Attributes:
        e: an ERDDAP server instance
        search_terms: A list of terms to search the server for.
            Multiple terms will be combined as AND.
    """

    def __init__(self, server=_server):
        self.e = ERDDAP(
            server=server,
            protocol="tabledap",
        )

    @functools.lru_cache(maxsize=None)
    def _get_ids(self, search_terms):
        """Thin wrapper where inputs can be hashed for lru_cache."""
        dataset_ids = pd.Series(dtype=str)
        for term in search_terms:
            url = self.e.get_search_url(search_for=term, response="csv")
            dataset_ids = pd.concat(
                [dataset_ids, pd.read_csv(url)["Dataset ID"]],
                ignore_index=True,
            )
        self.dataset_ids = dataset_ids.str.split(";", expand=True).stack().unique()
        return self.dataset_ids

    def get_ids(self, search_terms=["glider"]):
        """Search the database using a user-supplied list of search strings.

        :return: Unique list of dataset ids
        """
        search_terms = tuple(search_terms)
        return self._get_ids(search_terms)
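A short usage sketch, assuming `_server` (defined elsewhere in the module) points at a glider ERDDAP endpoint; the URL and terms below are illustrative:

```python
datasets = DatasetList(server="https://gliders.ioos.us/erddap")
ids = datasets.get_ids(["glider", "slocum"])
print(len(ids), "matching dataset ids")
```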
def retrieve_dataset_id_erddap_server(url_erddap, lat_lim, lon_lim, date_ini, date_end):
    """
    Created on Tue Feb 5 10:05:37 2019
    @author: aristizabal

    This function retrieves glider ids from the IOOS Data Assembly Center (DAC).

    Inputs:
    url_erddap: url address of erddap server.
        Example: 'https://data.ioos.us/gliders/erddap'
    lat_lim: latitude limits for the search.
        Example: lat_lim = [38.0, 40.0]
    lon_lim: longitude limits for the search.
        Example: lon_lim = [-75.0, -72.0]
    date_ini: initial date of the time window. This function accepts the
        date formats '%Y-%m-%dT%H:%M:%SZ' and '%Y/%m/%d/%H'.
        Example: date_ini = '2018-08-02T00:00:00Z'
    date_end: final date of the time window. This function accepts the
        date formats '%Y-%m-%dT%H:%M:%SZ' and '%Y/%m/%d/%H'.
        Example: date_end = '2018-08-10T00:00:00Z'

    Outputs:
    gliders: list of glider ids that fall within the lat, lon and time
        constraints
    """
    from erddapy import ERDDAP
    import pandas as pd

    e = ERDDAP(server=url_erddap)

    # Search constraints
    kw = {
        'min_lon': lon_lim[0],
        'max_lon': lon_lim[1],
        'min_lat': lat_lim[0],
        'max_lat': lat_lim[1],
        'min_time': date_ini,
        'max_time': date_end,
    }

    search_url = e.get_search_url(response='csv', **kw)
    search = pd.read_csv(search_url)

    # Extract the IDs
    gliders = search['Dataset ID'].values

    return gliders
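Putting the docstring examples together, a call looks like this:

```python
gliders = retrieve_dataset_id_erddap_server(
    'https://data.ioos.us/gliders/erddap',
    lat_lim=[38.0, 40.0],
    lon_lim=[-75.0, -72.0],
    date_ini='2018-08-02T00:00:00Z',
    date_end='2018-08-10T00:00:00Z',
)
```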
def return_glider_ids(kwargs):
    """
    Searches an ERDDAP server for datasets and returns dataset IDs.

    :param kwargs: dictionary containing coordinate and time limits
    :return: array containing dataset IDs
    """
    e = ERDDAP(server=ioos_url)
    search_url = e.get_search_url(response='csv', **kwargs)
    try:
        search = pd.read_csv(search_url)
        ds_ids = search['Dataset ID'].values
    except Exception:
        # No datasets matched the search; return an empty array.
        ds_ids = np.array([])

    return ds_ids
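A usage sketch, assuming `ioos_url` is defined at module level; the limits are illustrative:

```python
kwargs = {
    'min_lon': -75.0, 'max_lon': -72.0,
    'min_lat': 38.0, 'max_lat': 40.0,
    'min_time': '2018-08-02T00:00:00Z',
    'max_time': '2018-08-10T00:00:00Z',
}
ds_ids = return_glider_ids(kwargs)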
If we change the response to `html` we can visualize the page.

def show_iframe(src):
    from IPython.display import HTML

    iframe = '<iframe src="{src}" width="100%" height="950"></iframe>'.format
    return HTML(iframe(src=src))

show_iframe(e.get_download_url(response="html"))

Additionally, the object has `.get_info_url()` and `.get_search_url()` that can be used to obtain the info and search URLs respectively.

show_iframe(e.get_info_url(response="html"))

show_iframe(e.get_search_url(response="html"))

`erddapy` also brings some simple methods to download the data in some common data formats, like `pandas.DataFrame` and `xarray.Dataset`.

df = e.to_pandas(index_col="time (UTC)", parse_dates=True).dropna()
df.head()

ds = e.to_xarray(decode_times=False)
ds["temperature"]

Here is a simple plot using the data from `xarray`.

%matplotlib inline
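The plotting cell itself is not included in this excerpt; a minimal sketch, assuming `ds["temperature"]` is a one-dimensional variable, might look like:

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(9, 3.75))
# Quick-look plot of the temperature record pulled via `to_xarray` above.
ds["temperature"].plot(ax=ax)
ax.set_ylabel(ds["temperature"].attrs.get("units", ""))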
def list_data(self, verbose=False):
    e = ERDDAP(server=self.server_url)
    self.df = pd.read_csv(
        e.get_search_url(response='csv', search_for=self.glider_id))
    if verbose:
        print(self.df['Dataset ID'])
class ErddapPlotter(object):
    def __init__(self, erddap_url, protocol='tabledap', response='png'):
        self._img_types = [
            'smallPdf', 'pdf', 'largePdf', 'smallPng', 'png', 'largePng',
            'transparentPng'
        ]
        self._default_plot_parameters = {
            '.bgColor=': '0xFFFFFF',
            '.color=': '0x000000',
            '.colorBar=': 'Rainbow2|C|Linear|||',
            '.draw=': 'markers',
            '.legend=': 'Bottom',
            '.marker=': '6|5',
            '.xRange=': '||true|Linear',
            '.yRange=': '||false|Linear'
        }

        if response not in self._img_types:
            raise ValueError(
                'Invalid image response type specified: {:}'.format(response))

        self._erddap_url = erddap_url
        self._protocol = protocol
        self._response = response
        self._plot_query = ''
        self._constraints_query = ''
        self._image_url = ''
        self._last_request = ''
        self._logger = logging.getLogger(os.path.basename(__file__))

        self._e = ERDDAP(self._erddap_url,
                         protocol=self._protocol,
                         response=self._response)

        self._datasets = pd.DataFrame([])
        self.fetch_erddap_datasets()

        self._constraints = {}
        self._plot_parameters = self._default_plot_parameters.copy()

        self._legend_options = ['Bottom', 'Off', 'Only']
        self._line_styles = [
            'lines', 'linesAndMarkers', 'markers', 'sticks', 'vectors'
        ]
        self._marker_types = [
            'None', 'Plus', 'X', 'Dot', 'Square', 'Filled Square', 'Circle',
            'Filled Circle', 'Up Triangle', 'Filled Up Triangle'
        ]
        self._marker_color_codes = [
            'FFFFFF', 'CCCCCC', '999999', '666666', '000000', 'FF0000',
            'FF9900', 'FFFF00', '99FF00', '00FF00', '00FF99', '00FFFF',
            '0099FF', '0000FF', '9900FF', 'FF00FF', 'FF99FF'
        ]
        self._marker_colors = [
            'white', 'light grey', 'grey', 'dark grey', 'black', 'red',
            'orange', 'yellow', 'light green', 'green', 'blue green', 'cyan',
            'blue', 'dark blue', 'purple', 'pink', 'light pink'
        ]
        self._colors = dict(zip(self._marker_colors, self._marker_color_codes))
        self._continuous_options = ['C', 'D']
        self._scale_options = ['Linear', 'Log']
        self._colorbars = [
            'BlackBlueWhite', 'BlackGreenWhite', 'BlackRedWhite', 'BlackWhite',
            'BlueWhiteRed', 'BlueWideWhiteRed', 'LightRainbow', 'Ocean',
            'OceanDepth', 'Rainbow', 'Rainbow2', 'Rainfall', 'ReverseRainbow',
            'RedWhiteBlue', 'RedWhiteBlue2', 'RedWideWhiteBlue', 'Spectrum',
            'Topography', 'TopographyDepth', 'WhiteBlueBlack',
            'WhiteGreenBlack', 'WhiteRedBlack', 'WhiteBlack', 'YellowRed',
            'KT_algae', 'KT_amp', 'KT_balance', 'KT_curl', 'KT_deep',
            'KT_delta', 'KT_dense', 'KT_gray', 'KT_haline', 'KT_ice',
            'KT_matter', 'KT_oxy', 'KT_phase', 'KT_solar', 'KT_speed',
            'KT_tempo', 'KT_thermal', 'KT_turbid'
        ]
        self._zoom_levels = ['in', 'in2', 'in8', 'out', 'out2', 'out8']

        # Set default plotting parameters
        self.reset_plot_params()

    @property
    def client(self):
        return self._e

    @property
    def response(self):
        return self._e.response

    @response.setter
    def response(self, response_type):
        if response_type not in self._img_types:
            raise ValueError(
                'Invalid image response type specified: {:}'.format(
                    response_type))
        self._response = response_type
        self._e.response = response_type

    @property
    def datasets(self):
        return self._datasets

    @property
    def plot_parameters(self):
        return self._plot_parameters

    @property
    def constraints(self):
        return self._constraints

    @property
    def plot_query(self):
        self.build_plot_query_string()
        return self._plot_query

    @property
    def constraints_query(self):
        self.build_constraints_query_string()
        return self._constraints_query

    @property
    def last_request(self):
        return self._last_request

    @property
    def image_url(self):
        return self._image_url

    @property
    def colorbars(self):
        return self._colorbars

    def fetch_erddap_datasets(self):
        try:
            self._logger.info('Fetching available server datasets: {:}'.format(
                self._erddap_url))
            url = self._e.get_search_url(response='csv')
            self._last_request = url
            self._logger.debug('Server info: {:}'.format(self._last_request))
            self._datasets = pd.read_csv(url)

            # Rename the columns to friendlier names
            columns = {
                s: s.replace(' ', '_').lower()
                for s in self._datasets.columns
            }
            self._datasets.rename(columns=columns, inplace=True)

            # Use dataset_id as the index
            self._datasets.set_index('dataset_id', inplace=True)
        except requests.exceptions.HTTPError as e:
            self._logger.error(
                'Failed to fetch/parse ERDDAP server datasets info: {:} ({:})'.format(
                    url, e))
            return

    def set_bg_color(self, color='white'):
        # .bgColor: value (0xAARRGGBB)
        if color not in self._colors:
            return
        self._plot_parameters.update(
            {'.bgColor=': '0x{:}'.format(self._colors[color])})

    def set_colorbar(self, colorbar='Rainbow2', continuous=None, scale=None,
                     min='', max='', num_sections=''):
        # .colorBar: palette|continuous|scale|min|max|nSections
        continuous = continuous or self._continuous_options[0]
        scale = scale or self._scale_options[0]
        if colorbar not in self._colorbars:
            return
        if continuous not in self._continuous_options:
            return {}
        if scale not in self._scale_options:
            return {}
        self._plot_parameters.update({
            '.colorBar=': '{:}|{:}|{:}|{:}|{:}|{:}'.format(
                colorbar, continuous, scale, min, max, num_sections)
        })

    def set_marker_color(self, color='white'):
        # .color: value (0xAARRGGBB)
        if color not in self._colors:
            return {}
        self._plot_parameters.update(
            {'.color=': '0x{:}'.format(self._colors[color])})

    def set_line_style(self, line_style='markers'):
        # .draw: value (lines|linesAndMarkers|markers|sticks|vectors)
        if line_style not in self._line_styles:
            return {}
        self._plot_parameters.update({'.draw=': line_style})

    def set_legend_loc(self, location='Bottom'):
        # .legend: value (Bottom|Off|Only)
        if location not in self._legend_options:
            return {}
        self._plot_parameters.update({'.legend=': location})

    def set_marker_style(self, marker='Circle', marker_size=5):
        # .marker: markerType|markerSize
        if marker not in self._marker_types:
            return {}
        self._plot_parameters.update({
            '.marker=': '{:}|{:}'.format(self._marker_types.index(marker),
                                         marker_size)
        })

    def set_x_range(self, min_val='', max_val='', ascending=True, scale=None):
        # .xRange: min|max|ascending|scale
        scale = scale or self._scale_options[0]
        if scale not in self._scale_options:
            return {}
        self._plot_parameters.update({
            '.xRange=': '{:}|{:}|{:}|{:}'.format(min_val, max_val,
                                                 str(ascending).lower(), scale)
        })

    def set_y_range(self, min_val='', max_val='', ascending=False, scale=None):
        # .yRange: min|max|ascending|scale
        scale = scale or self._scale_options[0]
        if scale not in self._scale_options:
            return {}
        self._plot_parameters.update({
            '.yRange=': '{:}|{:}|{:}|{:}'.format(min_val, max_val,
                                                 str(ascending).lower(), scale)
        })

    def set_zoom(self, zoom_level='in'):
        if zoom_level not in self._zoom_levels:
            return {}
        self._plot_parameters.update({'.zoom=': zoom_level})

    def set_trim_pixels(self, num_pixels=10):
        self._plot_parameters.update({'.trim=': str(num_pixels)})

    def add_constraint(self, constraint, constraint_value):
        self._constraints[constraint] = constraint_value

    def remove_constraint(self, constraint):
        if not constraint.endswith('='):
            constraint = '{:}='.format(constraint)
        self._constraints.pop(constraint, None)

    def remove_plot_parameter(self, plot_parameter):
        if not plot_parameter.endswith('='):
            plot_parameter = '{:}='.format(plot_parameter)
        self._plot_parameters.pop(plot_parameter, None)

    def reset_plot_params(self):
        self._plot_parameters = self._default_plot_parameters.copy()

    def build_plot_query_string(self):
        self._plot_query = '&'.join([
            '{:}{:}'.format(k, quote(v))
            for k, v in self._plot_parameters.items()
        ])

    def build_constraints_query_string(self):
        self._constraints_query = '&'.join([
            '{:}{:}'.format(k, quote(v))
            for k, v in self._constraints.items()
        ])

    def build_image_request(self, dataset_id, x, y, c=None):
        if dataset_id not in self._datasets.index:
            self._logger.error(
                'Dataset ID {:} does not exist'.format(dataset_id))
            return

        variables = [x, y]
        if c:
            variables.append(c)

        self.build_plot_query_string()
        self.build_constraints_query_string()

        if self._constraints:
            url = '{:}/{:}/{:}.{:}?{:}&{:}&{:}'.format(
                self._e.server, self._e.protocol, dataset_id, self._response,
                ','.join(variables), self._constraints_query,
                self._plot_query)
        else:
            url = '{:}/{:}/{:}.{:}?{:}&{:}'.format(
                self._e.server, self._e.protocol, dataset_id, self._response,
                ','.join(variables), self._plot_query)

        self._image_url = url

        return self._image_url

    def download_image(self, image_url, image_path):
        image_dir = os.path.dirname(image_path)
        if not os.path.isdir(image_dir):
            self._logger.error(
                'Invalid image destination specified: {:}'.format(image_dir))
            return

        self._logger.debug('Image url: {:}'.format(image_url))
        self._logger.info('Fetching and writing image: {:}'.format(image_path))
        r = requests.get(image_url, stream=True)
        if r.status_code != 200:
            self._logger.error('{:} (code={:})'.format(r.reason,
                                                       r.status_code))
            return

        with open(image_path, 'wb') as f:
            for chunk in r.iter_content():
                f.write(chunk)

        return image_path

    def __repr__(self):
        return '<ErddapPlotter(server={:}, response={:}, num_datasets={:})>'.format(
            self._e.server, self._e.response, len(self._datasets))
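A hedged usage sketch; the server URL, dataset id, variable names, and output path below are placeholders, not values from the original module:

```python
plotter = ErddapPlotter('https://gliders.ioos.us/erddap')
plotter.set_line_style('markers')
plotter.set_colorbar('Rainbow2', min=5, max=25)
plotter.add_constraint('time>=', '2020-01-01T00:00:00Z')
url = plotter.build_image_request('example-dataset-id', 'time', 'depth',
                                  c='temperature')
if url:
    plotter.download_image(url, '/tmp/temperature.png')
```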
# ERDDAP Access: OOI-Net

from erddapy import ERDDAP


def to_df(url):
    import pandas as pd
    return pd.read_csv(url)


erd = ERDDAP(
    server='https://erddap-uncabled.oceanobservatories.org/uncabled/erddap',
    protocol='tabledap',
)

url = erd.get_search_url(search_for='CP01CNSM ctdbp', response='csv')
url

datasets = to_df(url)['Dataset ID']
datasets

# Get a specific dataset info:
datasets[7]

# ### OMS++ Data Availability

# from erddapy import ERDDAP
def get_standard_variables_and_metadata(server_link, standard_variable_list):

    # Get access to the server and find datasets associated with the
    # standard_name variables listed
    e = ERDDAP(server=server_link, protocol='tabledap', response='csv')

    # Define filter for which datasets to look into
    kw = {
        'standard_name': ','.join(standard_variable_list),
        'min_lon': -180.0,
        'max_lon': 180.0,
        'min_lat': -90.0,
        'max_lat': 90.0,
        'min_time': '',
        'max_time': '',
        'cdm_data_type': ''
    }

    variable_to_groupby = [('latitude', 'degrees_north'),
                           ('longitude', 'degrees_east')]

    # Get available datasets from that server
    search_url = e.get_search_url(response='csv', **kw)
    datasets = pd.read_csv(search_url)

    # Print results
    print(e.server)
    print(
        str(len(datasets)) + " datasets contain " +
        ', '.join(standard_variable_list))

    # Loop through the different datasets and build a metadata DataFrame
    df = pd.DataFrame(columns=['Dataset ID'])

    for index, row in datasets.iterrows():
        # Get info from the dataset (mostly min/max lat/long)
        print(row['Dataset ID'])
        info_url = e.get_info_url(dataset_id=row['Dataset ID'], response='csv')
        info = pd.read_csv(info_url)
        attribute_table = info.set_index(
            ['Row Type', 'Variable Name',
             'Attribute Name']).transpose()['attribute']

        # Try to get the distinct lat/long, time and depth range for that
        # dataset; if it fails, rely on the ERDDAP metadata
        try:
            # If the dataset is spread out geographically, find the distinct
            # locations (may not work well for trajectory data)
            latlong_url = e.get_download_url(
                dataset_id=row['Dataset ID'],
                protocol='tabledap',
                variables=['latitude', 'longitude', 'time'])

            # Add commands to the url to get distinct values, ordered with
            # min and max time for each lat/long
            distinctMinMaxTime_url = latlong_url + \
                '&distinct()&orderByMinMax(%22latitude%2Clongitude%2Ctime%22)'

            # Get lat/long and min/max time for this dataset
            data = pd.read_csv(distinctMinMaxTime_url, header=[0, 1])

            # Group data by latitude/longitude and get min/max values
            data_reduced = data.groupby(by=variable_to_groupby).agg(
                ['min', 'max']).reset_index()

            if info[(info['Variable Name'] == 'depth')].size > 0:
                latlongdepth_url = e.get_download_url(
                    dataset_id=row['Dataset ID'],
                    protocol='tabledap',
                    variables=['latitude', 'longitude', 'depth'])

                # Add commands to the url to get distinct values, ordered
                # with min and max depth for each lat/long
                distinctMinMaxDepth_url = latlongdepth_url + \
                    '&distinct()&orderByMinMax(%22latitude%2Clongitude%2Cdepth%22)'

                # Get lat/long and min/max depth for this dataset
                data_depth = pd.read_csv(distinctMinMaxDepth_url,
                                         header=[0, 1])

                # Group depth data by lat/long and get min/max values
                data_depth_reduced = data_depth.groupby(
                    by=variable_to_groupby).agg(['min', 'max']).reset_index()

                # Merge depth values with time
                data_reduced = data_reduced.merge(data_depth_reduced,
                                                  on=variable_to_groupby,
                                                  how='left')

            # Merge multi-index column names
            data_reduced.columns = data_reduced.columns.map(
                ' '.join).str.strip(' ')

        except Exception as exception_error:
            print('Failed to read: ' + str(exception_error))

            # If there's only one location, we can get the range from the
            # metadata. Find the lat/long range of this dataset; if it's a
            # point, we don't need to look into it.
            min_latitude = float(attribute_table['NC_GLOBAL',
                                                 'geospatial_lat_min'].Value)
            max_latitude = float(attribute_table['NC_GLOBAL',
                                                 'geospatial_lat_max'].Value)
            min_longitude = float(attribute_table['NC_GLOBAL',
                                                  'geospatial_lon_min'].Value)
            max_longitude = float(attribute_table['NC_GLOBAL',
                                                  'geospatial_lon_max'].Value)

            # If min/max lat/long are the same, don't go into the dataset
            if (min_latitude == max_latitude) & (min_longitude == max_longitude):
                data_reduced = {}
                data_reduced['latitude degrees_north'] = min_latitude
                data_reduced['longitude degrees_east'] = min_longitude

                if 'depth' in attribute_table.columns and \
                        'actual_range' in attribute_table['depth'] and \
                        ('m' == attribute_table['depth', 'units']['Value']):
                    depth_range = np.array(
                        str.split(
                            attribute_table['depth', 'actual_range']['Value'],
                            ',')).astype(float)
                    data_reduced['depth m min'] = depth_range[0]
                    data_reduced['depth m max'] = depth_range[1]

                # Convert to DataFrame
                data_reduced = pd.DataFrame(data_reduced, index=[0])
                print('Retrieved metadata')
            else:
                # Can't handle data with multiple locations when the data
                # itself can't be retrieved
                continue

        # Add the variable name matching each standard name to the table
        for var in standard_variable_list:
            data_reduced[var] = ','.join(
                e.get_var_by_attr(dataset_id=row['Dataset ID'],
                                  standard_name=var))

        # Add cdm_data_type to the table
        data_reduced['cdm_data_type'] = ','.join(
            info[info['Attribute Name'] == 'cdm_data_type']['Value'].values)

        # Add the dataset id to the table
        data_reduced['Dataset ID'] = row['Dataset ID']

        # Merge that dataset ID with the previously downloaded data
        df = pd.concat([df, data_reduced], ignore_index=True)

    # Add server to the DataFrame
    df['server'] = e.server

    # Save the resulting DataFrame to a CSV; the file name is based on the
    # server address
    file_name = re.sub('https*://', '', e.server)
    file_name = re.sub(r"[\./]", '_', file_name)
    file_name = 'Server_List_' + file_name + '.csv'

    print('Save result to ' + file_name)
    df.to_csv(file_name)

    return df
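A short call sketch; the server link and standard names below are illustrative:

```python
df = get_standard_variables_and_metadata(
    'https://gliders.ioos.us/erddap',
    ['sea_water_temperature', 'sea_water_practical_salinity'],
)
```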
class ErddapReader(Reader):
    """
    This class searches ERDDAP servers. There are 2 known_servers but others
    can be input too.

    Attributes
    ----------
    parallel: boolean
        If True, run with simple parallelization using `multiprocessing`.
        If False, run serially.
    known_server: string
        Two ERDDAP servers are built in to be known to this reader: "ioos"
        and "coastwatch".
    e: ERDDAP server instance
    e.protocol: string
        * "tabledap" (pandas, appropriate for reading as csv)
        * "griddap" (xarray, appropriate for reading as netcdf)
    e.server: string
        Return the server name
    columns: list
        Metadata columns
    name: string
        "erddap_ioos", "erddap_coastwatch", or a constructed string if the
        user inputs a new protocol and server.
    reader: string
        reader is defined as "ErddapReader".
    """

    def __init__(self, known_server="ioos", protocol=None, server=None,
                 parallel=True):
        """
        Parameters
        ----------
        known_server: string, optional
            Two ERDDAP servers are built in to be known to this reader:
            "ioos" and "coastwatch".
        protocol, server: string, optional
            For a user-defined ERDDAP server, input the protocol as one of
            the following:
            * "tabledap" (pandas, appropriate for reading as csv)
            * "griddap" (xarray, appropriate for reading as netcdf)
            and the server address (such as
            "http://erddap.sensors.ioos.us/erddap" or
            "http://coastwatch.pfeg.noaa.gov/erddap").
        parallel: boolean
            If True, run with simple parallelization using
            `multiprocessing`. If False, run serially.
        """
        self.parallel = parallel

        # hard wire this for now
        filetype = "netcdf"

        # either select a known server or input protocol and server string
        if known_server == "ioos":
            protocol = "tabledap"
            server = "http://erddap.sensors.ioos.us/erddap"
            filetype = "netcdf"  # other option: "csv"
        elif known_server == "coastwatch":
            protocol = "griddap"
            server = "http://coastwatch.pfeg.noaa.gov/erddap"
            filetype = "netcdf"  # other option: "csv"
        elif known_server is not None:
            statement = (
                "either select a known server or input protocol and server string"
            )
            assert (protocol is not None) & (server is not None), statement
        else:
            known_server = urllib.parse.urlparse(server).netloc
            statement = (
                "either select a known server or input protocol and server string"
            )
            assert (protocol is not None) & (server is not None), statement

        self.known_server = known_server
        self.e = ERDDAP(server=server)
        self.e.protocol = protocol
        self.e.server = server
        self.filetype = filetype

        # columns for metadata
        self.columns = [
            "geospatial_lat_min",
            "geospatial_lat_max",
            "geospatial_lon_min",
            "geospatial_lon_max",
            "time_coverage_start",
            "time_coverage_end",
            "defaultDataQuery",
            "subsetVariables",  # first works for timeseries sensors, 2nd for gliders
            "keywords",  # for hf radar
            "id",
            "infoUrl",
            "institution",
            "featureType",
            "source",
            "sourceUrl",
        ]

        # name
        self.name = f"erddap_{known_server}"

        self.reader = "ErddapReader"
        self.store = dict()

    def __getitem__(self, key):
        """Redefinition of dict-like behavior.

        This enables user to use syntax `reader[dataset_id]` to read in and
        save dataset into the object.

        Parameters
        ----------
        key: str
            dataset_id for a dataset that is available in the search/reader
            object.

        Returns
        -------
        xarray Dataset of the data associated with key
        """
        returned_data = self.data_by_dataset(key)
        self.__setitem__(key, returned_data)
        return returned_data

    def find_dataset_id_from_station(self, station):
        """Find dataset_id from station name.

        Parameters
        ----------
        station: string
            Station name for which to search for dataset_id
        """
        if station is None:
            return None

        # if the station name has more than one word, AND will be put
        # between the words to search for the terms together.
        url = self.e.get_search_url(response="csv",
                                    items_per_page=5,
                                    search_for=station)

        try:
            df = pd.read_csv(url)
        except Exception as e:
            logger.exception(e)
            logger.warning(
                f"search url {url} did not work for station {station}.")
            return

        # first try for exact station match
        try:
            # Special case for TABS when we don't split the id name
            if "tabs" in station:  # don't split
                dataset_id = [
                    dataset_id for dataset_id in df["Dataset ID"]
                    if station.lower() == dataset_id.lower()
                ][0]
            else:
                # first try as dataset_id then do as station name
                dataset_id = [
                    dataset_id for dataset_id in df["Dataset ID"]
                    if station.lower() in [dataset_id.lower()] +
                    dataset_id.lower().split("_")
                ][0]

        except Exception as e:
            logger.exception(e)
            logger.warning(
                "When searching for a dataset id to match station name %s, the first attempt to match the id did not work."
                % (station))
            # If that doesn't work, return None for dataset_id
            dataset_id = None

        return dataset_id

    @property
    def dataset_ids(self):
        """Find dataset_ids for server.

        Notes
        -----
        The dataset_ids are found by querying the metadata through the
        ERDDAP server. The number of dataset_ids can change if a variable is
        removed from the list of variables and this is rerun.
        """
        if not hasattr(self, "_dataset_ids") or (
                self.variables and
                (len(self.variables) != self.num_variables)):

            # This should be a region search
            if self.approach == "region":

                # find all the dataset ids which we will use to get the data
                # This limits the search to our keyword arguments in kw
                # which should have min/max lon/lat/time values
                dataset_ids = []
                if self.variables is not None:
                    for variable in self.variables:

                        # find and save all dataset_ids associated with
                        # variable
                        search_url = self.e.get_search_url(
                            response="csv",
                            **self.kw,
                            variableName=variable,
                            items_per_page=10000,
                        )

                        try:
                            search = pd.read_csv(search_url)
                            dataset_ids.extend(search["Dataset ID"])
                        except Exception as e:
                            logger.exception(e)
                            logger.warning(
                                f"variable {variable} was not found in the search"
                            )
                            logger.warning(f"search_url: {search_url}")

                else:

                    # find and save all dataset_ids associated with the
                    # region alone
                    search_url = self.e.get_search_url(response="csv",
                                                       **self.kw,
                                                       items_per_page=10000)

                    try:
                        search = pd.read_csv(search_url)
                        dataset_ids.extend(search["Dataset ID"])
                    except Exception as e:
                        logger.exception(e)
                        logger.warning("nothing found in the search")
                        logger.warning(f"search_url: {search_url}")

                # only need a dataset id once since we will check them each
                # for all standard_names
                self._dataset_ids = list(set(dataset_ids))

            # This should be a search for the station names
            elif self.approach == "stations":

                # search by station name for each of stations
                if self.parallel:
                    # get metadata for datasets
                    # run in parallel to save time
                    num_cores = multiprocessing.cpu_count()
                    dataset_ids = Parallel(n_jobs=num_cores)(
                        delayed(self.find_dataset_id_from_station)(station)
                        for station in self._stations)

                else:
                    dataset_ids = []
                    for station in self._stations:
                        dataset_ids.append(
                            self.find_dataset_id_from_station(station))

                # remove None from list
                dataset_ids = [i for i in dataset_ids if i]

                # In this case return all dataset_ids so they match 1-1 with
                # the input station list.
                self._dataset_ids = dataset_ids

            else:
                logger.warning(
                    "Neither stations nor region approach were used in function dataset_ids."
                )

            # update number of variables
            if self.variables:
                self.num_variables = len(self.variables)

        return self._dataset_ids

    def meta_by_dataset(self, dataset_id):
        """Return the catalog metadata for a single dataset_id."""
        info_url = self.e.get_info_url(response="csv", dataset_id=dataset_id)
        try:
            info = pd.read_csv(info_url)
        except Exception as e:
            logger.exception(e)
            logger.warning(f"Could not read info from {info_url}")
            return {dataset_id: []}

        items = []

        for col in self.columns:
            try:
                item = info[info["Attribute Name"] == col]["Value"].values[0]
                dtype = info[info["Attribute Name"] ==
                             col]["Data Type"].values[0]
            except Exception:
                # treat a missing attribute as a string so the type
                # conversion below is skipped
                dtype = "String"
                if col == "featureType":
                    # this column is not present in HF Radar metadata but
                    # want it to map to data_type, so input 'grid' in that
                    # case.
                    item = "grid"
                else:
                    item = "NA"

            if dtype == "String":
                pass
            elif dtype == "double":
                item = float(item)
            elif dtype == "int":
                item = int(item)
            items.append(item)

        # include download link
        self.e.dataset_id = dataset_id
        if self.e.protocol == "tabledap":
            # set the same time constraints as before
            self.e.constraints = {
                "time<=": self.kw["max_time"],
                "time>=": self.kw["min_time"],
            }
            if self.filetype == "csv":
                download_url = self.e.get_download_url(response="csvp")
            elif self.filetype == "netcdf":
                download_url = self.e.get_download_url(response="ncCf")

        elif self.e.protocol == "griddap":
            # the search terms that can be input for tabledap do not work
            # for griddap in erddapy currently. Instead, put together an
            # opendap link and then narrow the dataset with xarray.
            download_url = self.e.get_download_url(response="opendap")

        # check if "prediction" is present in metadata, esp in case of NOAA
        # model predictions
        is_prediction = "Prediction" in " ".join(
            list(info["Value"].replace(np.nan, None).values))

        # add erddap server name
        return {
            dataset_id:
            [self.e.server, download_url, info_url, is_prediction] + items +
            [self.variables]
        }

    @property
    def meta(self):
        """Rearrange the individual metadata into a dataframe.

        Notes
        -----
        This should exclude duplicate entries.
        """
        if not hasattr(self, "_meta"):

            if self.parallel:
                # get metadata for datasets
                # run in parallel to save time
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.meta_by_dataset)(dataset_id)
                    for dataset_id in self.dataset_ids)

            else:
                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.meta_by_dataset(dataset_id))

            # make dict from individual dicts
            from collections import ChainMap

            meta = dict(ChainMap(*downloads))

            # Make dataframe of metadata
            # variable names are the column names for the dataframe
            self._meta = pd.DataFrame.from_dict(
                meta,
                orient="index",
                columns=["database", "download_url", "info_url",
                         "is_prediction"] + self.columns +
                ["variable names"],
            )

        return self._meta

    def data_by_dataset(self, dataset_id):
        """Return the data for a single dataset_id.

        Returns
        -------
        The data for dataset_id: a pandas DataFrame when `filetype` is
        "csv", or an xarray Dataset when `filetype` is "netcdf".

        Notes
        -----
        Data is read into memory.
        """
        if self.filetype == "csv":
            try:
                # fetch metadata if not already present
                # found download_url from metadata and use
                self.e.dataset_id = dataset_id
                # dataset_vars gives a list of the variables in the dataset
                dataset_vars = (self.meta.loc[dataset_id]
                                ["defaultDataQuery"].split("&")[0].split(","))
                # vars_present gives the variables in self.variables
                # that are actually in the dataset
                vars_present = []
                for selfvariable in self.variables:
                    vp = [var for var in dataset_vars if var == selfvariable]
                    if len(vp) > 0:
                        vars_present.append(vp[0])

                # If any variables are not present, this doesn't work.
                if self.variables is not None:
                    self.e.variables = [
                        "time",
                        "longitude",
                        "latitude",
                        "station",
                    ] + vars_present
                dd = self.e.to_pandas(response="csvp",
                                      index_col=0,
                                      parse_dates=True)

                # Drop cols and rows that are only NaNs.
                dd = dd.dropna(axis="index",
                               how="all").dropna(axis="columns", how="all")

                if self.variables is not None:
                    # check to see if there is any actual data
                    # this is a bit convoluted because the column names are
                    # the variable names plus units so can't match 1 to 1.
                    # datacols is the number of columns that represent data
                    # instead of metadata
                    datacols = 0
                    for col in dd.columns:
                        datacols += [
                            varname in col for varname in self.variables
                        ].count(True)
                    # if no datacols, we can skip this one.
                    if datacols == 0:
                        dd = None

            except Exception as e:
                logger.exception(e)
                logger.warning("no data to be read in for %s" % dataset_id)
                dd = None

        elif self.filetype == "netcdf":
            if self.e.protocol == "tabledap":
                try:
                    # assume we don't need to narrow in space since these
                    # are time series (tabledap)
                    self.e.dataset_id = dataset_id
                    dd = self.e.to_xarray()
                    dd = dd.swap_dims({"obs": dd.cf["time"].name})
                    dd = dd.sortby(dd.cf["time"], ascending=True)
                    dd = dd.cf.sel(
                        T=slice(self.kw["min_time"], self.kw["max_time"]))

                    # use variable names to drop other variables
                    # (should I do this?)
                    if self.variables is not None:
                        var_list = set(dd.data_vars) - set(self.variables)
                        dd = dd.drop_vars(var_list)

                    # the lon/lat are on the 'timeseries' singleton
                    # dimension but the data_var variable was not, which
                    # messed up cf-xarray. When longitude and latitude are
                    # not on a dimension shared with a variable, the
                    # variable can't be called with cf-xarray. e.g.
                    # dd.cf['ssh'] won't work.
                    if "timeseries" in dd.dims:
                        for data_var in dd.data_vars:
                            if "timeseries" not in dd[data_var].dims:
                                dd[data_var] = dd[data_var].expand_dims(
                                    dim="timeseries", axis=1)

                except Exception as e:
                    logger.exception(e)
                    logger.warning("no data to be read in for %s" %
                                   dataset_id)
                    dd = None

            elif self.e.protocol == "griddap":
                try:
                    # this makes it read in the whole file which might be
                    # large
                    self.e.dataset_id = dataset_id
                    download_url = self.e.get_download_url(response="opendap")
                    dd = xr.open_dataset(download_url, chunks="auto").sel(
                        time=slice(self.kw["min_time"], self.kw["max_time"]))

                    if ("min_lat" in self.kw) and ("max_lat" in self.kw):
                        dd = dd.sel(latitude=slice(self.kw["min_lat"],
                                                   self.kw["max_lat"]))

                    if ("min_lon" in self.kw) and ("max_lon" in self.kw):
                        dd = dd.sel(longitude=slice(self.kw["min_lon"],
                                                    self.kw["max_lon"]))

                    # use variable names to drop other variables
                    # (should I do this?)
                    if self.variables is not None:
                        vars_list = set(dd.data_vars) - set(self.variables)
                        dd = dd.drop_vars(vars_list)

                except Exception as e:
                    logger.exception(e)
                    logger.warning("no data to be read in for %s" %
                                   dataset_id)
                    dd = None

        return dd

    def data(self, dataset_ids=None):
        """Read in data for some or all dataset_ids.

        NOT USED CURRENTLY

        Once data is read in for a dataset_ids, it is remembered.

        See full documentation in `utils.load_data()`.
        """
        output = odg.utils.load_data(self, dataset_ids)
        return output
from erddapy import ERDDAP
import pandas as pd

e = ERDDAP(server=url_glider)

# Search constraints
kw2018 = {
    'min_lon': lon_lim[0],
    'max_lon': lon_lim[1],
    'min_lat': lat_lim[0],
    'max_lat': lat_lim[1],
    'min_time': date_ini,
    'max_time': date_end,
}

search_url = e.get_search_url(response='csv', **kw2018)
search = pd.read_csv(search_url)

# Extract the IDs
gliders = search['Dataset ID'].values

#%%

dataset_id = gliders[0]
print(dataset_id)

# timeg,depthg_gridded,varg_gridded,timem,depthm,target_varm = \
#     glider_transect_model_com_erddap_server(url_glider,dataset_id,url_model,\
#                                             lat_lim,lon_lim,\
#                                             date_ini,date_end,var_glider,var_model,model_name,delta_z=0.4)
class NDBC():
    def __init__(self, station_id, deploy_id, WMO, currentTime, startTime,
                 data_map, name_map):
        self.station_id = station_id
        self.deploy_id = deploy_id
        self.WMO = WMO
        self.now = currentTime
        self.startTime = startTime
        self.data_map = data_map
        self.name_map = name_map

    def adjust_pressure_to_sea_level(self, pres, temp, height):
        """Adjust barometric pressure to sea-level."""
        temp = temp + 273.15
        slp = pres / np.exp(-height / (temp * 29.263))
        return slp

    def calculate_wind_speed(self, eastward, northward):
        """Calculate absolute wind speed from component wind vectors."""
        u = np.square(eastward)
        v = np.square(northward)
        wind_speed = np.sqrt(u + v)
        return wind_speed

    def calculate_wind_direction(self, eastward, northward):
        """Calculate met wind direction from component wind vectors."""
        u = eastward
        v = northward
        wind_direction = 180 / np.pi * np.arctan2(-u, -v)
        return wind_direction

    def _connect_erddap(self, server="http://ooivm1.whoi.net/erddap",
                        protocol="tabledap"):
        """Connect to the erddap server."""
        self._erddap = ERDDAP(server=server, protocol=protocol)

    def list_datasets(self):
        """Get the available datasets for the ERDDAP server."""
        # First, make the connection
        self._connect_erddap()

        # Next, get the datasets
        datasets = pd.read_csv(
            self._erddap.get_search_url(search_for=self.station_id,
                                        response='csv'))['Dataset ID']

        return datasets

    def get_dataset(self, dataset):
        """Get the data for specified datasets."""
        # First, have to re-establish the erddap connection
        self._connect_erddap()

        # Next, get the data for a dataset
        self._erddap.dataset_id = dataset

        # Only want the variables with standard names
        variables = self._erddap.get_var_by_attr(
            standard_name=lambda v: v is not None)
        self._erddap.variables = variables

        # Limit the data request to the current deployment
        self._erddap.constraints = {
            'deploy_id=': self.deploy_id,
            'time>=': self.startTime.strftime('%Y-%m-%dT%H:%M:%SZ')
        }

        try:
            # Download the data
            data = self._erddap.to_pandas(index_col='time (UTC)',
                                          parse_dates=True)

            # Sometimes it just returns an empty dataframe instead of an
            # error
            if data.size == 0:
                data = self._create_empty_dataset()
        except Exception:
            # If there is no available data in the requested time window,
            # need to create an empty dataframe of the data
            data = self._create_empty_dataset()

        # Return the dataset data
        return data

    def process_METBK_data(self, df, freq='10T'):
        """Process the METBK data into the correct format and values for
        NDBC."""
        # Resample the data
        df_binned = df.resample(freq).mean()

        # Check for barometric pressure
        if 'barometric_pressure (mbar)' in df_binned.columns:
            # Adjust the barometric pressure to sea-level
            df_binned['sea_level_pressure (hPa)'] = \
                self.adjust_pressure_to_sea_level(
                    df_binned['barometric_pressure (mbar)'],
                    df_binned['air_temperature (degree_Celsius)'], 4.05)
        else:
            df_binned['sea_level_pressure (hPa)'] = np.nan

        # Check that the wind vector components are in the dataframe
        if 'eastward_wind_velocity (m s-1)' in df_binned.columns:
            # Calculate the wind speed
            df_binned['wind speed (m/s)'] = self.calculate_wind_speed(
                df_binned['eastward_wind_velocity (m s-1)'],
                df_binned['northward_wind_velocity (m s-1)'])

            # Calculate the wind direction
            df_binned['wind direction'] = self.calculate_wind_direction(
                df_binned['eastward_wind_velocity (m s-1)'],
                df_binned['northward_wind_velocity (m s-1)'])
            df_binned['wind direction'] = df_binned["wind direction"].apply(
                lambda x: x + 360 if x < 0 else x)

            # Don't need cardinal direction -> want direction in degrees
        else:
            df_binned['wind speed (m/s)'] = np.nan
            df_binned['wind direction'] = np.nan

        # Return the processed data
        return df_binned

    def process_WAVSS_data(self, df, freq='10T'):
        """Much simpler function for processing the WAVSS data."""
        # Resample the data
        df_binned = df.resample(freq).mean()

        # Return the data
        return df_binned

    def _create_empty_dataset(self):
        """
        Create a dataset of all NaNs if there is no data available for the
        requested dataset in the given time period.
        """
        # Get the units for the corresponding variables
        info_url = self._erddap.get_info_url(
            dataset_id=self._erddap.dataset_id, response='csv')
        info = pd.read_csv(info_url)
        units = info[info['Attribute Name'] == 'units']

        # Now, add the units to the variable names
        columns = []
        for var in self._erddap.variables:
            unit = units[units['Variable Name'] == var]['Value'].values
            if len(unit) == 0:
                columns.append(f'{var}')
            elif var == 'time':
                pass
            else:
                columns.append(f'{var} ({unit[0]})')

        # Create an array of NaNs to fill out the empty dataframe
        empty_array = np.empty((2, len(columns)))
        empty_array[:] = np.nan

        # Put the empty array into a dataframe
        empty_df = pd.DataFrame(data=empty_array,
                                columns=columns,
                                index=[self.startTime, self.now])
        empty_df.index.name = 'time (UTC)'

        return empty_df

    def process_datasets(self, datasets):
        """Process the data for individual datasets."""
        self.datasets = datasets

        # Get the data for the individual datasets
        for dset in self.datasets.keys():
            self.datasets.update({dset: self.get_dataset(dset)})

        # Process the data
        for dset in self.datasets.keys():
            if 'METBK' in dset:
                self.datasets[dset] = self.process_METBK_data(
                    self.datasets[dset])
            else:
                self.datasets[dset] = self.process_WAVSS_data(
                    self.datasets[dset])

        # Add a header to the data in the datasets
        for key in self.datasets.keys():
            header = key.split('-', 2)[-1]
            for col in self.datasets.get(key).columns:
                self.datasets.get(key).rename(
                    columns={col: ' '.join((header, col))}, inplace=True)

    def parse_data_to_xml(self, data):
        """
        Take the 10-minute average buoy data, the station name, and the two
        dictionaries which map the buoy column names to the xml tags, and
        output an xml file in the NDBC format.

        Returns:
            xml - a properly constructed xml file in the NDBC format for
            the given buoy data
        """
        # Start the xml file
        xml = ['<?xml version="1.0" encoding="ISO-8859-1"?>']

        # Iterate through the data
        for index in data.index:

            # Get the data associated with a row in the dataframe
            row = data.loc[index]

            # Reset a dictionary of the data
            xml_data = {}
            for key in self.data_map.keys():
                xml_data.update({key: self.data_map.get(key)})

            # Parse the data into the data dictionary
            for key in xml_data.keys():
                # Get the column name which corresponds to the ndbc tag
                column = self.name_map.get(key)
                # Check that the column was returned from the ERDDAP server
                if column in row.index:
                    value = row[column]
                    # If a NaN, just leave it as the default -9999
                    if str(value) == 'nan':
                        pass
                    else:
                        xml_data[key] = value
                # If no data, leave it as default -9999
                else:
                    pass

            # Write the parsed data to the xml file
            # Start the message
            xml.append('<message>')

            # Add in the station id
            xml.append(f'  <station>{self.WMO}</station>')

            # Get the time index
            time = row.name.strftime('%m/%d/%Y %H:%M:%S')
            xml.append(f'  <date>{time}</date>')

            # Missing fill value
            missing = str(-9999)
            xml.append(f'  <missing>{missing}</missing>')

            # Roundtime
            xml.append('  <roundtime>no</roundtime>')

            # Start of the data
            xml.append('  <met>')

            # Add in each data piece
            for tag in xml_data.keys():
                # Get the value
                value = xml_data.get(tag)
                value = str(value)
                # Add the data to the xml file
                xml.append(f'    <{tag}>{value}</{tag}>')

            # Finish off the message
            xml.append('  </met>')
            xml.append('</message>')

        # Return the results
        return xml
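Since `parse_data_to_xml` returns a list of strings, writing the message file is a short final step; a sketch (the instance, input DataFrame, and file name below are hypothetical):

```python
xml_lines = ndbc.parse_data_to_xml(processed_df)
with open('ndbc_message.xml', 'w') as f:
    f.write('\n'.join(xml_lines))
```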
    'latitude>=': 37.0,
    'latitude<=': 43.43,
    'longitude>=': 317.56,
    'longitude<=': 322.87,
}

# Specify the variables (column names) to be retrieved.
e.variables = [
    'sample',
    'latitude',
    'longitude',
    'life_stage',
    'abundance',
    'time',
]

# Build the search URL and do the handshaking process with the server.
search_url = e.get_search_url(response='csv')

# Receive the requested data and save it into a dataframe.
search = pd.read_csv(search_url)
df = e.to_pandas()

# Get the current working directory and save the dataframe to a single CSV
# file in that path.
wd = os.getcwd()
df.to_csv(wd + '/DataFiles/plankton_swocecpr.csv')

#%%
"""
#########################################################################################################################
## Read the bio/non-bio datafile for plankton data, filter the records based
## on depth and total wet mass, and place a point on a map for each data
## point's lon and lat.
#########################################################################################################################
"""
import os
# # Exploring an ERDDAP server

# In[5]:

from erddapy import ERDDAP

e = ERDDAP(server='https://erddap-uncabled.oceanobservatories.org/uncabled/erddap')

# In[6]:

import pandas as pd

df = pd.read_csv(e.get_search_url(response='csv', search_for='all'))

# In[7]:

'We have {} tabledap, {} griddap, and {} wms endpoints.'.format(
    len(set(df['tabledap'].dropna())),
    len(set(df['griddap'].dropna())),
    len(set(df['wms'].dropna()))
)

# # ERDDAP Advanced Search

# Let's narrow the search area, time span, and look for *sea_water_temperature* only.
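# A sketch of the advanced search described above; the bounding box and time
# window below are illustrative values, not from the original notebook.

kw = {
    'standard_name': 'sea_water_temperature',
    'min_lon': -72.0,
    'max_lon': -69.0,
    'min_lat': 38.0,
    'max_lat': 41.0,
    'min_time': '2016-07-10T00:00:00Z',
    'max_time': '2017-02-10T00:00:00Z',
}

search_url = e.get_search_url(response='csv', **kw)
pd.read_csv(search_url)['Dataset ID']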
def active_gliders(bbox=None, time_start=None, time_end=None, glider_id=None):
    bbox = bbox or [-100, -40, 18, 60]
    # Evaluate the default end time at call time, not at definition time.
    time_end = time_end or dt.date.today()
    time_start = time_start or (time_end - dt.timedelta(days=1))
    t0 = time_start.strftime('%Y-%m-%dT%H:%M:%SZ')
    t1 = time_end.strftime('%Y-%m-%dT%H:%M:%SZ')

    e = ERDDAP(server='NGDAC')

    # Search constraints
    kw = dict()
    kw['min_time'] = t0
    kw['max_time'] = t1

    if bbox:
        kw['min_lon'] = bbox[0]
        kw['max_lon'] = bbox[1]
        kw['min_lat'] = bbox[2]
        kw['max_lat'] = bbox[3]

    if glider_id:
        search = glider_id
    else:
        search = None

    search_url = e.get_search_url(search_for=search, response='csv', **kw)

    try:
        # Grab the results
        search = pd.read_csv(search_url)
    except Exception:
        # return an empty dataframe if there are no results
        return pd.DataFrame()

    # Extract the IDs
    gliders = search['Dataset ID'].values

    msg = 'Found {} Glider Datasets:\n\n{}'.format
    print(msg(len(gliders), '\n'.join(gliders)))

    # Set the download constraints
    constraints = {
        'time>=': t0,
        'time<=': t1,
        'longitude>=': bbox[0],
        'longitude<=': bbox[1],
        'latitude>=': bbox[2],
        'latitude<=': bbox[3],
    }

    variables = [
        'depth',
        'latitude',
        'longitude',
        'time',
        'temperature',
        'salinity',
    ]

    e = ERDDAP(
        server='NGDAC',
        protocol='tabledap',
        response='nc'
    )

    glider_dfs = []

    for id in gliders:
        e.dataset_id = id
        e.constraints = constraints
        e.variables = variables

        # check that the data frame is not empty
        try:
            df = e.to_pandas(
                index_col='time (UTC)',
                parse_dates=True,
                skiprows=(1,)  # units information can be dropped.
            ).dropna()
        except Exception:
            continue

        df = df.reset_index()
        df['dataset_id'] = id
        df = df.set_index(['dataset_id', 'time (UTC)'])
        glider_dfs.append(df)

    try:
        ndf = pd.concat(glider_dfs)
    except ValueError:
        return pd.DataFrame()

    return ndf
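Two call sketches; the glider id and dates below are illustrative:

```python
# All active gliders over the past day in the default
# [-100, -40, 18, 60] bounding box.
df = active_gliders()

# Or restrict to a single (hypothetical) glider id over a custom window.
df = active_gliders(glider_id='ru29',
                    time_start=dt.date(2019, 1, 1),
                    time_end=dt.date(2019, 1, 8))
```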
class GdacClient(object): def __init__(self, erddap_url=None): self._logger = logging.getLogger(os.path.basename(__file__)) self._erddap_url = erddap_url or 'https://gliders.ioos.us/erddap' self._protocol = 'tabledap' self._response_type = 'csv' self._items_per_page = 1e10 self._page = 1 self._client = ERDDAP(server=self._erddap_url, protocol=self._protocol, response=self._response_type) self._last_request = None # DataFrame containing the results of ERDDAP advanced search (endpoints, etc.) self._datasets_info = pd.DataFrame() # DataFrame containing dataset_id, start/end dates, profile count, etc. self._datasets_summaries = pd.DataFrame() self._datasets_profiles = pd.DataFrame() self._datasets_days = pd.DataFrame() self._profiles_variables = [ 'time', 'latitude', 'longitude', 'profile_id', 'wmo_id' ] self._valid_search_kwargs = { 'institution', 'ioos_category', 'long_name', 'standard_name', 'variable_name', 'min_lon', 'min_lat', 'max_lon', 'max_lat', 'min_time', 'max_time' } self._months = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December' ] self._calendar_types = ['datasets', 'days', 'profiles'] @property def datasets_info(self): return self._datasets_info @property def datasets_summaries(self): return self._datasets_summaries @property def datasets_profiles(self): return self._datasets_profiles @property def datasets_days(self): return self._datasets_days @property def dataset_ids(self): if self._datasets_summaries.empty: self._logger.warning('No data sets found') return return list(self._datasets_info['dataset_id'].values) @property def gliders(self): if self._datasets_summaries.empty: self._logger.warning('No data sets found') return return list(self._datasets_summaries.glider.unique()) @property def profiles_per_yyyymmdd(self): return self._datasets_profiles.sum(axis=1) @property def profiles_per_year(self): return self._datasets_profiles.sum( axis=1).groupby(lambda x: x.year).sum() @property def glider_days_per_yyyymmdd(self): return self._datasets_days.sum(axis=1) @property def glider_days_per_year(self): return self._datasets_days.sum(axis=1).groupby(lambda x: x.year).sum() @property def deployments_per_yyyymmdd(self): return self._datasets_days.sum(axis=1) @property def deployments_per_year(self): return self._datasets_days.groupby(lambda x: x.year).any().sum(axis=1) @property def yearly_counts(self): columns = [ self.deployments_per_year, self.glider_days_per_year, self.profiles_per_year ] totals = pd.DataFrame(columns).transpose().astype('i') totals.columns = ['deployments', 'glider days', 'profiles'] totals.index.name = 'year' return totals @property def e(self): """erddapy.ERDDAP client""" return self._client @property def server(self): return self._client.server @property def response_type(self): return self._client.response @response_type.setter def response_type(self, response_type): self._client.response = response_type @property def last_request(self): return self._last_request def get_glider_datasets(self, glider): return self._datasets_summaries[self._datasets_summaries.glider == glider].reset_index().drop('index', axis=1) def get_deployments_calendar(self, year=None): if not year: return self._datasets_days.groupby( [lambda x: x.year, lambda x: x.month]).any().sum(axis=1).unstack() else: glider_days_by_yymmdd = self._datasets_days years = pd.to_datetime(glider_days_by_yymmdd.index).year.unique() if year not in years: self._logger.warning( 'No glider days found in year {:}'.format(year)) return 
pd.DataFrame()

            return glider_days_by_yymmdd[pd.to_datetime(
                glider_days_by_yymmdd.index).year == year].groupby(
                    [lambda x: x.month, lambda x: x.day]).any().sum(
                        axis=1).unstack()

    def get_glider_days_calendar(self, year=None):
        if not year:
            return self._datasets_days.sum(axis=1).groupby(
                [lambda x: x.year, lambda x: x.month]).sum().unstack()
        else:
            glider_days_by_yymmdd = self._datasets_days.sum(axis=1)
            years = pd.to_datetime(glider_days_by_yymmdd.index).year.unique()
            if year not in years:
                self._logger.warning(
                    'No glider days found in year {:}'.format(year))
                return pd.DataFrame()

            return glider_days_by_yymmdd[pd.to_datetime(
                glider_days_by_yymmdd.index).year == year].groupby(
                    [lambda x: x.month, lambda x: x.day]).sum().unstack()

    def get_profiles_calendar(self, year=None):
        if not year:
            return self._datasets_profiles.sum(axis=1).groupby(
                [lambda x: x.year, lambda x: x.month]).sum().unstack()
        else:
            profiles_by_yymmdd = self._datasets_profiles.sum(axis=1)
            years = pd.to_datetime(profiles_by_yymmdd.index).year.unique()
            if year not in years:
                self._logger.warning(
                    'No profiles found in year {:}'.format(year))
                return pd.DataFrame()

            return profiles_by_yymmdd[pd.to_datetime(
                profiles_by_yymmdd.index).year == year].groupby(
                    [lambda x: x.month, lambda x: x.day]).sum().unstack()

    def search_datasets(self, search_for=None, delayedmode=False, **kwargs):
        """Search the ERDDAP server for glider deployment datasets. Results
        are stored as pandas DataFrames in:

            self.deployments
            self.datasets

        Equivalent to ERDDAP's Advanced Search. Searches can be performed by
        free text, bounding box, time bounds, etc. See the erddapy
        documentation for valid kwargs."""
        url = self._client.get_search_url(search_for=search_for, **kwargs)
        self._last_request = url

        glider_regex = re.compile(r'^(.*)-\d{8}T\d{4}')
        try:
            self._datasets_info = pd.read_csv(url)
            # Drop the allDatasets row
            self._datasets_info.drop(self._datasets_info[
                self._datasets_info['Dataset ID'] == 'allDatasets'].index,
                                     inplace=True)
            # Reset the index to start at 0
            self._datasets_info.reset_index(inplace=True)
            # Drop the index, griddap and wms columns
            self._datasets_info.drop(['index', 'griddap', 'wms'],
                                     axis=1,
                                     inplace=True)
            # Rename the columns to friendlier names
            columns = {
                s: s.replace(' ', '_').lower()
                for s in self._datasets_info.columns
            }
            self._datasets_info.rename(columns=columns, inplace=True)

            if not delayedmode:
                self._datasets_info = self._datasets_info[
                    ~self._datasets_info.dataset_id.str.endswith('delayed')]

            # Iterate through each dataset (except for allDatasets) and grab
            # the info page
            datasets = []
            daily_profiles = []
            datasets_days = []
            for i, row in self._datasets_info.iterrows():

                if row['dataset_id'] == 'allDatasets':
                    continue

                if delayedmode and not row['dataset_id'].endswith('delayed'):
                    continue
                elif row['dataset_id'].endswith('delayed'):
                    continue

                self._logger.info('Fetching dataset: {:}'.format(
                    row['dataset_id']))

                # Get the data download url for erddap_vars
                try:
                    data_url = self._client.get_download_url(
                        dataset_id=row['dataset_id'],
                        variables=self._profiles_variables)
                except (ConnectionError, ConnectionRefusedError,
                        urllib3.exceptions.MaxRetryError) as e:
                    self._logger.error('{:} fetch failed: {:}'.format(
                        row['dataset_id'], e))
                    continue

                # Fetch the profiles into a pandas dataframe
                try:
                    profiles = pd.read_csv(data_url,
                                           skiprows=[1],
                                           index_col='time',
                                           parse_dates=True).sort_index()
                except HTTPError as e:
                    self._logger.error(
                        'Failed to fetch profiles: {:}'.format(e))
                    continue

                # Group profiles by yyyy-mm-dd and sum the number of
                # profiles per day
                s = profiles.profile_id.dropna().groupby(
                    lambda x: x.date).count()
                s.name = row['dataset_id']
                daily_profiles.append(s)

                # Create the deployment date range
                d_index = pd.date_range(s.index.min(), s.index.max())
                deployment_days = pd.Series([1 for x in d_index],
                                            index=d_index,
                                            name=row['dataset_id'])
                datasets_days.append(deployment_days)

                glider_match = glider_regex.match(row['dataset_id'])
                glider = glider_match.groups()[0]

                # First profile time
                dt0 = profiles.index.min()
                # Last profile time
                dt1 = profiles.index.max()
                # Deployment length in days
                days = ceil((dt1 - dt0).total_seconds() / 86400)

                dataset_summary = [
                    glider,
                    row['dataset_id'],
                    str(profiles.wmo_id.unique()[0]), dt0, dt1,
                    profiles.iloc[0]['latitude'],
                    profiles.iloc[0]['longitude'],
                    profiles.latitude.min(),
                    profiles.latitude.max(),
                    profiles.longitude.min(),
                    profiles.longitude.max(), profiles.shape[0], days
                ]
                datasets.append(dataset_summary)

            columns = [
                'glider', 'dataset_id', 'wmo_id', 'start_date', 'end_date',
                'deployment_lat', 'deployment_lon', 'lat_min', 'lat_max',
                'lon_min', 'lon_max', 'num_profiles', 'days'
            ]

            self._datasets_summaries = pd.DataFrame(datasets, columns=columns)

            # Create and store the DataFrame containing a 1 on each day the
            # glider was deployed, 0 otherwise
            self._datasets_days = pd.concat(datasets_days,
                                            axis=1).sort_index()

            # Create and store the DataFrame containing the number of
            # profiles on each day for each deployment
            self._datasets_profiles = pd.concat(daily_profiles,
                                                axis=1).sort_index()

        except HTTPError as e:
            self._logger.error(e)

        return

    def get_dataset_info(self, dataset_id):
        """Fetch the dataset metadata for the specified dataset_id"""

        if dataset_id not in self.dataset_ids:
            self._logger.error('Dataset id {:} not found in {:}'.format(
                dataset_id, self.__repr__()))
            return

        info = self._datasets_info[self._datasets_info.dataset_id ==
                                   dataset_id]
        info.reset_index(inplace=True)

        return info.drop('index', axis=1).transpose()

    def get_dataset_profiles(self, dataset_id):
        """Fetch all profiles (time, latitude, longitude, profile_id) for
        the specified dataset. Profiles are sorted by ascending time"""

        if dataset_id not in self.dataset_ids:
            self._logger.error('Dataset id {:} not found in {:}'.format(
                dataset_id, self.__repr__()))
            return

        url = self._client.get_download_url(
            dataset_id=dataset_id, variables=self._profiles_variables)

        return pd.read_csv(url,
                           parse_dates=True,
                           skiprows=[1],
                           index_col='time').sort_index()

    def get_dataset_time_coverage(self, dataset_id):
        """Get the time coverage and wmo id (if specified) for the
        specified dataset_id"""

        if dataset_id not in self.dataset_ids:
            self._logger.error('Dataset id {:} not found in {:}'.format(
                dataset_id, self.__repr__()))
            return

        return self._datasets_summaries[[
            'dataset_id', 'start_date', 'end_date', 'wmo_id'
        ]].iloc[self.dataset_ids.index(dataset_id)]

    def get_dataset_time_series(self, dataset_id, variables, min_time=None,
                                max_time=None):
        """Fetch the variables time-series for the specified dataset_id. A
        time window can be specified using min_time and max_time, which must
        be ISO-8601 formatted date strings (i.e.: 'YYYY-mm-ddTHH:MM')

        Parameters
        dataset_id: valid dataset id from self.datasets
        variables: list of one or more valid variables in the dataset

        Options
        min_time: minimum time value formatted as 'YYYY-mm-ddTHH:MM[:SS]'
        max_time: maximum time value formatted as 'YYYY-mm-ddTHH:MM[:SS]'
        """
        if dataset_id not in self.dataset_ids:
            self._logger.error('Dataset id {:} not found in {:}'.format(
                dataset_id, self.__repr__()))
            return

        if not isinstance(variables, list):
            variables = [variables]

        all_variables = ['precise_time', 'time', 'depth'] + variables
        variables = set(all_variables)

        constraints = {}
        if min_time:
            constraints['precise_time>='] = min_time
        if max_time:
            constraints['precise_time<='] = max_time

        # pd.read_csv does not accept percent-unencoded urls on data
        # requests, so percent escape special characters prior to sending
        # the data request.
        data_url = self.encode_url(
            self._client.get_download_url(dataset_id=dataset_id,
                                          variables=variables,
                                          constraints=constraints))

        return pd.read_csv(
            data_url, skiprows=[1],
            parse_dates=True).set_index('precise_time').sort_index()

    def plot_yearly_totals(self, totals_type=None, palette='Blues_d',
                           **kwargs):
        """Bar chart plot of deployments, glider days and profiles, grouped
        by year"""
        totals = self.yearly_counts.reset_index()

        if totals_type and totals_type not in totals.columns:
            self._logger.error(
                'Invalid category specified: {:}'.format(totals_type))
            return

        if not totals_type:
            fig, (ax1, ax2, ax3) = plt.subplots(3,
                                                1,
                                                figsize=(8.5, 11),
                                                sharex=True)

            sns.barplot(x='year', y='deployments', ax=ax1, data=totals,
                        palette=palette, **kwargs)
            sns.barplot(x='year', y='glider days', ax=ax2, data=totals,
                        palette=palette, **kwargs)
            sns.barplot(x='year', y='profiles', ax=ax3, data=totals,
                        palette=palette, **kwargs)

            ax2.set_xlabel('')
            ax1.set_xlabel('')
            ax1.set_title('U.S. IOOS Glider Data Assembly Center')

            return fig, ax1, ax2, ax3
        else:
            ax = sns.barplot(x='year', y=totals_type, data=totals,
                             palette=palette, **kwargs)
            ax.set_title('U.S. IOOS Glider Data Assembly Center')

            return ax.figure, ax

    def plot_datasets_calendar(self, calendar_type, year=None, cmap=None):
        """Heatmap of the specified calendar_type"""
        if calendar_type not in self._calendar_types:
            self._logger.error(
                'Invalid calendar type specified: {:}'.format(calendar_type))
            return

        if calendar_type == 'datasets':
            if not year:
                data = self.get_deployments_calendar()
                title = 'Active Real-Time Datasets'
            else:
                data = self.get_deployments_calendar(year)
                title = 'Active Real-Time Datasets: {:}'.format(year)
        elif calendar_type == 'days':
            if not year:
                data = self.get_glider_days_calendar()
                data.columns = self._months
                title = 'Glider In-Water Days'
            else:
                data = self.get_glider_days_calendar(year)
                title = 'Glider In-Water Days: {:}'.format(year)
        elif calendar_type == 'profiles':
            if not year:
                data = self.get_profiles_calendar()
                data.columns = self._months
                title = 'Real-Time Profiles'
            else:
                data = self.get_profiles_calendar(year)
                title = 'Real-Time Profiles: {:}'.format(year)
        else:
            self._logger.error(
                'Unknown calendar type: {:}'.format(calendar_type))
            return

        if data.empty:
            self._logger.warning('No results found')
            return

        if year:
            data.index = self._months
            plt.figure(figsize=(8.5, 4.))
            cb = True
            annotate = False
        else:
            data.columns = self._months
            plt.figure(figsize=(8.5, 8.5))
            cb = False
            annotate = True

        if cmap:
            ax = sns.heatmap(data,
                             annot=annotate,
                             fmt='.0f',
                             square=True,
                             cbar=cb,
                             linewidths=0.5,
                             cmap=cmap)
        else:
            ax = sns.heatmap(data,
                             annot=annotate,
                             fmt='.0f',
                             square=True,
                             cbar=cb,
                             linewidths=0.5)

        ax.invert_yaxis()
        _ = [ytick.set_rotation(0) for ytick in ax.get_yticklabels()]
        ax.set_title(title)

        return ax

    def plot_dataset_profiles_calendar(self, dataset_id, **heatmap_kwargs):
        """Plot the heatmap profiles/day calendar for the specified
        dataset"""
        if dataset_id not in self.dataset_ids:
            self._logger.error('Dataset id {:} not found in {:}'.format(
                dataset_id, self.__repr__()))
            return

        profiles = self.get_dataset_profiles(dataset_id)
        if profiles.empty:
            self._logger.warning(
                'No profiles found for dataset: {:}'.format(dataset_id))
            return

        pgroup = profiles.latitude.groupby(
            [lambda x: x.year, lambda x: x.month, lambda x: x.day]).count()
        calendar = pgroup.unstack()

        annotate = True
        square = True
        cbar = False
        # annot_kws = {'fontsize': 10}  # dead assignment in the original;
        # the empty dict below was the effective value
        annot_kws = {}

        fig = plt.figure(figsize=(11, 8.5))

        ax = sns.heatmap(calendar,
                         annot=annotate,
                         fmt='.0f',
                         square=square,
                         cbar=cbar,
                         linewidths=0.5,
                         annot_kws=annot_kws)

        # Format default y-tick labels to 'mmm YYYY'
        ylabels = [y.get_text() for y in ax.get_yticklabels()]
        new_ylabels = []
        for ylabel in ylabels:
            y, m = ylabel.split('-')
            new_ylabels.append('{:} {:}'.format(
                self._months[int(m) - 1][0:3], y))
        ax.set_yticklabels(new_ylabels)
        ax.set_ylabel('')
        ax.invert_yaxis()
        _ = [ytick.set_rotation(0) for ytick in ax.get_yticklabels()]
        ax.set_title('Profiles: {:}'.format(dataset_id))

        return ax

    @staticmethod
    def encode_url(data_url):
        """Percent encode special url characters."""
        url_pieces = list(urlsplit(data_url))
        url_pieces[3] = quote(url_pieces[3])

        return urlunsplit(url_pieces)

    def __repr__(self):
        return "<GdacClient(server='{:}', response='{:}', num_datasets={:})>".format(
            self._client.server, self._client.response,
            len(self._datasets_info))
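
# ---------------------------------------------------------------------------
# Minimal usage sketch for the client above. Hedged assumptions: the class is
# named GdacClient (per its __repr__), its constructor accepts an ERDDAP
# server URL, and search_datasets() passes its kwargs straight through to
# erddapy's get_search_url. The server URL below is a placeholder for the
# IOOS glider DAC; neither it nor the constructor wiring is shown in the
# excerpt above.
#
# client = GdacClient('https://gliders.ioos.us/erddap')  # placeholder URL
# client.search_datasets(search_for='glider',
#                        min_time='2019-01-01T00:00:00Z',
#                        max_time='2019-12-31T23:59:59Z')
# ax = client.plot_datasets_calendar('profiles', year=2019)
# ---------------------------------------------------------------------------
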
class ErddapReader:

    def __init__(self, known_server='ioos', protocol=None, server=None,
                 parallel=True):

        # # run checks for KW
        # self.kw = kw
        # NOTE: self.kw, self.approach, self.variables, and self._stations
        # are expected to be set by the caller before dataset_ids/meta/data
        # are accessed (the kw handling above is commented out).
        self.parallel = parallel

        # either select a known server or input protocol and server string
        if known_server == 'ioos':
            protocol = 'tabledap'
            server = 'http://erddap.sensors.ioos.us/erddap'
        elif known_server == 'coastwatch':
            protocol = 'griddap'
            server = 'http://coastwatch.pfeg.noaa.gov/erddap'
        elif known_server is not None:
            statement = ('either select a known server or input protocol '
                         'and server string')
            assert (protocol is not None) & (server is not None), statement
        else:
            # str.strip() removes characters, not substrings, so the
            # original strip('/erddap').strip('http://') could mangle server
            # names; removeprefix/removesuffix (Python 3.9+) is safe.
            known_server = (server.removesuffix('/erddap')
                                  .removeprefix('http://')
                                  .replace('.', '_'))
            statement = ('either select a known server or input protocol '
                         'and server string')
            assert (protocol is not None) & (server is not None), statement

        self.known_server = known_server
        self.e = ERDDAP(server=server)
        self.e.protocol = protocol
        self.e.server = server

        # columns for metadata
        self.columns = ['geospatial_lat_min', 'geospatial_lat_max',
                        'geospatial_lon_min', 'geospatial_lon_max',
                        'time_coverage_start', 'time_coverage_end',
                        'defaultDataQuery',
                        'subsetVariables',  # first works for timeseries sensors, 2nd for gliders
                        'keywords',  # for hf radar
                        'id', 'infoUrl', 'institution', 'featureType',
                        'source', 'sourceUrl']

        # name
        self.name = f'erddap_{known_server}'

        self.reader = 'ErddapReader'

        # # self.data_type = data_type
        # self.standard_names = standard_names
        # # DOESN'T CURRENTLY LIMIT WHICH VARIABLES WILL BE FOUND ON EACH SERVER

    @property
    def dataset_ids(self):
        '''Find dataset_ids for server.'''

        if not hasattr(self, '_dataset_ids'):

            # This should be a region search
            if self.approach == 'region':

                # find all the dataset ids which we will use to get the data
                # This limits the search to our keyword arguments in kw,
                # which should have min/max lon/lat/time values
                dataset_ids = []
                if self.variables is not None:
                    for variable in self.variables:

                        # find and save all dataset_ids associated with
                        # this variable
                        search_url = self.e.get_search_url(
                            response="csv", **self.kw,
                            variableName=variable,
                            items_per_page=10000)

                        try:
                            search = pd.read_csv(search_url)
                            dataset_ids.extend(search["Dataset ID"])
                        except Exception as e:
                            logger_erd.exception(e)
                            logger_erd.warning(f"variable {variable} was not found in the search")
                            logger_erd.warning(f'search_url: {search_url}')

                else:

                    # find and save all dataset_ids in the region
                    search_url = self.e.get_search_url(response="csv",
                                                       **self.kw,
                                                       items_per_page=10000)

                    try:
                        search = pd.read_csv(search_url)
                        dataset_ids.extend(search["Dataset ID"])
                    except Exception as e:
                        logger_erd.exception(e)
                        logger_erd.warning("nothing found in the search")
                        logger_erd.warning(f'search_url: {search_url}')

                # only need a dataset id once since we will check them each
                # for all standard_names
                self._dataset_ids = list(set(dataset_ids))

            # This should be a search for the station names
            elif self.approach == 'stations':
                # elif self._stations is not None:

                # search by station name for each of the stations
                dataset_ids = []
                for station in self._stations:
                    # if station has more than one word, AND will be put
                    # between to search for multiple terms together
                    url = self.e.get_search_url(response="csv",
                                                items_per_page=5,
                                                search_for=station)

                    try:
                        df = pd.read_csv(url)
                    except Exception as e:
                        logger_erd.exception(e)
                        logger_erd.warning(f'search url {url} did not work for station {station}.')
                        continue

                    # first try for an exact station match
                    try:
                        dataset_id = [
                            dataset_id for dataset_id in df['Dataset ID']
                            if station.lower() in dataset_id.lower().split('_')
                        ][0]

                    # if that doesn't work, try a more general match and
                    # just take the first returned option
                    except Exception as e:
                        logger_erd.exception(e)
                        logger_erd.warning('When searching for a dataset id to match station name %s, the first attempt to match the id did not work.' % (station))
                        dataset_id = df.iloc[0]['Dataset ID']

                        # if 'tabs' in org_id:  # don't split
                        #     axiom_id = [axiom_id for axiom_id in df['Dataset ID']
                        #                 if org_id.lower() == axiom_id.lower()]
                        # else:
                        #     axiom_id = [axiom_id for axiom_id in df['Dataset ID']
                        #                 if org_id.lower() in axiom_id.lower().split('_')][0]
                        # except:
                        #     dataset_id = None

                    dataset_ids.append(dataset_id)

                self._dataset_ids = list(set(dataset_ids))

            else:
                logger_erd.warning('Neither the stations nor the region approach was used in function dataset_ids.')

        return self._dataset_ids

    def meta_by_dataset(self, dataset_id):

        info_url = self.e.get_info_url(response="csv", dataset_id=dataset_id)
        info = pd.read_csv(info_url)

        items = []

        for col in self.columns:

            try:
                item = info[info['Attribute Name'] == col]['Value'].values[0]
                dtype = info[info['Attribute Name'] == col]['Data Type'].values[0]
            except:
                if col == 'featureType':
                    # this column is not present in HF Radar metadata but we
                    # want it to map to data_type, so input 'grid' in that
                    # case.
                    item = 'grid'
                else:
                    item = 'NA'
                # fall back so dtype is always defined (the original left it
                # unset on this path, risking a NameError or a stale value)
                dtype = 'String'

            if dtype == 'String':
                pass
            elif dtype == 'double':
                item = float(item)
            elif dtype == 'int':
                item = int(item)

            items.append(item)

        # if self.standard_names is not None:
        #     # In case the variable is named differently from the standard
        #     # names, we back out the variable names here for each dataset.
        #     # This also only returns those names for which there is data
        #     # in the dataset.
        #     varnames = self.e.get_var_by_attr(
        #         dataset_id=dataset_id,
        #         standard_name=lambda v: v in self.standard_names
        #     )
        # else:
        #     varnames = None

        ## include download link ##
        self.e.dataset_id = dataset_id
        if self.e.protocol == 'tabledap':
            if self.variables is not None:
                self.e.variables = ["time", "longitude", "latitude",
                                    "station"] + self.variables
            # set the same time constraints as before
            self.e.constraints = {'time<=': self.kw['max_time'],
                                  'time>=': self.kw['min_time'], }
            download_url = self.e.get_download_url(response='csvp')

        elif self.e.protocol == 'griddap':
            # the search terms that can be input for tabledap do not work
            # for griddap in erddapy currently. Instead, put together an
            # opendap link and then narrow the dataset with xarray.
            # get opendap link
            download_url = self.e.get_download_url(response='opendap')

        # add erddap server name
        return {dataset_id: [self.e.server, download_url] + items
                + [self.variables]}

    @property
    def meta(self):

        if not hasattr(self, '_meta'):

            if self.parallel:
                # get metadata for datasets
                # run in parallel to save time
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.meta_by_dataset)(dataset_id)
                    for dataset_id in self.dataset_ids
                )
            else:
                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.meta_by_dataset(dataset_id))

            # make one dict from the individual dicts
            from collections import ChainMap
            meta = dict(ChainMap(*downloads))

            # Make dataframe of metadata
            # variable names are the column names for the dataframe
            self._meta = pd.DataFrame.from_dict(
                meta, orient='index',
                columns=['database', 'download_url']
                        + self.columns + ['variable names'])

        return self._meta

    def data_by_dataset(self, dataset_id):

        download_url = self.meta.loc[dataset_id, 'download_url']
        # data variables in ds that are not the variables we searched for
        # varnames = self.meta.loc[dataset_id, 'variable names']

        if self.e.protocol == 'tabledap':

            try:
                # download_url was found from the metadata; use it here
                dd = pd.read_csv(download_url, index_col=0, parse_dates=True)

                # Drop cols and rows that are only NaNs.
                dd = dd.dropna(axis='index', how='all').dropna(
                    axis='columns', how='all')

                if self.variables is not None:
                    # check to see if there is any actual data
                    # this is a bit convoluted because the column names are
                    # the variable names plus units, so they can't be
                    # matched 1 to 1.
                    datacols = 0  # number of columns that represent data instead of metadata
                    for col in dd.columns:
                        datacols += [varname in col for varname in self.variables].count(True)
                    # if no datacols, we can skip this one.
                    if datacols == 0:
                        dd = None

            except Exception as e:
                logger_erd.exception(e)
                logger_erd.warning('no data to be read in for %s' % dataset_id)
                dd = None

        elif self.e.protocol == 'griddap':

            try:
                dd = xr.open_dataset(download_url, chunks='auto').sel(
                    time=slice(self.kw['min_time'], self.kw['max_time']))

                if ('min_lat' in self.kw) and ('max_lat' in self.kw):
                    dd = dd.sel(latitude=slice(self.kw['min_lat'],
                                               self.kw['max_lat']))

                if ('min_lon' in self.kw) and ('max_lon' in self.kw):
                    dd = dd.sel(longitude=slice(self.kw['min_lon'],
                                                self.kw['max_lon']))

                # use variable names to drop other variables
                # (should I do this?)
                if self.variables is not None:
                    l = set(dd.data_vars) - set(self.variables)
                    dd = dd.drop_vars(l)

            except Exception as e:
                logger_erd.exception(e)
                logger_erd.warning('no data to be read in for %s' % dataset_id)
                dd = None

        return (dataset_id, dd)

    @property
    def data(self):

        if not hasattr(self, '_data'):

            if self.parallel:
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.data_by_dataset)(dataset_id)
                    for dataset_id in self.dataset_ids
                )
            else:
                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.data_by_dataset(dataset_id))

            # if downloads is not None:
            dds = {dataset_id: dd for (dataset_id, dd) in downloads}
            # else:
            #     dds = None

            self._data = dds

        return self._data

    def count(self, url):
        try:
            return len(pd.read_csv(url))
        except:
            return np.nan

    def all_variables(self):
        '''Return a list of all possible variables.'''

        file_name_counts = f'erddap_variable_list_{self.known_server}.csv'

        if os.path.exists(file_name_counts):
            return pd.read_csv(file_name_counts, index_col='variable')
        else:
            # This took 10 min running in parallel for ioos,
            # 2 min for coastwatch
            url = f'{self.e.server}/categorize/variableName/index.csv?page=1&itemsPerPage=100000'
            df = pd.read_csv(url)
            # counts = []
            # for url in df.URL:
            #     counts.append(self.count(url))
            num_cores = multiprocessing.cpu_count()
            counts = Parallel(n_jobs=num_cores)(
                delayed(self.count)(url) for url in df.URL
            )
            dfnew = pd.DataFrame()
            dfnew['variable'] = df['Category']
            dfnew['count'] = counts
            dfnew = dfnew.set_index('variable')

            # remove nans
            if (dfnew.isnull().sum() > 0).values:
                dfnew = dfnew[~dfnew.isnull().values].astype(int)

            dfnew.to_csv(file_name_counts)

            return dfnew

    def search_variables(self, variables):
        '''Find valid variable names to use.

        Call with `search_variables()` to return the list of possible names.
        Call with `search_variables('salinity')` to return relevant names.
        '''

        if not isinstance(variables, list):
            variables = [variables]

        # set up case-insensitive search for the input variables
        search = f"(?i)"
        for variable in variables:
            search += f".*{variable}|"
        search = search.strip('|')

        r = re.compile(search)

        # just get the variable names
        df = self.all_variables()
        parameters = df.index

        matches = list(filter(r.match, parameters))

        # return parameters that match input variable strings
        return df.loc[matches].sort_values('count', ascending=False)

    def check_variables(self, variables, verbose=False):

        if not isinstance(variables, list):
            variables = [variables]

        # parameters = list(self.all_variables().keys())
        parameters = list(self.all_variables().index)

        # for a variable to exactly match a parameter,
        # this should equal 1
        count = []
        for variable in variables:
            count += [parameters.count(variable)]

        condition = np.allclose(count, 1)

        assertion = (f'The input variables are not exact matches to ok '
                     f'variables for known_server {self.known_server}.'
                     f'\nCheck all parameter group values with '
                     f'`ErddapReader().all_variables()`'
                     f'\nor search parameter group values with '
                     f'`ErddapReader().search_variables({variables})`.'
                     f'\n\nTry some of the following variables:\n'
                     f'{str(self.search_variables(variables))}')
        # assertion += f'\nor run `ErddapReader().check_variables("{variables}")`'
        assert condition, assertion

        if condition and verbose:
            print('all variables are matches!')
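
# ---------------------------------------------------------------------------
# Usage sketch for ErddapReader. Hedged: the excerpt's __init__ comments out
# the kw handling, so self.approach, self.kw, and self.variables are assumed
# to be assigned by the caller (or by code omitted here) before
# dataset_ids/meta/data are accessed. The variable name below is a plausible
# placeholder, not one confirmed by the source.
#
# reader = ErddapReader(known_server='ioos', parallel=False)
# reader.approach = 'region'
# reader.kw = {'min_lon': -75.0, 'max_lon': -72.0,
#              'min_lat': 38.0, 'max_lat': 40.0,
#              'min_time': '2019-09-14T00:00:00Z',
#              'max_time': '2019-09-15T00:00:00Z'}
# reader.variables = ['sea_water_temperature']  # hypothetical variable name
# print(reader.dataset_ids)
# data = reader.data  # dict of dataset_id -> DataFrame/Dataset (or None)
# ---------------------------------------------------------------------------
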
#%%
#tend = datetime(2019, 7, 27, 0, 0)
#tini = datetime(2019, 7, 28, 0, 0)
tini = datetime(2019, 9, 14, 0, 0)
tend = datetime(2019, 9, 15, 0, 0)

#%% Look for datasets in IOOS glider dac
print('Looking for glider data sets')
e = ERDDAP(server=url_glider)

# Grab every dataset available
datasets = pd.read_csv(e.get_search_url(response='csv', search_for='all'))

# Search constraints
kw = {
    'min_lon': lon_lim[0],
    'max_lon': lon_lim[1],
    'min_lat': lat_lim[0],
    'max_lat': lat_lim[1],
    'min_time': tini.strftime('%Y-%m-%dT%H:%M:%SZ'),
    'max_time': tend.strftime('%Y-%m-%dT%H:%M:%SZ'),
}

search_url = e.get_search_url(response='csv', **kw)
#print(search_url)

# Grab the results
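# A plausible continuation of this snippet (hedged: the original breaks off
# at "Grab the results"; this follows the standard erddapy search pattern
# used elsewhere in this document):
#
# search = pd.read_csv(search_url)
# gliders = search['Dataset ID'].values
# print('Found {} glider datasets'.format(len(gliders)))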
def GOFS_RTOFS_vs_Argo_floats(lon_forec_track, lat_forec_track,
                              lon_forec_cone, lat_forec_cone, lon_best_track,
                              lat_best_track, lon_lim, lat_lim, folder_fig):
    #%% User input

    # GOFS 3.1 output model location
    url_GOFS_ts = 'http://tds.hycom.org/thredds/dodsC/GLBy0.08/expt_93.0/ts3z'

    # RTOFS files
    folder_RTOFS = '/home/coolgroup/RTOFS/forecasts/domains/hurricanes/RTOFS_6hourly_North_Atlantic/'

    nc_files_RTOFS = ['rtofs_glo_3dz_f006_6hrly_hvr_US_east.nc',
                      'rtofs_glo_3dz_f012_6hrly_hvr_US_east.nc',
                      'rtofs_glo_3dz_f018_6hrly_hvr_US_east.nc',
                      'rtofs_glo_3dz_f024_6hrly_hvr_US_east.nc']

    # COPERNICUS MARINE ENVIRONMENT MONITORING SERVICE (CMEMS)
    url_cmems = 'http://nrt.cmems-du.eu/motu-web/Motu'
    service_id = 'GLOBAL_ANALYSIS_FORECAST_PHY_001_024-TDS'
    product_id = 'global-analysis-forecast-phy-001-024'
    depth_min = '0.493'
    out_dir = '/home/aristizabal/crontab_jobs'

    # Bathymetry file
    #bath_file = '/Users/aristizabal/Desktop/MARACOOS_project/Maria_scripts/nc_files/GEBCO_2014_2D_-100.0_0.0_-60.0_45.0.nc'
    bath_file = '/home/aristizabal/bathymetry_files/GEBCO_2014_2D_-100.0_0.0_-10.0_50.0.nc'

    # Argo floats
    url_Argo = 'http://www.ifremer.fr/erddap'

    #%%
    from matplotlib import pyplot as plt
    import numpy as np
    import xarray as xr
    import netCDF4
    from datetime import datetime, timedelta
    import cmocean
    import matplotlib.dates as mdates
    from erddapy import ERDDAP
    import pandas as pd
    import os

    # Do not produce figures on screen
    plt.switch_backend('agg')

    # Increase fontsize of labels globally
    plt.rc('xtick', labelsize=14)
    plt.rc('ytick', labelsize=14)
    plt.rc('legend', fontsize=14)

    #%% Reading bathymetry data
    ncbath = xr.open_dataset(bath_file)
    bath_lat = ncbath.variables['lat'][:]
    bath_lon = ncbath.variables['lon'][:]
    bath_elev = ncbath.variables['elevation'][:]

    oklatbath = np.logical_and(bath_lat >= lat_lim[0],
                               bath_lat <= lat_lim[-1])
    oklonbath = np.logical_and(bath_lon >= lon_lim[0],
                               bath_lon <= lon_lim[-1])

    bath_latsub = bath_lat[oklatbath]
    bath_lonsub = bath_lon[oklonbath]
    bath_elevs = bath_elev[oklatbath, :]
    bath_elevsub = bath_elevs[:, oklonbath]

    #%% Get time bounds for the current day
    #ti = datetime.today()
    ti = datetime.today() - timedelta(1) - timedelta(hours=6)
    tini = datetime(ti.year, ti.month, ti.day)
    te = ti + timedelta(2)
    tend = datetime(te.year, te.month, te.day)

    #%% Look for Argo datasets
    e = ERDDAP(server=url_Argo)

    # Grab every dataset available
    #datasets = pd.read_csv(e.get_search_url(response='csv', search_for='all'))

    kw = {
        'min_lon': lon_lim[0],
        'max_lon': lon_lim[1],
        'min_lat': lat_lim[0],
        'max_lat': lat_lim[1],
        'min_time': str(tini),
        'max_time': str(tend),
    }

    search_url = e.get_search_url(response='csv', **kw)

    # Grab the results
    search = pd.read_csv(search_url)

    # Extract the IDs
    dataset = search['Dataset ID'].values

    msg = 'Found {} Datasets:\n\n{}'.format
    print(msg(len(dataset), '\n'.join(dataset)))

    dataset_type = dataset[0]

    constraints = {
        'time>=': str(tini),
        'time<=': str(tend),
        'latitude>=': lat_lim[0],
        'latitude<=': lat_lim[1],
        'longitude>=': lon_lim[0],
        'longitude<=': lon_lim[1],
    }

    variables = [
        'platform_number',
        'time',
        'pres',
        'longitude',
        'latitude',
        'temp',
        'psal',
    ]

    e = ERDDAP(server=url_Argo, protocol='tabledap', response='nc')

    e.dataset_id = dataset_type
    e.constraints = constraints
    e.variables = variables

    print(e.get_download_url())

    df = e.to_pandas(
        parse_dates=True,
        skiprows=(1, )  # units information can be dropped.
    ).dropna()

    argo_ids = np.asarray(df['platform_number'])
    argo_times = np.asarray(df['time (UTC)'])
    argo_press = np.asarray(df['pres (decibar)'])
    argo_lons = np.asarray(df['longitude (degrees_east)'])
    argo_lats = np.asarray(df['latitude (degrees_north)'])
    argo_temps = np.asarray(df['temp (degree_Celsius)'])
    argo_salts = np.asarray(df['psal (PSU)'])

    #%% GOFS 3.1
    try:
        GOFS_ts = xr.open_dataset(url_GOFS_ts, decode_times=False)

        lt_GOFS = np.asarray(GOFS_ts['lat'][:])
        ln_GOFS = np.asarray(GOFS_ts['lon'][:])
        tt = GOFS_ts['time']
        t_GOFS = netCDF4.num2date(tt[:], tt.units)
        depth_GOFS = np.asarray(GOFS_ts['depth'][:])
    except Exception as err:
        print(err)
        GOFS_ts = np.nan
        lt_GOFS = np.nan
        ln_GOFS = np.nan
        depth_GOFS = np.nan
        t_GOFS = ti

    #%% Map Argo floats
    lev = np.arange(-9000, 9100, 100)
    plt.figure()
    plt.contourf(bath_lonsub, bath_latsub, bath_elevsub, lev,
                 cmap=cmocean.cm.topo)
    plt.plot(lon_forec_track, lat_forec_track, '.-', color='gold')
    plt.plot(lon_forec_cone, lat_forec_cone, '.-b', markersize=1)
    plt.plot(lon_best_track, lat_best_track, 'or', markersize=3)

    argo_idd = np.unique(argo_ids)
    for i, id in enumerate(argo_idd):
        okind = np.where(argo_ids == id)[0]
        plt.plot(np.unique(argo_lons[okind]), np.unique(argo_lats[okind]),
                 's', color='darkorange', markersize=5, markeredgecolor='k')

    plt.title('Argo Floats ' + str(tini)[0:13] + '-' + str(tend)[0:13],
              fontsize=16)
    plt.axis('scaled')
    plt.xlim(lon_lim[0], lon_lim[1])
    plt.ylim(lat_lim[0], lat_lim[1])

    file = folder_fig + 'ARGO_lat_lon'
    #file = folder_fig + 'ARGO_lat_lon_' + str(np.unique(argo_times)[0])[0:10]
    plt.savefig(file, bbox_inches='tight', pad_inches=0.1)

    #%% Figure argo float vs GOFS and vs RTOFS
    argo_idd = np.unique(argo_ids)

    for i, id in enumerate(argo_idd):

        print(id)
        okind = np.where(argo_ids == id)[0]
        argo_time = np.asarray([
            datetime.strptime(t, '%Y-%m-%dT%H:%M:%SZ')
            for t in argo_times[okind]
        ])

        argo_lon = argo_lons[okind]
        argo_lat = argo_lats[okind]
        argo_pres = argo_press[okind]
        argo_temp = argo_temps[okind]
        argo_salt = argo_salts[okind]

        # GOFS
        print('Retrieving variables from GOFS')
        if isinstance(GOFS_ts, float):
            temp_GOFS = np.nan
            salt_GOFS = np.nan
        else:
            #oktt_GOFS = np.where(t_GOFS >= argo_time[0])[0][0]
            ttGOFS = np.asarray([
                datetime(t_GOFS[i].year, t_GOFS[i].month, t_GOFS[i].day,
                         t_GOFS[i].hour) for i in np.arange(len(t_GOFS))
            ])
            tstamp_GOFS = [
                mdates.date2num(ttGOFS[i]) for i in np.arange(len(ttGOFS))
            ]
            oktt_GOFS = np.unique(
                np.round(
                    np.interp(mdates.date2num(argo_time[0]), tstamp_GOFS,
                              np.arange(len(tstamp_GOFS)))).astype(int))[0]
            oklat_GOFS = np.where(lt_GOFS >= argo_lat[0])[0][0]
            oklon_GOFS = np.where(ln_GOFS >= argo_lon[0] + 360)[0][0]
            temp_GOFS = np.asarray(GOFS_ts['water_temp'][oktt_GOFS, :,
                                                         oklat_GOFS,
                                                         oklon_GOFS])
            salt_GOFS = np.asarray(GOFS_ts['salinity'][oktt_GOFS, :,
                                                       oklat_GOFS,
                                                       oklon_GOFS])

        # RTOFS
        # Time window
        year = int(argo_time[0].year)
        month = int(argo_time[0].month)
        day = int(argo_time[0].day)
        tini = datetime(year, month, day)
        tend = tini + timedelta(days=1)

        # Read RTOFS grid and time
        print('Retrieving coordinates from RTOFS')

        # Zero-padded folder name, e.g. 'rtofs.20190914' (equivalent to the
        # original nested if/else over month and day)
        fol = 'rtofs.' + tini.strftime('%Y%m%d')

        ncRTOFS = xr.open_dataset(folder_RTOFS + fol + '/' +
                                  nc_files_RTOFS[0])
        latRTOFS = np.asarray(ncRTOFS.Latitude[:])
        lonRTOFS = np.asarray(ncRTOFS.Longitude[:])
        depth_RTOFS = np.asarray(ncRTOFS.Depth[:])

        tRTOFS = []
        for t in np.arange(len(nc_files_RTOFS)):
            ncRTOFS = xr.open_dataset(folder_RTOFS + fol + '/' +
                                      nc_files_RTOFS[t])
            tRTOFS.append(np.asarray(ncRTOFS.MT[:])[0])

        tRTOFS = np.asarray([
            mdates.num2date(mdates.date2num(tRTOFS[t]))
            for t in np.arange(len(nc_files_RTOFS))
        ])

        oktt_RTOFS = np.where(
            mdates.date2num(tRTOFS) >= mdates.date2num(argo_time[0]))[0][0]
        oklat_RTOFS = np.where(latRTOFS[:, 0] >= argo_lat[0])[0][0]
        oklon_RTOFS = np.where(lonRTOFS[0, :] >= argo_lon[0])[0][0]

        nc_file = folder_RTOFS + fol + '/' + nc_files_RTOFS[oktt_RTOFS]
        ncRTOFS = xr.open_dataset(nc_file)
        #time_RTOFS = tRTOFS[oktt_RTOFS]
        temp_RTOFS = np.asarray(
            ncRTOFS.variables['temperature'][0, :, oklat_RTOFS, oklon_RTOFS])
        salt_RTOFS = np.asarray(
            ncRTOFS.variables['salinity'][0, :, oklat_RTOFS, oklon_RTOFS])
        #lon_RTOFS = lonRTOFS[0, oklon_RTOFS]
        #lat_RTOFS = latRTOFS[oklat_RTOFS, 0]

        # Downloading and reading Copernicus output
        motuc = 'python -m motuclient --motu ' + url_cmems + \
                ' --service-id ' + service_id + \
                ' --product-id ' + product_id + \
                ' --longitude-min ' + str(argo_lon[0] - 2/12) + \
                ' --longitude-max ' + str(argo_lon[0] + 2/12) + \
                ' --latitude-min ' + str(argo_lat[0] - 2/12) + \
                ' --latitude-max ' + str(argo_lat[0] + 2/12) + \
                ' --date-min ' + '"' + str(tini - timedelta(0.5)) + '"' + \
                ' --date-max ' + '"' + str(tend + timedelta(0.5)) + '"' + \
                ' --depth-min ' + depth_min + \
                ' --depth-max ' + str(np.nanmax(argo_pres) + 1000) + \
                ' --variable ' + 'thetao' + ' ' + \
                ' --variable ' + 'so' + ' ' + \
                ' --out-dir ' + out_dir + \
                ' --out-name ' + str(id) + '.nc' + ' ' + \
                ' --user ' + 'maristizabalvar' + ' ' + \
                ' --pwd ' + 'MariaCMEMS2018'

        os.system(motuc)

        # Check if the file was downloaded
        COP_file = out_dir + '/' + str(id) + '.nc'
        resp = os.system('ls ' + COP_file)
        if resp == 0:
            COP = xr.open_dataset(COP_file)

            latCOP = np.asarray(COP.latitude[:])
            lonCOP = np.asarray(COP.longitude[:])
            depth_COP = np.asarray(COP.depth[:])
            tCOP = np.asarray(mdates.num2date(mdates.date2num(COP.time[:])))

            oktimeCOP = np.where(
                mdates.date2num(tCOP) >= mdates.date2num(tini))[0][0]
            oklonCOP = np.where(lonCOP >= argo_lon[0])[0][0]
            oklatCOP = np.where(latCOP >= argo_lat[0])[0][0]

            temp_COP = np.asarray(
                COP.variables['thetao'][oktimeCOP, :, oklatCOP, oklonCOP])
            salt_COP = np.asarray(
                COP.variables['so'][oktimeCOP, :, oklatCOP, oklonCOP])
        else:
            # Download failed: fall back to NaN profiles so the figures
            # below still render. (The original indexed COP unconditionally
            # here, which raised a NameError whenever the download failed.)
            depth_COP = np.array([np.nan])
            temp_COP = np.array([np.nan])
            salt_COP = np.array([np.nan])

        # Figure temp
        plt.figure(figsize=(5, 6))
        plt.plot(argo_temp, -argo_pres, '.-', linewidth=2,
                 label='ARGO Float id ' + str(id))
        plt.plot(temp_GOFS, -depth_GOFS, '.-', linewidth=2,
                 label='GOFS 3.1', color='red')
        plt.plot(temp_RTOFS, -depth_RTOFS, '.-', linewidth=2,
                 label='RTOFS', color='g')
        plt.plot(temp_COP, -depth_COP, '.-', linewidth=2,
                 label='Copernicus', color='darkorchid')
        plt.ylim([-1000, 0])
        plt.title('Temperature Profile on ' + str(argo_time[0])[0:13] +
                  '\n [lon,lat] = [' + str(np.round(argo_lon[0], 3)) + ',' +
                  str(np.round(argo_lat[0], 3)) + ']', fontsize=16)
        plt.ylabel('Depth (m)', fontsize=14)
        plt.xlabel('$^oC$', fontsize=14)
        plt.legend(loc='lower right', fontsize=14)

        file = folder_fig + 'ARGO_vs_GOFS_RTOFS_COP_temp_' + str(id)
        plt.savefig(file, bbox_inches='tight', pad_inches=0.1)

        # Figure salt
        plt.figure(figsize=(5, 6))
        plt.plot(argo_salt, -argo_pres, '.-', linewidth=2,
                 label='ARGO Float id ' + str(id))
        plt.plot(salt_GOFS, -depth_GOFS, '.-', linewidth=2,
                 label='GOFS 3.1', color='red')
        plt.plot(salt_RTOFS, -depth_RTOFS, '.-', linewidth=2,
                 label='RTOFS', color='g')
        plt.plot(salt_COP, -depth_COP, '.-', linewidth=2,
                 label='Copernicus', color='darkorchid')
        plt.ylim([-1000, 0])
        plt.title('Salinity Profile on ' + str(argo_time[0])[0:13] +
                  '\n [lon,lat] = [' + str(np.round(argo_lon[0], 3)) + ',' +
                  str(np.round(argo_lat[0], 3)) + ']', fontsize=16)
        plt.ylabel('Depth (m)', fontsize=14)
        plt.legend(loc='lower right', fontsize=14)

        file = folder_fig + 'ARGO_vs_GOFS_RTOFS_COP_salt_' + str(id)
        plt.savefig(file, bbox_inches='tight', pad_inches=0.1)
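
# ---------------------------------------------------------------------------
# Example invocation (hedged: the track/cone arrays below are hypothetical
# placeholders; in practice they presumably come from hurricane forecast
# products, and folder_fig must be a writable directory):
#
# GOFS_RTOFS_vs_Argo_floats(lon_forec_track=[-75.0, -74.0],
#                           lat_forec_track=[25.0, 26.0],
#                           lon_forec_cone=[-76.0, -73.0],
#                           lat_forec_cone=[24.0, 27.0],
#                           lon_best_track=[-74.5],
#                           lat_best_track=[25.5],
#                           lon_lim=[-80.0, -60.0],
#                           lat_lim=[15.0, 45.0],
#                           folder_fig='/tmp/figs/')
# ---------------------------------------------------------------------------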