Example #1
import pandas as pd
from erddapy import ERDDAP


def query(url, **kw):
    df = pd.DataFrame()

    # strip any trailing slash so the constructed URL does not contain '//':
    url = url.rstrip("/")
    e = ERDDAP(server=url, protocol='tabledap', response='csv')

    # submit the query:
    try:
        # this is redundant to ERDDAPY API query below:
        #r = requests.get(e.get_search_url(**kw), headers=headers)
        #r.raise_for_status()
        print("Testing ERDDAP {}".format(url))
        df = pd.read_csv(e.get_search_url(**kw))
        print("ERDDAP {} returned results from URL: {}".format(
            url, e.get_search_url(**kw)))
        df['server'] = url
        df.dropna(subset=['tabledap'], inplace=True)

        return df[[
            'server', 'Dataset ID', 'tabledap', 'Institution', 'Summary'
        ]]
    except Exception as ex:
        # can happen if the dataset does not have any features within the query window, just log it here:
        if type(ex).__name__ in ["HTTPError"]:
            print(ex)
            #raise
        pass
    return None
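
A minimal usage sketch for `query` (the keyword arguments mirror the ERDDAP search parameters accepted by `erddapy`; the server URL and bounding box below are only illustrative):

kw = {
    'min_lon': -75.0, 'max_lon': -72.0,
    'min_lat': 38.0, 'max_lat': 40.0,
    'min_time': '2018-08-02T00:00:00Z',
    'max_time': '2018-08-10T00:00:00Z',
}
results = query('https://data.ioos.us/gliders/erddap/', **kw)
if results is not None:
    print(results[['server', 'Dataset ID']].head())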
Example #2
class DatasetList:
    """Search servers for glider dataset ids. Defaults to the string "glider"


    Attributes:
        e: an ERDDAP server instance
        search_terms: A list of terms to search the server for. Multiple terms will be combined as AND

    """
    def __init__(self, server=_server):
        self.e = ERDDAP(
            server=server,
            protocol="tabledap",
        )

    @functools.lru_cache(maxsize=None)
    def _get_ids(self, search_terms):
        """Thin wrapper where inputs can be hashed for lru_cache."""
        dataset_ids = pd.Series(dtype=str)
        for term in search_terms:
            url = self.e.get_search_url(search_for=term, response="csv")

            # Series.append was removed in pandas 2.0; concat is the equivalent
            dataset_ids = pd.concat(
                [dataset_ids, pd.read_csv(url)["Dataset ID"]],
                ignore_index=True)
        self.dataset_ids = dataset_ids.str.split(";",
                                                 expand=True).stack().unique()

        return self.dataset_ids

    def get_ids(self, search_terms=["glider"]):
        """Search the database using a user supplied list of comma separated strings
        :return: Unique list of dataset ids
        """
        search_terms = tuple(search_terms)
        return self._get_ids(search_terms)
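
A brief usage sketch (assuming `functools`, `pandas`, and `erddapy` are imported at module level; the server address below is the IOOS Glider DAC and is only illustrative):

dl = DatasetList(server='https://gliders.ioos.us/erddap')
ids = dl.get_ids(search_terms=['glider'])
print(len(ids), 'dataset ids found')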
Example #3
def retrieve_dataset_id_erddap_server(url_erddap, lat_lim, lon_lim, date_ini,
                                      date_end):
    """
    Created on Tue Feb  5 10:05:37 2019

    @author: aristizabal

    This function retrieves glider ids from the IOOS
    Data Assembly Center (DAC).

    Inputs:
    url_erddap: url address of erddap server
                Example: 'https://data.ioos.us/gliders/erddap'
    lat_lim: latitude limits for the search.
            Example, lat_lim = [38.0,40.0]
    lon_lim: longitude limits for the search.
            Example, lon_lim = [-75.0,-72.0]
    date_ini: initial date of the time window.
        This function accepts the date formats '%Y-%m-%dT%H:%M:%SZ' and '%Y/%m/%d/%H'.
        Example: date_ini = '2018-08-02T00:00:00Z'
    date_end: final date of the time window.
        This function accepts the date formats '%Y-%m-%dT%H:%M:%SZ' and '%Y/%m/%d/%H'.
        Example: date_end = '2018-08-10T00:00:00Z'

    Outputs:
    gliders: list of gliders ids that fall within the lat, lon and
             time constraints
    """

    from erddapy import ERDDAP
    import pandas as pd

    e = ERDDAP(server=url_erddap)

    # Search constraints
    kw = {
        'min_lon': lon_lim[0],
        'max_lon': lon_lim[1],
        'min_lat': lat_lim[0],
        'max_lat': lat_lim[1],
        'min_time': date_ini,
        'max_time': date_end,
    }

    search_url = e.get_search_url(response='csv', **kw)
    search = pd.read_csv(search_url)

    # Extract the IDs
    gliders = search['Dataset ID'].values

    return gliders
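
Calling the function with the example values from its own docstring:

gliders = retrieve_dataset_id_erddap_server(
    'https://data.ioos.us/gliders/erddap',
    lat_lim=[38.0, 40.0],
    lon_lim=[-75.0, -72.0],
    date_ini='2018-08-02T00:00:00Z',
    date_end='2018-08-10T00:00:00Z',
)
print(gliders)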
Example #4
def return_glider_ids(kwargs):
    """
    Searches an ERDDAP server for datasets and returns dataset IDs
    :param kwargs: dictionary containing coordinate and time limits
    :return: array containing dataset IDs
    """
    e = ERDDAP(server=ioos_url)
    search_url = e.get_search_url(response='csv', **kwargs)
    try:
        search = pd.read_csv(search_url)
        ds_ids = search['Dataset ID'].values
    except Exception:
        ds_ids = np.array([])

    return ds_ids
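
A usage sketch, assuming `ioos_url`, `pd`, and `np` are defined at module level (the address and the bounding box below are placeholders):

ioos_url = 'https://data.ioos.us/gliders/erddap'  # assumed module-level constant
kwargs = {
    'min_lon': -75.0, 'max_lon': -72.0,
    'min_lat': 38.0, 'max_lat': 40.0,
    'min_time': '2018-08-02T00:00:00Z',
    'max_time': '2018-08-10T00:00:00Z',
}
ds_ids = return_glider_ids(kwargs)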
Example #5
If we change the response to `html`, we can visualize the page.

def show_iframe(src):
    from IPython.display import HTML

    iframe = '<iframe src="{src}" width="100%" height="950"></iframe>'.format
    return HTML(iframe(src=src))


show_iframe(e.get_download_url(response="html"))

Additionally, the object has `.get_info_url()` and `.get_search_url()`, which can be used to obtain the info and search URLs, respectively.

show_iframe(e.get_info_url(response="html"))

show_iframe(e.get_search_url(response="html"))

`erddapy` also brings some simple methods to download the data in some common data formats, like `pandas.DataFrame` and `xarray.Dataset`.

df = e.to_pandas(index_col="time (UTC)", parse_dates=True).dropna()


df.head()

ds = e.to_xarray(decode_times=False)

ds["temperature"]

Here is a simple plot using the data from `xarray`.

%matplotlib inline
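
The plotting code itself is not shown above; a minimal sketch that works with the `ds` object returned by `to_xarray()` (the `temperature` variable name is taken from the cell above) could look like:

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(11, 3))
ds["temperature"].plot(ax=ax)
ax.set_ylabel(ds["temperature"].attrs.get("units", ""))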
Example #6
def list_data(self, verbose=False):
    e = ERDDAP(server=self.server_url)
    self.df = pd.read_csv(e.get_search_url(response='csv', search_for=self.glider_id))
    if verbose:
        print(self.df['Dataset ID'])
Example #7
class ErddapPlotter(object):
    def __init__(self, erddap_url, protocol='tabledap', response='png'):

        self._img_types = [
            'smallPdf', 'pdf', 'largePdf', 'smallPng', 'png', 'largePng',
            'transparentPng'
        ]

        self._default_plot_parameters = {
            '.bgColor=': '0xFFFFFF',
            '.color=': '0x000000',
            '.colorBar=': 'Rainbow2|C|Linear|||',
            '.draw=': 'markers',
            '.legend=': 'Bottom',
            '.marker=': '6|5',
            '.xRange=': '||true|Linear',
            '.yRange=': '||false|Linear'
        }

        if response not in self._img_types:
            raise ValueError(
                'Invalid image response type specified: {:}'.format(response))

        self._erddap_url = erddap_url
        self._protocol = protocol
        self._response = response

        self._plot_query = ''
        self._constraints_query = ''
        self._image_url = ''
        self._last_request = ''

        self._logger = logging.getLogger(os.path.basename(__file__))

        self._e = ERDDAP(self._erddap_url,
                         protocol=self._protocol,
                         response=self._response)

        self._datasets = pd.DataFrame([])
        self.fetch_erddap_datasets()

        self._constraints = {}
        self._plot_parameters = self._default_plot_parameters.copy()

        # self._line_style = {}
        # self._marker_style = {}
        # self._marker_color = {}
        # self._colorbar = {}
        # self._y_range = {}
        # self._x_range = {}
        # self._bg_color = {}
        # self._legend = {}
        # self._zoom = {}

        # self.set_line_style()
        # self.set_marker_style()
        # self.set_marker_color()
        # self.set_colorbar()
        # self.set_y_range()
        # self.set_x_range()
        # self.set_bg_color()
        # self.set_legend_loc()

        self._legend_options = ['Bottom', 'Off', 'Only']

        self._line_styles = [
            'lines', 'linesAndMarkers', 'markers', 'sticks', 'vectors'
        ]

        self._marker_types = [
            'None', 'Plus', 'X', 'Dot', 'Square', 'Filled Square', 'Circle',
            'Filled Circle', 'Up Triangle', 'Filled Up Triangle'
        ]

        self._marker_color_codes = [
            'FFFFFF', 'CCCCCC', '999999', '666666', '000000', 'FF0000',
            'FF9900', 'FFFF00', '99FF00', '00FF00', '00FF99', '00FFFF',
            '0099FF', '0000FF', '9900FF', 'FF00FF', 'FF99FF'
        ]

        self._marker_colors = [
            'white', 'light grey', 'grey', 'dark grey', 'black', 'red',
            'orange', 'yellow', 'light green', 'green', 'blue green', 'cyan',
            'blue', 'dark blue', 'purple', 'pink', 'light pink'
        ]

        self._colors = dict(zip(self._marker_colors, self._marker_color_codes))

        self._continuous_options = ['C', 'D']

        self._scale_options = ['Linear', 'Log']

        self._colorbars = [
            'BlackBlueWhite', 'BlackGreenWhite', 'BlackRedWhite', 'BlackWhite',
            'BlueWhiteRed', 'BlueWideWhiteRed', 'LightRainbow', 'Ocean',
            'OceanDepth', 'Rainbow', 'Rainbow2', 'Rainfall', 'ReverseRainbow',
            'RedWhiteBlue', 'RedWhiteBlue2', 'RedWideWhiteBlue', 'Spectrum',
            'Topography', 'TopographyDepth', 'WhiteBlueBlack',
            'WhiteGreenBlack', 'WhiteRedBlack', 'WhiteBlack', 'YellowRed',
            'KT_algae', 'KT_amp', 'KT_balance', 'KT_curl', 'KT_deep',
            'KT_delta', 'KT_dense', 'KT_gray', 'KT_haline', 'KT_ice',
            'KT_matter', 'KT_oxy', 'KT_phase', 'KT_solar', 'KT_speed',
            'KT_tempo', 'KT_thermal', 'KT_turbid'
        ]

        self._zoom_levels = ['in', 'in2', 'in8', 'out', 'out2', 'out8']

        # Set default plotting parameters
        self.reset_plot_params()

    @property
    def client(self):
        return self._e

    @property
    def response(self):
        return self._e.response

    @response.setter
    def response(self, response_type):
        if response_type not in self._img_types:
            raise ValueError(
                'Invalid image response type specified: {:}'.format(
                    response_type))

        self._response = response_type
        self._e.response = response_type

    @property
    def datasets(self):
        return self._datasets

    @property
    def plot_parameters(self):

        return self._plot_parameters

    @property
    def constraints(self):

        return self._constraints

    @property
    def plot_query(self):

        self.build_plot_query_string()

        return self._plot_query

    @property
    def constraints_query(self):

        self.build_constraints_query_string()

        return self._constraints_query

    @property
    def last_request(self):
        return self._last_request

    @property
    def image_url(self):
        return self._image_url

    @property
    def colorbars(self):
        return self._colorbars

    def fetch_erddap_datasets(self):

        try:

            self._logger.info('Fetching available server datasets: {:}'.format(
                self._erddap_url))
            url = self._e.get_search_url(response='csv')
            self._last_request = url

            self._logger.debug('Server info: {:}'.format(self._last_request))
            self._datasets = pd.read_csv(url)

            # rename columns to friendlier names
            columns = {
                s: s.replace(' ', '_').lower()
                for s in self._datasets.columns
            }
            self._datasets.rename(columns=columns, inplace=True)

            # Use dataset_id as the index
            self._datasets.set_index('dataset_id', inplace=True)

        except requests.exceptions.HTTPError as e:
            self._logger.error(
                'Failed to fetch/parse ERDDAP server datasets info: {:} ({:})'.
                format(url, e))
            return

    def set_bg_color(self, color='white'):
        #   .bgColor:   value (0xAARRGGBB)
        if color not in self._colors:
            return

        self._plot_parameters.update(
            {'.bgColor=': '0x{:}'.format(self._colors[color])})

        # self._bg_color = {'.bgColor=': '0x{:}'.format(self._colors[color])}

    def set_colorbar(self,
                     colorbar='Rainbow2',
                     continuous=None,
                     scale=None,
                     min='',
                     max='',
                     num_sections=''):
        # .colorBar:  palette|continuous|scale|min|max|nSections

        continuous = continuous or self._continuous_options[0]
        scale = scale or self._scale_options[0]

        if colorbar not in self._colorbars:
            return

        if continuous not in self._continuous_options:
            return {}

        if scale not in self._scale_options:
            return {}

        self._plot_parameters.update({
            '.colorBar=':
            '{:}|{:}|{:}|{:}|{:}|{:}'.format(colorbar, continuous, scale, min,
                                             max, num_sections)
        })

        # self._colorbar = {'.colorBar=': '{:}|{:}|{:}|{:}|{:}|{:}'.format(colorbar,
        #                                                                  continuous,
        #                                                                  scale,
        #                                                                  min,
        #                                                                  max,
        #                                                                  num_sections)}
        #
        # self._update_plot_params()

    def set_marker_color(self, color='white'):
        #   .color:     value (0xAARRGGBB)
        if color not in self._colors:
            return {}

        self._plot_parameters.update(
            {'.color=': '0x{:}'.format(self._colors[color])})

        # self._marker_color = {'.color=': '0x{:}'.format(self._colors[color])}

        # self._update_plot_params()

    def set_line_style(self, line_style='markers'):
        # .draw:      value (lines|linesAndMarkers|markers|sticks|vectors)

        if line_style not in self._line_styles:
            return {}

        self._plot_parameters.update({'.draw=': line_style})

        # self._line_style = {'.draw=': line_style}

        # self._update_plot_params()

    def set_legend_loc(self, location='Bottom'):
        # .legend:    value (Bottom|Off|Only)

        if location not in self._legend_options:
            return {}

        self._plot_parameters.update({'.legend=': location})

        # self._legend = {'.legend=': location}

        # self._update_plot_params()

    def set_marker_style(self, marker='Circle', marker_size=5):
        # .marker:    markerType|markerSize

        if marker not in self._marker_types:
            return {}

        self._plot_parameters.update({
            '.marker=':
            '{:}|{:}'.format(self._marker_types.index(marker), marker_size)
        })

        # self._marker_style = {'.marker=': '{:}|{:}'.format(self._marker_types.index(marker), marker_size)}

        # self._update_plot_params()

    def set_x_range(self, min_val='', max_val='', ascending=True, scale=None):
        #   .xRange:    min|max|ascending|scale

        scale = scale or self._scale_options[0]

        if scale not in self._scale_options:
            return {}

        self._plot_parameters.update({
            '.xRange=':
            '{:}|{:}|{:}|{:}'.format(min_val, max_val,
                                     str(ascending).lower(), scale)
        })

        # self._x_range = {'.xRange=': '{:}|{:}|{:}|{:}'.format(min_val, max_val, str(ascending).lower(), scale)}

        # self._update_plot_params()

    def set_y_range(self, min_val='', max_val='', ascending=False, scale=None):
        #   .yRange:    min|max|ascending|scale

        scale = scale or self._scale_options[0]

        if scale not in self._scale_options:
            return {}

        self._plot_parameters.update({
            '.yRange=':
            '{:}|{:}|{:}|{:}'.format(min_val, max_val,
                                     str(ascending).lower(), scale)
        })

        # self._y_range = {'.yRange=': '{:}|{:}|{:}|{:}'.format(min_val, max_val, str(ascending).lower(), scale)}

        # self._update_plot_params()

    def set_zoom(self, zoom_level='in'):

        if zoom_level not in self._zoom_levels:
            return {}

        self._plot_parameters.update({'.zoom=': zoom_level})

        # self._zoom_levels = {'.zoom=': zoom_level}

        # self._update_plot_params()

    def set_trim_pixels(self, num_pixels=10):

        self._plot_parameters.update({'.trim=': str(num_pixels)})

    # def _update_plot_params(self):
    #
    #     self._plot_parameters.update(self._line_style)
    #     self._plot_parameters.update(self._marker_style)
    #     self._plot_parameters.update(self._marker_color)
    #     self._plot_parameters.update(self._colorbar)
    #     self._plot_parameters.update(self._y_range)
    #     self._plot_parameters.update(self._x_range)
    #     self._plot_parameters.update(self._bg_color)
    #     self._plot_parameters.update(self._legend)
    #     self._plot_parameters.update(self._zoom)
    #
    #     self.build_plot_query_string()

    def add_constraint(self, constraint, constraint_value):

        self._constraints[constraint] = constraint_value

    def remove_constraint(self, constraint):

        if not constraint.endswith('='):
            constraint = '{:}='.format(constraint)

        self._constraints.pop(constraint, None)

    def remove_plot_parameter(self, plot_parameter):

        if not plot_parameter.endswith('='):
            plot_parameter = '{:}='.format(plot_parameter)

        self._plot_parameters.pop(plot_parameter, None)

    def reset_plot_params(self):

        self._plot_parameters = self._default_plot_parameters.copy()

        # self._line_style = {}
        # self._marker_style = {}
        # self._marker_color = {}
        # self._colorbar = {}
        # self._y_range = {}
        # self._x_range = {}
        # self._bg_color = {}
        # self._legend = {}

        # Set default plotting parameters
        # self.set_line_style()
        # self.set_marker_style()
        # self.set_marker_color()
        # self.set_colorbar()
        # self.set_y_range()
        # self.set_x_range()
        # self.set_bg_color()
        # self.set_legend_loc()
        # self.set_zoom()
        #
        # self.build_plot_query_string()

    def build_plot_query_string(self):

        self._plot_query = '&'.join([
            '{:}{:}'.format(k, quote(v))
            for k, v in self._plot_parameters.items()
        ])

    def build_constraints_query_string(self):

        self._constraints_query = '&'.join([
            '{:}{:}'.format(k, quote(v)) for k, v in self._constraints.items()
        ])

    def build_image_request(self, dataset_id, x, y, c=None):

        if dataset_id not in self._datasets.index:
            self._logger.error(
                'Dataset ID {:} does not exist'.format(dataset_id))
            return

        variables = [x, y]
        if c:
            variables.append(c)

        self.build_plot_query_string()
        self.build_constraints_query_string()

        if self._constraints:
            url = '{:}/{:}/{:}.{:}?{:}&{:}&{:}'.format(
                self._e.server, self._e.protocol, dataset_id, self._response,
                ','.join(variables), self._constraints_query, self._plot_query)
        else:
            url = '{:}/{:}/{:}.{:}?{:}&{:}'.format(self._e.server,
                                                   self._e.protocol,
                                                   dataset_id, self._response,
                                                   ','.join(variables),
                                                   self._plot_query)

        self._image_url = url

        return self._image_url

    def download_image(self, image_url, image_path):

        image_dir = os.path.dirname(image_path)
        if not os.path.isdir(image_dir):
            self._logger.error(
                'Invalid image destination specified: {:}'.format(image_dir))
            return

        self._logger.debug('Image url: {:}'.format(image_url))

        self._logger.info('Fetching and writing image: {:}'.format(image_path))
        r = requests.get(image_url, stream=True)
        if r.status_code != 200:
            self._logger.error('{:} (code={:})'.format(r.reason, r.status_code))
            return
        with open(image_path, 'wb') as f:
            for chunk in r.iter_content():
                f.write(chunk)

            return image_path

    def __repr__(self):
        return '<ErddapPlotter(server={:}, response={:}, num_datasets={:})>'.format(
            self._e.server, self._e.response, len(self._datasets))
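
A hedged usage sketch of the class above (the server address, constraint, and variable names are placeholders, and the constructor immediately queries the server for its dataset list):

plotter = ErddapPlotter('https://gliders.ioos.us/erddap')
plotter.set_line_style('markers')
plotter.set_colorbar('Rainbow2', scale='Linear')
plotter.add_constraint('time>=', '2021-01-01T00:00:00Z')
dataset_id = plotter.datasets.index[0]
image_url = plotter.build_image_request(dataset_id, 'time', 'depth', c='temperature')
plotter.download_image(image_url, './profile.png')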
Example #8
# ERDDAP Access: OOI-Net

from erddapy import ERDDAP


def to_df(url):
    import pandas as pd
    return pd.read_csv(url)


erd = ERDDAP(
    server='https://erddap-uncabled.oceanobservatories.org/uncabled/erddap',
    protocol='tabledap',
)

url = erd.get_search_url(search_for='CP01CNSM ctdbp', response='csv')
url

datasets = to_df(url)['Dataset ID']
datasets

# Get a specific dataset's info:

datasets[7]
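
To pull the metadata for that dataset id, one possible follow-up (using the `erd` client and `to_df` helper defined above):

info_url = erd.get_info_url(dataset_id=datasets[7], response='csv')
info = to_df(info_url)
info.head()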

# ### OMS++ Data Availability
#

from erddapy import ERDDAP

Example #9
import re

import numpy as np
import pandas as pd
from erddapy import ERDDAP


def get_standard_variables_and_metadata(server_link, standard_variable_list):

    # Get access to the server and find datasets associated with standard_name variable listed
    e = ERDDAP(server=server_link, protocol='tabledap', response='csv')

    # Define Filter for which datasets to look into
    kw = {
        'standard_name': ','.join(standard_variable_list),
        'min_lon': -180.0,
        'max_lon': 180.0,
        'min_lat': -90.0,
        'max_lat': 90.0,
        'min_time': '',
        'max_time': '',
        'cdm_data_type': ''
    }

    variable_to_groupby = [('latitude', 'degrees_north'),
                           ('longitude', 'degrees_east')]

    # Get available datasets from that server
    search_url = e.get_search_url(response='csv', **kw)
    datasets = pd.read_csv(search_url)

    # Print results
    print(e.server)
    print(
        str(len(datasets)) + " datasets contains " +
        ', '.join(standard_variable_list))

    # Loop through different data sets and create a metadata dataFrame
    df = pd.DataFrame(columns=['Dataset ID'])

    for index, row in datasets.iterrows():
        # Get Info from dataset (mostly min/max lat/long)
        print(row['Dataset ID'])
        info_url = e.get_info_url(dataset_id=row['Dataset ID'], response='csv')
        info = pd.read_csv(info_url)
        attribute_table = info.set_index(
            ['Row Type', 'Variable Name',
             'Attribute Name']).transpose()['attribute']

        # Try to get the distinct lat/long and time and depth range for that dataset, if it fails rely on the
        # ERDDAP metadata
        try:
            # If dataset is spread out geographically find distinct locations (may not work well for trajectory data)
            latlong_url = e.get_download_url(
                dataset_id=row['Dataset ID'],
                protocol='tabledap',
                variables=['latitude', 'longitude', 'time'])

            # Append commands to the URL to get distinct values, ordered with min and max time for each lat/long
            distinctMinMaxTime_url = latlong_url + '&distinct()&orderByMinMax(%22latitude%2Clongitude%2Ctime%22)'

            # Get lat/long and min/max depth for this dataset
            data = pd.read_csv(distinctMinMaxTime_url, header=[0, 1])

            # Group data by latitude/longitude and get min max values
            data_reduced = data.groupby(by=variable_to_groupby).agg(
                ['min', 'max']).reset_index()

            if info[(info['Variable Name'] == 'depth')].size > 0:
                latlongdepth_url = e.get_download_url(
                    dataset_id=row['Dataset ID'],
                    protocol='tabledap',
                    variables=['latitude', 'longitude', 'depth'])

                # Append commands to the URL to get distinct values, ordered with min and max depth
                # for each lat/long
                distinctMinMaxDepth_url = latlongdepth_url + \
                                          '&distinct()&orderByMinMax(%22latitude%2Clongitude%2Cdepth%22)'

                # Get lat/long and min/max depth for this dataset
                data_depth = pd.read_csv(distinctMinMaxDepth_url,
                                         header=[0, 1])

                # Group depth data by lat/long and get min max values
                data_depth_reduced = data_depth.groupby(
                    by=variable_to_groupby).agg(['min', 'max']).reset_index()

                # Merge depth values with time
                data_reduced = data_reduced.merge(data_depth_reduced,
                                                  on=variable_to_groupby,
                                                  how='left')

            # Merge multi index column names
            data_reduced.columns = data_reduced.columns.map(
                ' '.join).str.strip(' ')

        except Exception as exception_error:

            print('Failed to read: ' + str(exception_error))
            # If there's only one location, it could get the range from metadata

            # Find lat/long range of this dataset, if it's point we don't need to look into it
            min_latitude = float(attribute_table['NC_GLOBAL',
                                                 'geospatial_lat_min'].Value)
            max_latitude = float(attribute_table['NC_GLOBAL',
                                                 'geospatial_lat_max'].Value)
            min_longitude = float(attribute_table['NC_GLOBAL',
                                                  'geospatial_lon_min'].Value)
            max_longitude = float(attribute_table['NC_GLOBAL',
                                                  'geospatial_lon_max'].Value)

            # If min/max lat/long are the same don't go in the dataset
            if (min_latitude == max_latitude) & (min_longitude
                                                 == max_longitude):
                data_reduced = pd.DataFrame(columns=['Dataset ID'])
                data_reduced = {}
                data_reduced['latitude degrees_north'] = min_latitude
                data_reduced['longitude degrees_east'] = min_longitude

                if 'depth' in attribute_table.columns and 'actual_range' in attribute_table[
                        'depth'] and ('m'
                                      == attribute_table['depth',
                                                         'units']['Value']):

                    # np.float was removed in NumPy 1.24; use the builtin float
                    depth_range = np.array(
                        str.split(
                            attribute_table['depth', 'actual_range']['Value'],
                            ',')).astype(float)
                    data_reduced['depth m min'] = depth_range[0]
                    data_reduced['depth m max'] = depth_range[1]

                # Convert to DataFrame
                data_reduced = pd.DataFrame(data_reduced, index=[0])
                print('Retrieved metadata')
            else:
                # Won't handle data with multiple location that it can't retrieve the data
                continue

        # Add, for each standard_name, the matching variable name(s) to the table
        for var in standard_variable_list:
            data_reduced[var] = ','.join(
                e.get_var_by_attr(dataset_id=row['Dataset ID'],
                                  standard_name=var))

        # Add cdm_data_type to table
        data_reduced['cdm_data_type'] = ','.join(
            info[info['Attribute Name'] == 'cdm_data_type']['Value'].values)

        # Add Dataset id to the table
        data_reduced['Dataset ID'] = row['Dataset ID']

        # Merge that dataset ID with previously downloaded data
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent
        df = pd.concat([df, data_reduced])

    # Add server to dataFrame
    df['server'] = e.server

    # Save resulting dataframe to a CSV, file name is based on the server address
    file_name = re.sub('https*://', '', e.server)
    file_name = re.sub("[\./]", '_', file_name)
    file_name = 'Server_List_' + file_name + '.csv'

    print('Save result to ' + file_name)
    df.to_csv(file_name)

    return df
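
A usage sketch (the server address and the standard_name list are illustrative):

df_meta = get_standard_variables_and_metadata(
    'https://data.ioos.us/gliders/erddap',
    ['sea_water_temperature', 'sea_water_practical_salinity'],
)
print(df_meta[['Dataset ID', 'server']].head())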
Example #10
class ErddapReader(Reader):
    """
    This class searches ERDDAP servers. Two servers are known to the reader,
    but others can be supplied as well.

    Attributes
    ----------
    parallel: boolean
        If True, run with simple parallelization using `multiprocessing`.
        If False, run serially.
    known_server: string
        Two ERDDAP servers are built in to be known to this reader: "ioos" and
        "coastwatch".
    e: ERDDAP server instance
    e.protocol: string
        * "tabledap" (pandas, appropriate for reading as csv)
        * "griddap" (xarray, appropriate for reading as netcdf)
    e.server: string
        Return the server name
    columns: list
        Metadata columns
    name: string
        "erddap_ioos", "erddap_coastwatch", or a constructed string if the user
        inputs a new protocol and server.
    reader: string
        reader is defined as "ErddapReader".
    """
    def __init__(self,
                 known_server="ioos",
                 protocol=None,
                 server=None,
                 parallel=True):
        """
        Parameters
        ----------
        known_server: string, optional
            Two ERDDAP servers are built in to be known to this reader:
            "ioos" and "coastwatch".
        protocol, server: string, optional
            For a user-defined ERDDAP server, input the protocol as one of the
            following:
            * "tabledap" (pandas, appropriate for reading as csv)
            * "griddap" (xarray, appropriate for reading as netcdf)
            and the server address (such as
            "http://erddap.sensors.ioos.us/erddap" or
            "http://coastwatch.pfeg.noaa.gov/erddap").
        parallel: boolean
            If True, run with simple parallelization using `multiprocessing`.
            If False, run serially.
        """
        self.parallel = parallel

        # hard wire this for now
        filetype = "netcdf"

        # either select a known server or input protocol and server string
        if known_server == "ioos":
            protocol = "tabledap"
            server = "http://erddap.sensors.ioos.us/erddap"
            filetype = "netcdf"  # other option: "csv"
        elif known_server == "coastwatch":
            protocol = "griddap"
            server = "http://coastwatch.pfeg.noaa.gov/erddap"
            filetype = "netcdf"  # other option: "csv"
        elif known_server is not None:
            statement = (
                "either select a known server or input protocol and server string"
            )
            assert (protocol is not None) & (server is not None), statement
        else:
            known_server = urllib.parse.urlparse(server).netloc
            # known_server = server.strip("/erddap").strip("http://").replace(".", "_")
            statement = (
                "either select a known server or input protocol and server string"
            )
            assert (protocol is not None) & (server is not None), statement

        self.known_server = known_server
        self.e = ERDDAP(server=server)
        self.e.protocol = protocol
        self.e.server = server
        self.filetype = filetype

        # columns for metadata
        self.columns = [
            "geospatial_lat_min",
            "geospatial_lat_max",
            "geospatial_lon_min",
            "geospatial_lon_max",
            "time_coverage_start",
            "time_coverage_end",
            "defaultDataQuery",
            "subsetVariables",  # first works for timeseries sensors, 2nd for gliders
            "keywords",  # for hf radar
            "id",
            "infoUrl",
            "institution",
            "featureType",
            "source",
            "sourceUrl",
        ]

        # name
        self.name = f"erddap_{known_server}"

        self.reader = "ErddapReader"
        self.store = dict()

    def __getitem__(self, key):
        """Redefinition of dict-like behavior.

        This enables user to use syntax `reader[dataset_id]` to read in and
        save dataset into the object.

        Parameters
        ----------
        key: str
            dataset_id for a dataset that is available in the search/reader
            object.

        Returns
        -------
        xarray Dataset of the data associated with key
        """

        returned_data = self.data_by_dataset(key)
        # returned_data = self._return_data(key)
        self.__setitem__(key, returned_data)
        return returned_data

    def find_dataset_id_from_station(self, station):
        """Find dataset_id from station name.

        Parameters
        ----------
        station: string
            Station name for which to search for dataset_id
        """

        if station is None:
            return None
        # for station in self._stations:
        # if station has more than one word, AND will be put between
        # to search for multiple terms together.
        url = self.e.get_search_url(response="csv",
                                    items_per_page=5,
                                    search_for=station)

        try:
            df = pd.read_csv(url)
        except Exception as e:
            logger.exception(e)
            logger.warning(
                f"search url {url} did not work for station {station}.")
            return

        # first try for exact station match
        try:
            # Special case for TABS when don't split the id name
            if "tabs" in station:  # don't split
                dataset_id = [
                    dataset_id for dataset_id in df["Dataset ID"]
                    if station.lower() == dataset_id.lower()
                ][0]
            else:
                # first try as dataset_id then do as station name
                dataset_id = [
                    dataset_id for dataset_id in df["Dataset ID"]
                    if station.lower() in [dataset_id.lower()] +
                    dataset_id.lower().split("_")
                ][0]

        except Exception as e:
            logger.exception(e)
            logger.warning(
                "When searching for a dataset id to match station name %s, the first attempt to match the id did not work."
                % (station))
            # If that doesn't work, return None for dataset_id
            dataset_id = None
            # # if that doesn't work, trying for more general match and just take first returned option
            # dataset_id = df.iloc[0]["Dataset ID"]

        return dataset_id

    @property
    def dataset_ids(self):
        """Find dataset_ids for server.

        Notes
        -----
        The dataset_ids are found by querying the metadata through the ERDDAP server.

        The number of dataset_ids can change if a variable is removed from the
        list of variables and this is rerun.
        """

        if not hasattr(self, "_dataset_ids") or (
                self.variables and
            (len(self.variables) != self.num_variables)):

            # This should be a region search
            if self.approach == "region":

                # find all the dataset ids which we will use to get the data
                # This limits the search to our keyword arguments in kw which should
                # have min/max lon/lat/time values
                dataset_ids = []
                if self.variables is not None:
                    for variable in self.variables:

                        # find and save all dataset_ids associated with variable
                        search_url = self.e.get_search_url(
                            response="csv",
                            **self.kw,
                            variableName=variable,
                            items_per_page=10000,
                        )

                        try:
                            search = pd.read_csv(search_url)
                            dataset_ids.extend(search["Dataset ID"])
                        except Exception as e:
                            logger.exception(e)
                            logger.warning(
                                f"variable {variable} was not found in the search"
                            )
                            logger.warning(f"search_url: {search_url}")

                else:

                    # find and save all dataset_ids associated with variable
                    search_url = self.e.get_search_url(response="csv",
                                                       **self.kw,
                                                       items_per_page=10000)

                    try:
                        search = pd.read_csv(search_url)
                        dataset_ids.extend(search["Dataset ID"])
                    except Exception as e:
                        logger.exception(e)
                        logger.warning("nothing found in the search")
                        logger.warning(f"search_url: {search_url}")

                # only need a dataset id once since we will check them each for all standard_names
                self._dataset_ids = list(set(dataset_ids))

            # This should be a search for the station names
            elif self.approach == "stations":

                # search by station name for each of stations
                if self.parallel:
                    # get metadata for datasets
                    # run in parallel to save time
                    num_cores = multiprocessing.cpu_count()
                    dataset_ids = Parallel(n_jobs=num_cores)(
                        delayed(self.find_dataset_id_from_station)(station)
                        for station in self._stations)

                else:
                    dataset_ids = []
                    for station in self._stations:
                        dataset_ids.append(
                            self.find_dataset_id_from_station(station))

                # remove None from list
                dataset_ids = [i for i in dataset_ids if i]

                # In this case return all dataset_ids so they match 1-1 with
                # the input station list.
                self._dataset_ids = dataset_ids

            else:
                logger.warning(
                    "Neither stations nor region approach were used in function dataset_ids."
                )

            # update number of variables
            if self.variables:
                self.num_variables = len(self.variables)

        return self._dataset_ids

    def meta_by_dataset(self, dataset_id):
        """Return the catalog metadata for a single dataset_id."""

        info_url = self.e.get_info_url(response="csv", dataset_id=dataset_id)
        try:
            info = pd.read_csv(info_url)
        except Exception as e:
            logger.exception(e)
            logger.warning(f"Could not read info from {info_url}")
            return {dataset_id: []}

        items = []

        for col in self.columns:

            try:
                item = info[info["Attribute Name"] == col]["Value"].values[0]
                dtype = info[info["Attribute Name"] ==
                             col]["Data Type"].values[0]
            except Exception:
                # attribute is missing for this dataset; default dtype to
                # "String" so the numeric conversion below is skipped.
                dtype = "String"
                if col == "featureType":
                    # this column is not present in HF Radar metadata but want it to
                    # map to data_type, so input 'grid' in that case.
                    item = "grid"
                else:
                    item = "NA"

            if dtype == "String":
                pass
            elif dtype == "double":
                item = float(item)
            elif dtype == "int":
                item = int(item)
            items.append(item)

        # include download link ##
        self.e.dataset_id = dataset_id
        if self.e.protocol == "tabledap":
            # set the same time constraints as before
            self.e.constraints = {
                "time<=": self.kw["max_time"],
                "time>=": self.kw["min_time"],
            }
            if self.filetype == "csv":
                download_url = self.e.get_download_url(response="csvp")
            elif self.filetype == "netcdf":
                download_url = self.e.get_download_url(response="ncCf")

        elif self.e.protocol == "griddap":
            # the search terms that can be input for tabledap do not work for griddap
            # in erddapy currently. Instead, put together an opendap link and then
            # narrow the dataset with xarray.
            # get opendap link
            download_url = self.e.get_download_url(response="opendap")

        # check if "prediction" is present in metadata, esp in case of NOAA
        # model predictions
        is_prediction = "Prediction" in " ".join(
            list(info["Value"].replace(np.nan, None).values))

        # add erddap server name
        return {
            dataset_id:
            [self.e.server, download_url, info_url, is_prediction] + items +
            [self.variables]
        }

    @property
    def meta(self):
        """Rearrange the individual metadata into a dataframe.

        Notes
        -----
        This should exclude duplicate entries.
        """

        if not hasattr(self, "_meta"):

            if self.parallel:

                # get metadata for datasets
                # run in parallel to save time
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.meta_by_dataset)(dataset_id)
                    for dataset_id in self.dataset_ids)

            else:

                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.meta_by_dataset(dataset_id))

            # make dict from individual dicts
            from collections import ChainMap

            meta = dict(ChainMap(*downloads))

            # Make dataframe of metadata
            # variable names are the column names for the dataframe
            self._meta = pd.DataFrame.from_dict(
                meta,
                orient="index",
                columns=[
                    "database", "download_url", "info_url", "is_prediction"
                ] + self.columns + ["variable names"],
            )

        return self._meta

    def data_by_dataset(self, dataset_id):
        """Return the data for a single dataset_id.

        Returns
        -------
        A tuple of (dataset_id, data), where data type is a pandas DataFrame

        Notes
        -----
        Data is read into memory.
        """

        if self.filetype == "csv":
            # if self.e.protocol == "tabledap":
            try:
                # fetch metadata if not already present
                # found download_url from metadata and use
                self.e.dataset_id = dataset_id
                # dataset_vars gives a list of the variables in the dataset
                dataset_vars = (self.meta.loc[dataset_id]
                                ["defaultDataQuery"].split("&")[0].split(","))
                # vars_present gives the variables in self.variables
                # that are actually in the dataset
                vars_present = []
                for selfvariable in self.variables:
                    vp = [var for var in dataset_vars if var == selfvariable]
                    if len(vp) > 0:
                        vars_present.append(vp[0])
                # If any variables are not present, this doesn't work.
                if self.variables is not None:
                    self.e.variables = [
                        "time",
                        "longitude",
                        "latitude",
                        "station",
                    ] + vars_present
                dd = self.e.to_pandas(response="csvp",
                                      index_col=0,
                                      parse_dates=True)
                # dd = self.e.to_pandas(response='csv', header=[0, 1],
                #                       index_col=0, parse_dates=True)
                # dd = pd.read_csv(
                #     download_url, header=[0, 1], index_col=0, parse_dates=True
                # )

                # Drop cols and rows that are only NaNs.
                dd = dd.dropna(axis="index", how="all").dropna(axis="columns",
                                                               how="all")

                if self.variables is not None:
                    # check to see if there is any actual data
                    # this is a bit convoluted because the column names are the variable names
                    # plus units so can't match 1 to 1.
                    datacols = (
                        0  # number of columns that represent data instead of metadata
                    )
                    for col in dd.columns:
                        datacols += [
                            varname in col for varname in self.variables
                        ].count(True)
                    # if no datacols, we can skip this one.
                    if datacols == 0:
                        dd = None

            except Exception as e:
                logger.exception(e)
                logger.warning("no data to be read in for %s" % dataset_id)
                dd = None

        elif self.filetype == "netcdf":
            # elif self.e.protocol == "griddap":

            if self.e.protocol == "tabledap":

                try:
                    # assume I don't need to narrow in space since time series (tabledap)
                    self.e.dataset_id = dataset_id
                    dd = self.e.to_xarray()
                    # dd = xr.open_dataset(download_url, chunks="auto")
                    dd = dd.swap_dims({"obs": dd.cf["time"].name})
                    dd = dd.sortby(dd.cf["time"], ascending=True)
                    dd = dd.cf.sel(
                        T=slice(self.kw["min_time"], self.kw["max_time"]))
                    # dd = dd.set_coords(
                    #     [dd.cf["longitude"].name, dd.cf["latitude"].name]
                    # )

                    # use variable names to drop other variables (should I do this?)
                    if self.variables is not None:
                        # I don't think this is true with new approach
                        # # ERDDAP prepends variables with 's.' in netcdf files,
                        # # so include those with variables
                        # erd_vars = [f's.{var}' for var in self.variables]
                        # var_list = set(dd.data_vars) - (set(self.variables) | set(erd_vars))
                        var_list = set(dd.data_vars) - set(self.variables)
                        dd = dd.drop_vars(var_list)

                    # the lon/lat are on the 'timeseries' singleton dimension
                    # but the data_var variable was not, which messed up
                    # cf-xarray. When longitude and latitude are not on a
                    # dimension shared with a variable, the variable can't be
                    # called with cf-xarray. e.g. dd.cf['ssh'] won't work.
                    if "timeseries" in dd.dims:
                        for data_var in dd.data_vars:
                            if "timeseries" not in dd[data_var].dims:
                                dd[data_var] = dd[data_var].expand_dims(
                                    dim="timeseries", axis=1)

                except Exception as e:
                    logger.exception(e)
                    logger.warning("no data to be read in for %s" % dataset_id)
                    dd = None

            elif self.e.protocol == "griddap":

                try:
                    # this makes it read in the whole file which might be large
                    self.e.dataset_id = dataset_id
                    # dd = self.e.to_xarray(chunks="auto").sel(
                    #     time=slice(self.kw["min_time"], self.kw["max_time"])
                    # )
                    download_url = self.e.get_download_url(response="opendap")
                    dd = xr.open_dataset(download_url, chunks="auto").sel(
                        time=slice(self.kw["min_time"], self.kw["max_time"]))

                    if ("min_lat" in self.kw) and ("max_lat" in self.kw):
                        dd = dd.sel(latitude=slice(self.kw["min_lat"],
                                                   self.kw["max_lat"]))

                    if ("min_lon" in self.kw) and ("max_lon" in self.kw):
                        dd = dd.sel(longitude=slice(self.kw["min_lon"],
                                                    self.kw["max_lon"]))

                    # use variable names to drop other variables (should I do this?)
                    if self.variables is not None:
                        vars_list = set(dd.data_vars) - set(self.variables)
                        dd = dd.drop_vars(vars_list)

                except Exception as e:
                    logger.exception(e)
                    logger.warning("no data to be read in for %s" % dataset_id)
                    dd = None

        # return (dataset_id, dd)
        return dd

    # @property
    def data(self, dataset_ids=None):
        """Read in data for some or all dataset_ids.

        NOT USED CURRENTLY

        Once data is read in for a dataset_ids, it is remembered.

        See full documentation in `utils.load_data()`.
        """

        output = odg.utils.load_data(self, dataset_ids)
        return output
Example #11
from erddapy import ERDDAP
import pandas as pd

e = ERDDAP(server=url_glider)

# Search constraints
kw2018 = {
    'min_lon': lon_lim[0],
    'max_lon': lon_lim[1],
    'min_lat': lat_lim[0],
    'max_lat': lat_lim[1],
    'min_time': date_ini,
    'max_time': date_end,
}

search_url = e.get_search_url(response='csv', **kw2018)
search = pd.read_csv(search_url)

# Extract the IDs
gliders = search['Dataset ID'].values

#%%

dataset_id = gliders[0]
print(dataset_id)

#    timeg,depthg_gridded,varg_gridded,timem,depthm,target_varm = \
#    glider_transect_model_com_erddap_server(url_glider,dataset_id,url_model,\
#                              lat_lim,lon_lim,\
#                              date_ini,date_end,var_glider,var_model,model_name,delta_z=0.4)
Example #12
class NDBC():
    def __init__(self, station_id, deploy_id, WMO, currentTime, startTime,
                 data_map, name_map):
        self.station_id = station_id
        self.deploy_id = deploy_id
        self.WMO = WMO
        self.now = currentTime
        self.startTime = startTime
        self.data_map = data_map
        self.name_map = name_map

    def adjust_pressure_to_sea_level(self, pres, temp, height):
        """Adjust barometric presure to sea-level."""
        temp = temp + 273.15
        slp = pres / np.exp(-height / (temp * 29.263))
        return slp

    def calculate_wind_speed(self, eastward, northward):
        """Calculate absolute wind speed from component wind vector."""
        u = np.square(eastward)
        v = np.square(northward)
        wind_speed = np.sqrt(u + v)
        return wind_speed

    def calculate_wind_direction(self, eastward, northward):
        """Calculate met wind direction from component wind vectors."""
        u = eastward
        v = northward
        wind_direction = 180 / np.pi * np.arctan2(-u, -v)
        return wind_direction

    def _connect_erddap(self,
                        server="http://ooivm1.whoi.net/erddap",
                        protocol="tabledap"):
        """Connect to the erddap server."""
        self._erddap = ERDDAP(server=server, protocol=protocol)

    def list_datasets(self):
        """Get the available datasets for the ERDDAP server."""
        # First, make the connection
        self._connect_erddap()
        # Next, get the datasets
        datasets = pd.read_csv(
            self._erddap.get_search_url(search_for=self.station_id,
                                        response='csv'))['Dataset ID']
        return datasets

    def get_dataset(self, dataset):
        """Get the data for specified datasets."""
        # First, have to re-establish the erddap connection
        self._connect_erddap()

        # Next, get the data for a dataset
        self._erddap.dataset_id = dataset

        # Only want the variables with standard names
        variables = self._erddap.get_var_by_attr(
            standard_name=lambda v: v is not None)
        self._erddap.variables = variables

        # Limit the data request to the current deployment
        self._erddap.constraints = {
            'deploy_id=': self.deploy_id,
            'time>=': self.startTime.strftime('%Y-%m-%dT%H:%M:%SZ')
        }

        try:
            # Download the data
            data = self._erddap.to_pandas(index_col='time (UTC)',
                                          parse_dates=True)

            # Sometimes it just returns an empty dataframe instead of an error
            if data.size == 0:
                data = self._create_empty_dataset()

        except Exception:
            # If there is no available data in the requested time window, need
            # to create an empty dataframe of the data
            data = self._create_empty_dataset()

        # Return the dataset data
        return data

    def process_METBK_data(self, df, freq='10T'):
        """Process the METBK into the correct format and values for NDBC."""
        # Resample the data
        df_binned = df.resample(freq).mean()

        # Check that barometric pressure is in the dataframe
        if 'barometric_pressure (mbar)' in df_binned.columns:
            # Adjust the barometric pressure to sea-level
            df_binned[
                'sea_level_pressure (hPa)'] = self.adjust_pressure_to_sea_level(
                    df_binned['barometric_pressure (mbar)'],
                    df_binned['air_temperature (degree_Celsius)'], 4.05)
        else:
            df_binned['sea_level_pressure (hPa)'] = np.nan

        # Check that the wind vector components are in the dataframe
        if 'eastward_wind_velocity (m s-1)' in df_binned.columns:
            # Calculate the wind speed
            df_binned['wind speed (m/s)'] = self.calculate_wind_speed(
                df_binned['eastward_wind_velocity (m s-1)'],
                df_binned['northward_wind_velocity (m s-1)'])

            # Calculate the wind direction
            df_binned['wind direction'] = self.calculate_wind_direction(
                df_binned['eastward_wind_velocity (m s-1)'],
                df_binned['northward_wind_velocity (m s-1)'])
            df_binned['wind direction'] = df_binned["wind direction"].apply(
                lambda x: x + 360 if x < 0 else x)

            # Don't need cardinal direction -> want direction in degrees
            # df_binned["wind direction"] = df_binned["wind direction"].apply(
            #   lambda x: self.get_cardinal_direction(np.round(x, decimals=2)))
        else:
            df_binned['wind speed (m/s)'] = np.nan
            df_binned['wind direction'] = np.nan

        # Return the processed data
        return df_binned

    def process_WAVSS_data(self, df, freq='10T'):
        """Much simpler function for processing the WAVSS data."""
        # Resample the data
        df_binned = df.resample(freq).mean()

        # Return the data
        return df_binned

    def _create_empty_dataset(self):
        """
        Create a dataset of all nans if there is no data available for
        the requested dataset in the given time period.
        """
        # Get the units for the corresponding variables
        info_url = self._erddap.get_info_url(
            dataset_id=self._erddap.dataset_id, response='csv')
        info = pd.read_csv(info_url)
        units = info[info['Attribute Name'] == 'units']

        # Now, add the units to the variable names
        columns = []
        for var in self._erddap.variables:
            unit = units[units['Variable Name'] == var]['Value'].values
            if len(unit) == 0:
                columns.append(f'{var}')
            elif var == 'time':
                pass
            else:
                columns.append(f'{var} ({unit[0]})')

        # Create an array of nans to fill out the empty dataframe
        empty_array = np.empty((2, len(columns)))
        empty_array[:] = np.nan

        # Put the empty array into a dataframe
        empty_df = pd.DataFrame(data=empty_array,
                                columns=columns,
                                index=[self.startTime, self.now])
        empty_df.index.name = 'time (UTC)'

        return empty_df

    def process_datasets(self, datasets):
        """Process the data for individual datasets."""
        self.datasets = datasets

        # Get the data for the individual datasets
        for dset in self.datasets.keys():
            self.datasets.update({dset: self.get_dataset(dset)})

        # Process the data
        for dset in self.datasets.keys():
            if 'METBK' in dset:
                self.datasets[dset] = self.process_METBK_data(
                    self.datasets[dset])
            else:
                self.datasets[dset] = self.process_WAVSS_data(
                    self.datasets[dset])

        # Add a header to the data in the datasets
        for key in self.datasets.keys():
            header = key.split('-', 2)[-1]
            for col in self.datasets.get(key).columns:
                self.datasets.get(key).rename(
                    columns={col: ' '.join((header, col))}, inplace=True)

    def parse_data_to_xml(self, data):
        """
        Function which takes in the 10-minute average buoy data,
        the station name, and two dictionaries which map the buoy
        column names to the xml tags, and outputs an xml file in
        the NDBC format.

        Returns:
            xml - a properly constructed xml file in the NDBC
            format for the given buoy data
        """

        # Start the xml file
        xml = ['<?xml version="1.0" encoding="ISO-8859-1"?>']

        # Iterate through the data
        for index in data.index:

            # Get the data associated with a row in the dataframe
            row = data.loc[index]

            # Reset a dictionary of the data
            xml_data = {}
            for key in self.data_map.keys():
                xml_data.update({key: self.data_map.get(key)})

            # Parse the data into the data dictionary
            for key in xml_data.keys():
                # Get the column name which corresponds to the ndbc tag
                column = self.name_map.get(key)
                # Check that the column was returned from the ERDDAP server
                if column in row.index:
                    value = row[column]
                    # If a nan, just leave it the default -9999
                    if str(value) == 'nan':
                        pass
                    else:
                        xml_data[key] = value
                # If no data, leave it as default -9999
                else:
                    pass

            # Write the parsed data to the xml file
            # Start the message
            xml.append('<message>')

            # Add in the station id
            xml.append(f'  <station>{self.WMO}</station>')

            # Get the time index
            time = row.name.strftime('%m/%d/%Y %H:%M:%S')
            xml.append(f'  <date>{time}</date>')

            # Missing fill value
            missing = str(-9999)
            xml.append(f'  <missing>{missing}</missing>')

            # Roundtime
            xml.append('  <roundtime>no</roundtime>')

            # Start of the data
            xml.append('  <met>')

            # Add in each data piece
            for tag in xml_data.keys():
                # Get the value
                value = xml_data.get(tag)
                value = str(value)
                # Add the data to the xml file
                xml.append(f'    <{tag}>{value}</{tag}>')

            # Finish off the message
            xml.append('  </met>')
            xml.append('</message>')

        # Return the results
        return xml
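
# A minimal sketch of how the list of xml lines returned above might be used;
# `exporter` stands in for an instance of the class and `merged_data` for the
# processed 10-minute dataframe (both are placeholders, not part of the
# original example):
#
#     xml_lines = exporter.parse_data_to_xml(merged_data)
#     with open('ndbc_message.xml', 'w') as f:
#         f.write('\n'.join(xml_lines))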
Example #13
    'latitude>=': 37.0,
    'latitude<=': 43.43,
    'longitude>=': 317.56,
    'longitude<=': 322.87,
}
### Specify the variables (column names) to be retrieved.
e.variables = [
    'sample',
    'latitude',
    'longitude',
    'life_stage',
    'abundance',
    'time',
]
### Build the search URL for the query.
search_url = e.get_search_url(response='csv')
### Retrieve the requested data and save it into a dataframe.
search = pd.read_csv(search_url)
df = e.to_pandas()
### Get the current working directory and save the dataframe to a single CSV file there.
wd = os.getcwd()
df.to_csv(wd + '/DataFiles/plankton_swocecpr.csv')
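
### Note: the to_csv call above assumes a 'DataFiles' folder already exists under
### the working directory; a safer variant (writing the same file) creates it first:
out_dir = os.path.join(wd, 'DataFiles')
os.makedirs(out_dir, exist_ok=True)
df.to_csv(os.path.join(out_dir, 'plankton_swocecpr.csv'))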

#%%
"""
#########################################################################################################################
##Read the bio/non-bio data file for plankton data, filter it by depth and total wet mass,
##and plot a point on a map for each data point's lon/lat.
#########################################################################################################################
"""
import os
Example #14
# # Exploring an ERDDAP server

# In[5]:

from erddapy import ERDDAP


e = ERDDAP(server='https://erddap-uncabled.oceanobservatories.org/uncabled/erddap')


# In[6]:

import pandas as pd


df = pd.read_csv(e.get_search_url(response='csv', search_for='all'))


# In[7]:

'We have {} tabledap, {} griddap, and {} wms endpoints.'.format(
    len(set(df['tabledap'].dropna())),
    len(set(df['griddap'].dropna())),
    len(set(df['wms'].dropna()))
)


# # ERDDAP Advanced Search
# 
# Let's narrow the search area, time span, and look for *sea_water_temperature* only.
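
# In[ ]:

# A sketch of what that advanced search could look like; the bounding box and
# time window below are placeholder values, not taken from the original notebook.
kw = {
    'standard_name': 'sea_water_temperature',
    'min_lon': -72.0,
    'max_lon': -69.0,
    'min_lat': 38.0,
    'max_lat': 41.0,
    'min_time': '2017-07-01T00:00:00Z',
    'max_time': '2017-09-01T00:00:00Z',
}

search_url = e.get_search_url(response='csv', **kw)
df_sst = pd.read_csv(search_url)
df_sst[['Dataset ID', 'tabledap']].head()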
Example #15
    def list_data(self, verbose=False):
        e = ERDDAP(server=self.server_url)
        self.df = pd.read_csv(
            e.get_search_url(response='csv', search_for=self.glider_id))
        if verbose:
            print(self.df['Dataset ID'])
Example #16
def active_gliders(bbox=None, time_start=None, time_end=None, glider_id=None):
    # Evaluate the default end time at call time, not at import time
    bbox = bbox or [-100, -40, 18, 60]
    time_end = time_end or dt.date.today()
    time_start = time_start or (time_end - dt.timedelta(days=1))
    t0 = time_start.strftime('%Y-%m-%dT%H:%M:%SZ')
    t1 = time_end.strftime('%Y-%m-%dT%H:%M:%SZ')

    e = ERDDAP(server='NGDAC')

    # Grab every dataset available
    # datasets = pd.read_csv(e.get_search_url(response='csv', search_for='all'))

    # Search constraints
    kw = dict()
    kw['min_time'] = t0
    kw['max_time'] = t1

    if bbox:
        kw['min_lon'] = bbox[0]
        kw['max_lon'] = bbox[1]
        kw['min_lat'] = bbox[2]
        kw['max_lat'] = bbox[3]

    if glider_id:
        search = glider_id
    else:
        search = None

    search_url = e.get_search_url(search_for=search, response='csv', **kw)

    try:
        # Grab the results
        search = pd.read_csv(search_url)
    except Exception:
        # return an empty dataframe if there are no results
        return pd.DataFrame()

    # Extract the IDs
    gliders = search['Dataset ID'].values

    msg = 'Found {} Glider Datasets:\n\n{}'.format
    print(msg(len(gliders), '\n'.join(gliders)))

    # Setting constraints
    constraints = {
            'time>=': t0,
            'time<=': t1,
            'longitude>=': bbox[0],
            'longitude<=': bbox[1],
            'latitude>=': bbox[2],
            'latitude<=': bbox[3],
            }

    variables = [
            'depth',
            'latitude',
            'longitude',
            'time',
            'temperature',
            'salinity',
            ]

    e = ERDDAP(
            server='NGDAC',
            protocol='tabledap',
            response='nc'
    )

    glider_dfs = []

    for id in gliders:
        # print('Reading ' + id)
        e.dataset_id = id
        e.constraints = constraints
        e.variables = variables

        # skip datasets that return no data in the requested window
        try:
            df = e.to_pandas(
                index_col='time (UTC)',
                parse_dates=True,
                skiprows=(1,)  # units information can be dropped.
            ).dropna()
        except Exception:
            continue
        df = df.reset_index()
        df['dataset_id'] = id
        df = df.set_index(['dataset_id', 'time (UTC)'])
        glider_dfs.append(df)

    try:
        ndf = pd.concat(glider_dfs)
    except ValueError:
        return pd.DataFrame()

    return ndf
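
# A minimal usage sketch for the function above (the bounding box and start
# time are placeholder values; `dt` and `pd` are assumed to be imported as in
# the rest of this example):
bbox = [-75.0, -70.0, 36.0, 42.0]   # [lon_min, lon_max, lat_min, lat_max]
start = dt.date.today() - dt.timedelta(days=2)
gliders_df = active_gliders(bbox=bbox, time_start=start)
print(gliders_df.head())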
Example #17
class GdacClient(object):
    def __init__(self, erddap_url=None):

        self._logger = logging.getLogger(os.path.basename(__file__))

        self._erddap_url = erddap_url or 'https://gliders.ioos.us/erddap'
        self._protocol = 'tabledap'
        self._response_type = 'csv'
        self._items_per_page = 1e10
        self._page = 1
        self._client = ERDDAP(server=self._erddap_url,
                              protocol=self._protocol,
                              response=self._response_type)
        self._last_request = None

        # DataFrame containing the results of ERDDAP advanced search (endpoints, etc.)
        self._datasets_info = pd.DataFrame()
        # DataFrame containing dataset_id, start/end dates, profile count, etc.
        self._datasets_summaries = pd.DataFrame()
        self._datasets_profiles = pd.DataFrame()
        self._datasets_days = pd.DataFrame()

        self._profiles_variables = [
            'time', 'latitude', 'longitude', 'profile_id', 'wmo_id'
        ]

        self._valid_search_kwargs = {
            'institution', 'ioos_category', 'long_name', 'standard_name',
            'variable_name', 'min_lon', 'min_lat', 'max_lon', 'max_lat',
            'min_time', 'max_time'
        }

        self._months = [
            'January', 'February', 'March', 'April', 'May', 'June', 'July',
            'August', 'September', 'October', 'November', 'December'
        ]

        self._calendar_types = ['datasets', 'days', 'profiles']

    @property
    def datasets_info(self):
        return self._datasets_info

    @property
    def datasets_summaries(self):
        return self._datasets_summaries

    @property
    def datasets_profiles(self):
        return self._datasets_profiles

    @property
    def datasets_days(self):
        return self._datasets_days

    @property
    def dataset_ids(self):
        if self._datasets_summaries.empty:
            self._logger.warning('No data sets found')
            return

        return list(self._datasets_info['dataset_id'].values)

    @property
    def gliders(self):
        if self._datasets_summaries.empty:
            self._logger.warning('No data sets found')
            return

        return list(self._datasets_summaries.glider.unique())

    @property
    def profiles_per_yyyymmdd(self):
        return self._datasets_profiles.sum(axis=1)

    @property
    def profiles_per_year(self):
        return self._datasets_profiles.sum(
            axis=1).groupby(lambda x: x.year).sum()

    @property
    def glider_days_per_yyyymmdd(self):
        return self._datasets_days.sum(axis=1)

    @property
    def glider_days_per_year(self):
        return self._datasets_days.sum(axis=1).groupby(lambda x: x.year).sum()

    @property
    def deployments_per_yyyymmdd(self):
        return self._datasets_days.sum(axis=1)

    @property
    def deployments_per_year(self):
        return self._datasets_days.groupby(lambda x: x.year).any().sum(axis=1)

    @property
    def yearly_counts(self):

        columns = [
            self.deployments_per_year, self.glider_days_per_year,
            self.profiles_per_year
        ]
        totals = pd.DataFrame(columns).transpose().astype('i')
        totals.columns = ['deployments', 'glider days', 'profiles']
        totals.index.name = 'year'

        return totals

    @property
    def e(self):
        """erddapy.ERDDAP client"""
        return self._client

    @property
    def server(self):
        return self._client.server

    @property
    def response_type(self):
        return self._client.response

    @response_type.setter
    def response_type(self, response_type):
        self._client.response = response_type

    @property
    def last_request(self):
        return self._last_request

    def get_glider_datasets(self, glider):

        return self._datasets_summaries[self._datasets_summaries.glider ==
                                        glider].reset_index().drop('index',
                                                                   axis=1)

    def get_deployments_calendar(self, year=None):
        if not year:
            return self._datasets_days.groupby(
                [lambda x: x.year,
                 lambda x: x.month]).any().sum(axis=1).unstack()
        else:
            glider_days_by_yymmdd = self._datasets_days
            years = pd.to_datetime(glider_days_by_yymmdd.index).year.unique()
            if year not in years:
                self._logger.warning(
                    'No glider days found in year {:}'.format(year))
                return pd.DataFrame()
            return glider_days_by_yymmdd[pd.to_datetime(
                glider_days_by_yymmdd.index).year == year].groupby(
                    [lambda x: x.month,
                     lambda x: x.day]).any().sum(axis=1).unstack()

    def get_glider_days_calendar(self, year=None):
        if not year:
            return self._datasets_days.sum(axis=1).groupby(
                [lambda x: x.year, lambda x: x.month]).sum().unstack()
        else:
            glider_days_by_yymmdd = self._datasets_days.sum(axis=1)
            years = pd.to_datetime(glider_days_by_yymmdd.index).year.unique()
            if year not in years:
                self._logger.warning(
                    'No glider days found in year {:}'.format(year))
                return pd.DataFrame()
            return glider_days_by_yymmdd[pd.to_datetime(
                glider_days_by_yymmdd.index).year == year].groupby(
                    [lambda x: x.month, lambda x: x.day]).sum().unstack()

    def get_profiles_calendar(self, year=None):
        if not year:
            return self._datasets_profiles.sum(axis=1).groupby(
                [lambda x: x.year, lambda x: x.month]).sum().unstack()
        else:
            profiles_by_yymmdd = self._datasets_profiles.sum(axis=1)
            years = pd.to_datetime(profiles_by_yymmdd.index).year.unique()
            if year not in years:
                self._logger.warning(
                    'No profiles found in year {:}'.format(year))
                return pd.DataFrame()
            return profiles_by_yymmdd[pd.to_datetime(
                profiles_by_yymmdd.index).year == year].groupby(
                    [lambda x: x.month, lambda x: x.day]).sum().unstack()

    def search_datasets(self, search_for=None, delayedmode=False, **kwargs):
        """Search the ERDDAP server for glider deployment datasets.  Results are stored as pandas DataFrames in:

        self.deployments
        self.datasets

        Equivalent to ERDDAP's Advanced Search.  Searches can be performed by free text, bounding box, time bounds, etc.
        See the erddapy documentation for valid kwargs"""

        url = self._client.get_search_url(search_for=search_for, **kwargs)
        self._last_request = url

        glider_regex = re.compile(r'^(.*)-\d{8}T\d{4}')
        try:
            self._datasets_info = pd.read_csv(url)
            # Drop the allDatasets row
            self._datasets_info.drop(self._datasets_info[
                self._datasets_info['Dataset ID'] == 'allDatasets'].index,
                                     inplace=True)

            # Reset the index to start at 0
            self._datasets_info.reset_index(inplace=True)
            # Drop the index, griddap wms columns
            self._datasets_info.drop(['index', 'griddap', 'wms'],
                                     axis=1,
                                     inplace=True)

            # rename the columns to friendlier names
            columns = {
                s: s.replace(' ', '_').lower()
                for s in self._datasets_info.columns
            }
            self._datasets_info.rename(columns=columns, inplace=True)

            if not delayedmode:
                self._datasets_info = self._datasets_info[
                    ~self._datasets_info.dataset_id.str.endswith('delayed')]

            # Iterate through each data set (except for allDatasets) and grab the info page
            datasets = []
            daily_profiles = []
            datasets_days = []
            for i, row in self._datasets_info.iterrows():

                if row['dataset_id'] == 'allDatasets':
                    continue

                if delayedmode and not row['dataset_id'].endswith('delayed'):
                    continue
                elif not delayedmode and row['dataset_id'].endswith('delayed'):
                    continue

                self._logger.info('Fetching dataset: {:}'.format(
                    row['dataset_id']))

                # Get the data download url for erddap_vars
                try:
                    data_url = self._client.get_download_url(
                        dataset_id=row['dataset_id'],
                        variables=self._profiles_variables)
                except (ConnectionError, ConnectionRefusedError,
                        urllib3.exceptions.MaxRetryError) as e:
                    self._logger.error('{:} fetch failed: {:}'.format(
                        row['dataset_id'], e))
                    continue

                # Fetch the profiles into a pandas dataframe
                try:
                    profiles = pd.read_csv(data_url,
                                           skiprows=[1],
                                           index_col='time',
                                           parse_dates=True).sort_index()
                except HTTPError as e:
                    self._logger.error(
                        'Failed to fetch profiles: {:}'.format(e))
                    continue

                # Group profiles by yyyy-mm-dd and sum the number of profiles per day
                s = profiles.profile_id.dropna().groupby(
                    lambda x: x.date).count()
                s.name = row['dataset_id']
                daily_profiles.append(s)

                # Create the deployment date range
                d_index = pd.date_range(s.index.min(), s.index.max())
                deployment_days = pd.Series([1 for x in d_index],
                                            index=d_index,
                                            name=row['dataset_id'])
                datasets_days.append(deployment_days)

                glider_match = glider_regex.match(row['dataset_id'])
                glider = glider_match.groups()[0]

                # First profile time
                dt0 = profiles.index.min()
                # Last profile time
                dt1 = profiles.index.max()
                # Deployment length in days
                days = ceil((dt1 - dt0).total_seconds() / 86400)

                dataset_summary = [
                    glider, row['dataset_id'],
                    str(profiles.wmo_id.unique()[0]), dt0, dt1,
                    profiles.iloc[0]['latitude'],
                    profiles.iloc[0]['longitude'],
                    profiles.latitude.min(),
                    profiles.latitude.max(),
                    profiles.longitude.min(),
                    profiles.longitude.max(), profiles.shape[0], days
                ]

                datasets.append(dataset_summary)

            columns = [
                'glider', 'dataset_id', 'wmo_id', 'start_date', 'end_date',
                'deployment_lat', 'deployment_lon', 'lat_min', 'lat_max',
                'lon_min', 'lon_max', 'num_profiles', 'days'
            ]

            self._datasets_summaries = pd.DataFrame(datasets, columns=columns)

            # Create and store the DataFrame containing a 1 on each day the glider was deployed, 0 otherwise
            self._datasets_days = pd.concat(datasets_days, axis=1).sort_index()

            # Create and store the DataFrame containing the number of profiles on each day for each deployment
            self._datasets_profiles = pd.concat(daily_profiles,
                                                axis=1).sort_index()

        except HTTPError as e:
            self._logger.error(e)

        return

    def get_dataset_info(self, dataset_id):
        """Fetch the dataset metadata for the specified dataset_id"""

        if dataset_id not in self.dataset_ids:
            self._logger.error('Dataset id {:} not found in {:}'.format(
                dataset_id, self.__repr__()))
            return

        info = self._datasets_info[self._datasets_info.dataset_id ==
                                   dataset_id]
        info.reset_index(inplace=True)
        return info.drop('index', axis=1).transpose()

    def get_dataset_profiles(self, dataset_id):
        """Fetch all profiles (time, latitude, longitude, profile_id) for the specified dataset.  Profiles are sorted
        by ascending time"""

        if dataset_id not in self.dataset_ids:
            self._logger.error('Dataset id {:} not found in {:}'.format(
                dataset_id, self.__repr__()))
            return

        url = self._client.get_download_url(dataset_id=dataset_id,
                                            variables=self._profiles_variables)

        return pd.read_csv(url,
                           parse_dates=True,
                           skiprows=[1],
                           index_col='time').sort_index()

    def get_dataset_time_coverage(self, dataset_id):
        """Get the time coverage and wmo id (if specified) for specified dataset_id """
        if dataset_id not in self.dataset_ids:
            self._logger.error('Dataset id {:} not found in {:}'.format(
                dataset_id, self.__repr__()))
            return

        return self._datasets_summaries[[
            'dataset_id', 'start_date', 'end_date', 'wmo_id'
        ]].iloc[self.dataset_ids.index(dataset_id)]

    def get_dataset_time_series(self,
                                dataset_id,
                                variables,
                                min_time=None,
                                max_time=None):
        """Fetch the variables time-series for the specified dataset_id.  A time window can be specified using min_time
        and max_time, which must be ISO-8601 formatted date strings (i.e.: 'YYYY-mm-ddTHH:MM')

        Parameters
        dataset_id: valid dataset id from self.datasets
        variables: list of one or more valid variables in the dataset

        Options
        min_time: minimum time value formatted as 'YYYY-mm-ddTHH:MM[:SS]'
        max_time: maximum time value formatted as 'YYYY-mm-ddTHH:MM[:SS]'
        """
        if dataset_id not in self.dataset_ids:
            self._logger.error('Dataset id {:} not found in {:}'.format(
                dataset_id, self.__repr__()))
            return

        if not isinstance(variables, list):
            variables = [variables]

        all_variables = ['precise_time', 'time', 'depth'] + variables
        variables = set(all_variables)

        constraints = {}
        if min_time:
            constraints['precise_time>='] = min_time
        if max_time:
            constraints['precise_time<='] = max_time

        # Not sure why, but pd.read_csv doesn't like percent UNENCODED urls on data requests, so percent escape special
        # characters prior to sending the data request.
        data_url = self.encode_url(
            self._client.get_download_url(dataset_id=dataset_id,
                                          variables=variables,
                                          constraints=constraints))

        return pd.read_csv(
            data_url, skiprows=[1],
            parse_dates=True).set_index('precise_time').sort_index()

    def plot_yearly_totals(self,
                           totals_type=None,
                           palette='Blues_d',
                           **kwargs):
        """Bar chart plot of deployments, glider days and profiles, grouped by year"""
        totals = self.yearly_counts.reset_index()

        if totals_type and totals_type not in totals.columns:
            self._logger.error(
                'Invalid category specified: {:}'.format(totals_type))
            return

        if not totals_type:
            fig, (ax1, ax2, ax3) = plt.subplots(3,
                                                1,
                                                figsize=(8.5, 11),
                                                sharex=True)
            sns.barplot(x='year',
                        y='deployments',
                        ax=ax1,
                        data=totals,
                        palette=palette,
                        **kwargs)
            sns.barplot(x='year',
                        y='glider days',
                        ax=ax2,
                        data=totals,
                        palette=palette,
                        **kwargs)
            sns.barplot(x='year',
                        y='profiles',
                        ax=ax3,
                        data=totals,
                        palette=palette,
                        **kwargs)

            ax2.set_xlabel('')
            ax1.set_xlabel('')

            ax1.set_title('U.S. IOOS Glider Data Assembly Center')

            return fig, ax1, ax2, ax3

        else:
            ax = sns.barplot(x='year',
                             y=totals_type,
                             data=totals,
                             palette=palette,
                             **kwargs)
            ax.set_title('U.S. IOOS Glider Data Assembly Center')

            return ax.figure, ax

    def plot_datasets_calendar(self, calendar_type, year=None, cmap=None):
        """Heatmap of the specified calendar_type"""
        if calendar_type not in self._calendar_types:
            self._logger.error(
                'Invalid calendar type specified: {:}'.format(calendar_type))
            return

        if calendar_type == 'datasets':
            if not year:
                data = self.get_deployments_calendar()
                title = 'Active Real-Time Datasets'
            else:
                data = self.get_deployments_calendar(year)
                title = 'Active Real-Time Datasets: {:}'.format(year)
        elif calendar_type == 'days':
            if not year:
                data = self.get_glider_days_calendar()
                data.columns = self._months
                title = 'Glider In-Water Days'
            else:
                data = self.get_glider_days_calendar(year)
                title = 'Glider In-Water Days: {:}'.format(year)
        elif calendar_type == 'profiles':
            if not year:
                data = self.get_profiles_calendar()
                data.columns = self._months
                title = 'Real-Time Profiles'
            else:
                data = self.get_profiles_calendar(year)
                title = 'Real-Time Profiles: {:}'.format(year)
        else:
            self._logger.error(
                'Unknown calendar type: {:}'.format(calendar_type))
            return

        if data.empty:
            self._logger.warning('No results found')
            return

        if year:
            data.index = self._months
            plt.figure(figsize=(8.5, 4.))
            cb = True
            annotate = False
        else:
            data.columns = self._months
            plt.figure(figsize=(8.5, 8.5))
            cb = False
            annotate = True

        if cmap:
            ax = sns.heatmap(data,
                             annot=annotate,
                             fmt='.0f',
                             square=True,
                             cbar=cb,
                             linewidths=0.5,
                             cmap=cmap)
        else:
            ax = sns.heatmap(data,
                             annot=annotate,
                             fmt='.0f',
                             square=True,
                             cbar=cb,
                             linewidths=0.5)

        ax.invert_yaxis()
        _ = [ytick.set_rotation(0) for ytick in ax.get_yticklabels()]
        ax.set_title(title)

        return ax

    def plot_dataset_profiles_calendar(self, dataset_id, **heatmap_kwargs):
        """Plot the heatmap profiles/day calendar for the specified dataset"""
        if dataset_id not in self.dataset_ids:
            self._logger.error('Dataset id {:} not found in {:}'.format(
                dataset_id, self.__repr__()))
            return

        profiles = self.get_dataset_profiles(dataset_id)
        if profiles.empty:
            self._logger.warning(
                'No profiles found for dataset: {:}'.format(dataset_id))
            return

        pgroup = profiles.latitude.groupby(
            [lambda x: x.year, lambda x: x.month, lambda x: x.day]).count()
        calendar = pgroup.unstack()

        annotate = True
        square = True
        cbar = False
        annot_kws = {'fontsize': 10}
        annot_kws = {}

        fig = plt.figure(figsize=(11, 8.5))

        ax = sns.heatmap(calendar,
                         annot=annotate,
                         fmt='.0f',
                         square=square,
                         cbar=cbar,
                         linewidths=0.5,
                         annot_kws=annot_kws)

        # Format default y-tick labels to 'mmm YYYY'
        ylabels = [y.get_text() for y in ax.get_yticklabels()]
        new_ylabels = []
        for ylabel in ylabels:
            y, m = ylabel.split('-')
            new_ylabels.append('{:} {:}'.format(self._months[int(m) - 1][0:3],
                                                y))
        ax.set_yticklabels(new_ylabels)

        ax.set_ylabel('')
        ax.invert_yaxis()
        _ = [ytick.set_rotation(0) for ytick in ax.get_yticklabels()]

        ax.set_title('Profiles: {:}'.format(dataset_id))

        return ax

    @staticmethod
    def encode_url(data_url):
        """Percent encode special url characters."""
        url_pieces = list(urlsplit(data_url))
        url_pieces[3] = quote(url_pieces[3])

        return urlunsplit(url_pieces)

    def __repr__(self):
        return "<GdacClient(server='{:}', response='{:}', num_datasets={:})>".format(
            self._client.server, self._client.response,
            len(self._datasets_info))
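
# A rough usage sketch for GdacClient; the search term and time window below
# are placeholder values:
client = GdacClient()
client.search_datasets(search_for='ru29',
                       min_time='2021-01-01T00:00:00Z',
                       max_time='2021-06-01T00:00:00Z')
print(client.yearly_counts)
profiles = client.get_dataset_profiles(client.dataset_ids[0])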
Example #18
class ErddapReader:
    

    def __init__(self, known_server='ioos', protocol=None, server=None, parallel=True):
        
#         # run checks for KW 
#         self.kw = kw

        self.parallel = parallel
    
        
        # either select a known server or input protocol and server string
        if known_server == 'ioos':
            protocol = 'tabledap'
            server = 'http://erddap.sensors.ioos.us/erddap'
        elif known_server == 'coastwatch':
            protocol = 'griddap'
            server = 'http://coastwatch.pfeg.noaa.gov/erddap'
        elif known_server is not None:
            statement = 'either select a known server or input protocol and server string'
            assert (protocol is not None) & (server is not None), statement
        else:
            known_server = server.split('://')[-1].rstrip('/').replace('/erddap', '').replace('.', '_')
            statement = 'either select a known server or input protocol and server string'
            assert (protocol is not None) & (server is not None), statement
        
        self.known_server = known_server
        self.e = ERDDAP(server=server)
        self.e.protocol = protocol
        self.e.server = server
                  
        # columns for metadata
        self.columns = ['geospatial_lat_min', 'geospatial_lat_max', 
               'geospatial_lon_min', 'geospatial_lon_max', 
               'time_coverage_start', 'time_coverage_end',
               'defaultDataQuery', 'subsetVariables',  # first works for timeseries sensors, 2nd for gliders
               'keywords',  # for hf radar
               'id', 'infoUrl', 'institution', 'featureType', 'source', 'sourceUrl']
        
        # name
        self.name = f'erddap_{known_server}'
        
        self.reader = 'ErddapReader'
        
# #         self.data_type = data_type
#         self.standard_names = standard_names
#         # DOESN'T CURRENTLY LIMIT WHICH VARIABLES WILL BE FOUND ON EACH SERVER

    
    
    @property
    def dataset_ids(self):
        '''Find dataset_ids for server.'''
        
        if not hasattr(self, '_dataset_ids'):
            
            # This should be a region search
            if self.approach == 'region':
        
                # find all the dataset ids which we will use to get the data
                # This limits the search to our keyword arguments in kw which should 
                # have min/max lon/lat/time values
                dataset_ids = []
                if self.variables is not None:
                    for variable in self.variables:

                        # find and save all dataset_ids associated with variable
                        search_url = self.e.get_search_url(response="csv", **self.kw, 
                                                           variableName=variable, 
                                                           items_per_page=10000)

                        try:
                            search = pd.read_csv(search_url)
                            dataset_ids.extend(search["Dataset ID"])
                        except Exception as e:
                            logger_erd.exception(e)
                            logger_erd.warning(f"variable {variable} was not found in the search")
                            logger_erd.warning(f'search_url: {search_url}')

                else:
                    
                    # find and save all dataset_ids associated with variable
                    search_url = self.e.get_search_url(response="csv", **self.kw, 
                                                       items_per_page=10000)

                    try:
                        search = pd.read_csv(search_url)
                        dataset_ids.extend(search["Dataset ID"])
                    except Exception as e:
                        logger_erd.exception(e)
                        logger_erd.warning(f"nothing found in the search")
                        logger_erd.warning(f'search_url: {search_url}')

                    
                # only need a dataset id once since we will check them each for all standard_names
                self._dataset_ids = list(set(dataset_ids))
            
            # This should be a search for the station names
            elif self.approach == 'stations':
#             elif self._stations is not None:
                
                # search by station name for each of stations
                dataset_ids = []
                for station in self._stations:
                    # if the station name has more than one word, ERDDAP will AND the
                    # terms together when searching for them
                    url = self.e.get_search_url(response="csv", items_per_page=5, search_for=station)

                    try:
                        df = pd.read_csv(url)
                    except Exception as e:
                        logger_erd.exception(e)
                        logger_erd.warning(f'search url {url} did not work for station {station}.')
                        continue
    
                    # first try for exact station match
                    try:
                        dataset_id = [dataset_id for dataset_id in df['Dataset ID'] if station.lower() in dataset_id.lower().split('_')][0]

                    # if that doesn't work, try a more general match and just take the first returned option
                    except Exception as e:
                        logger_erd.exception(e)
                        logger_erd.warning('When searching for a dataset id to match station name %s, the first attempt to match the id did not work.' % (station))
                        dataset_id = df.iloc[0]['Dataset ID']
        
#                         if 'tabs' in org_id:  # don't split
#                             axiom_id = [axiom_id for axiom_id in df['Dataset ID'] if org_id.lower() == axiom_id.lower()]
#                         else:
#                             axiom_id = [axiom_id for axiom_id in df['Dataset ID'] if org_id.lower() in axiom_id.lower().split('_')][0]
                
#                     except:
#                         dataset_id = None
                
                    dataset_ids.append(dataset_id)
                    
                self._dataset_ids = list(set(dataset_ids))
                
            else:
                logger_erd.warning('Neither stations nor region approach were used in function dataset_ids.')
                
            
        return self._dataset_ids
        
    
    def meta_by_dataset(self, dataset_id):

        info_url = self.e.get_info_url(response="csv", dataset_id=dataset_id)
        info = pd.read_csv(info_url)

        items = []

        for col in self.columns:

            try:
                item = info[info['Attribute Name'] == col]['Value'].values[0]
                dtype = info[info['Attribute Name'] == col]['Data Type'].values[0]
            except (IndexError, KeyError):
                # attribute not present for this dataset; treat it as a string
                dtype = 'String'
                if col == 'featureType':
                    # this column is not present in HF Radar metadata but want it to
                    # map to data_type, so input 'grid' in that case.
                    item = 'grid'
                else:
                    item = 'NA'

            if dtype == 'String':
                pass
            elif dtype == 'double':
                item = float(item)
            elif dtype == 'int':
                item = int(item)
            items.append(item)
            
#         if self.standard_names is not None:
#             # In case the variable is named differently from the standard names, 
#             # we back out the variable names here for each dataset. This also only 
#             # returns those names for which there is data in the dataset.
#             varnames = self.e.get_var_by_attr(
#                 dataset_id=dataset_id,
#                 standard_name=lambda v: v in self.standard_names
#             )
#         else:
#             varnames = None

        ## include download link ##
        self.e.dataset_id = dataset_id
        if self.e.protocol == 'tabledap':
            if self.variables is not None:
                self.e.variables = ["time","longitude", "latitude", "station"] + self.variables
            # set the same time constraints as before
            self.e.constraints = {'time<=': self.kw['max_time'], 'time>=': self.kw['min_time'],}
            download_url = self.e.get_download_url(response='csvp')

        elif self.e.protocol == 'griddap':
            # the search terms that can be input for tabledap do not work for griddap
            # in erddapy currently. Instead, put together an opendap link and then 
            # narrow the dataset with xarray.
            # get opendap link
            download_url = self.e.get_download_url(response='opendap')
        
        # add erddap server name
        return {dataset_id: [self.e.server, download_url] + items + [self.variables]}
    
      
    @property
    def meta(self):
        
        if not hasattr(self, '_meta'):
            
            if self.parallel:
            
                # get metadata for datasets
                # run in parallel to save time
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.meta_by_dataset)(dataset_id) for dataset_id in self.dataset_ids
                )
                
            else:

                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.meta_by_dataset(dataset_id))

            # make dict from individual dicts
            from collections import ChainMap
            meta = dict(ChainMap(*downloads)) 

            # Make dataframe of metadata
            # variable names are the column names for the dataframe
            self._meta = pd.DataFrame.from_dict(meta, orient='index', 
                                                columns=['database','download_url'] \
                                                + self.columns + ['variable names'])
           
        return self._meta       
    
    
    def data_by_dataset(self, dataset_id):

        download_url = self.meta.loc[dataset_id, 'download_url']
        # data variables in ds that are not the variables we searched for
#         varnames = self.meta.loc[dataset_id, 'variable names']

        if self.e.protocol == 'tabledap':

            try:

                # fetch metadata if not already present
                # found download_url from metadata and use
                dd = pd.read_csv(download_url, index_col=0, parse_dates=True)
                
                # Drop cols and rows that are only NaNs.
                dd = dd.dropna(axis='index', how='all').dropna(axis='columns', how='all')

                if self.variables is not None:
                    # check to see if there is any actual data
                    # this is a bit convoluted because the column names are the variable names 
                    # plus units so can't match 1 to 1.
                    datacols = 0  # number of columns that represent data instead of metadata
                    for col in dd.columns:
                        datacols += [varname in col for varname in self.variables].count(True)
                    # if no datacols, we can skip this one.
                    if datacols == 0:
                        dd = None
                    
            except Exception as e:
                logger_erd.exception(e)
                logger_erd.warning('no data to be read in for %s' % dataset_id)
                dd = None
        
        elif self.e.protocol == 'griddap':

            try:
                dd = xr.open_dataset(download_url, chunks='auto').sel(time=slice(self.kw['min_time'],self.kw['max_time']))

                if ('min_lat' in self.kw) and ('max_lat' in self.kw):
                    dd = dd.sel(latitude=slice(self.kw['min_lat'],self.kw['max_lat']))

                if ('min_lon' in self.kw) and ('max_lon' in self.kw):
                    dd = dd.sel(longitude=slice(self.kw['min_lon'],self.kw['max_lon']))

                # use variable names to drop other variables (should I do this?)
                if self.variables is not None:
                    l = set(dd.data_vars) - set(self.variables)
                    dd = dd.drop_vars(l)
                
            except Exception as e:
                logger_erd.exception(e)
                logger_erd.warning('no data to be read in for %s' % dataset_id)
                dd = None
                
        return (dataset_id, dd)


    @property
    def data(self):
        
        if not hasattr(self, '_data'):
            
            if self.parallel:
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.data_by_dataset)(dataset_id) for dataset_id in self.dataset_ids
                )
            else:
                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.data_by_dataset(dataset_id))

#             if downloads is not None:
            dds = {dataset_id: dd for (dataset_id, dd) in downloads}
#             else:
#                 dds = None

            self._data = dds

        return self._data
    
    
    def count(self, url):
        try:
            return len(pd.read_csv(url))
        except Exception:
            return np.nan

    
    def all_variables(self):
        '''Return a list of all possible variables.'''
        
        file_name_counts = f'erddap_variable_list_{self.known_server}.csv'
        
        if os.path.exists(file_name_counts):
            return pd.read_csv(file_name_counts, index_col='variable')
        else:
            # This took 10 min running in parallel for ioos
            # 2 min for coastwatch
            url = f'{self.e.server}/categorize/variableName/index.csv?page=1&itemsPerPage=100000'
            df = pd.read_csv(url)
#             counts = []
#             for url in df.URL:
#                 counts.append(self.count(url))
            num_cores = multiprocessing.cpu_count()
            counts = Parallel(n_jobs=num_cores)(
                delayed(self.count)(url) for url in df.URL
            )
            dfnew = pd.DataFrame()
            dfnew['variable'] = df['Category']
            dfnew['count'] = counts
            dfnew = dfnew.set_index('variable')
            # remove nans
            if (dfnew.isnull().sum() > 0).values:
                dfnew = dfnew[~dfnew.isnull().values].astype(int)
            dfnew.to_csv(file_name_counts)
        
        return dfnew


    def search_variables(self, variables):
        '''Find valid variables names to use.
        
        Call with `search_variables()` to return the list of possible names.
        Call with `search_variables('salinity')` to return relevant names.
        '''
        
        if not isinstance(variables, list):
            variables = [variables]
        
        # set up search for input variables
        search = f"(?i)"
        for variable in variables:
            search += f".*{variable}|"
        search = search.strip('|')

        r = re.compile(search)
        
        # just get the variable names
        df = self.all_variables()
        parameters = df.index

        matches = list(filter(r.match, parameters))

        # return parameters that match input variable strings
        return df.loc[matches].sort_values('count', ascending=False)
    
    
    def check_variables(self, variables, verbose=False):
        
        if not isinstance(variables, list):
            variables = [variables]
            
#         parameters = list(self.all_variables().keys())
        parameters = list(self.all_variables().index)
        
        # for a variable to exactly match a parameter 
        # this should equal 1
        count = []
        for variable in variables:
            count += [parameters.count(variable)]
        
        condition = np.allclose(count,1)
        
        assertion = f'The input variables are not exact matches to ok variables for known_server {self.known_server}. \
                     \nCheck all parameter group values with `ErddapReader().all_variables()` \
                     \nor search parameter group values with `ErddapReader().search_variables({variables})`.\
                     \n\n Try some of the following variables:\n{str(self.search_variables(variables))}'# \
#                      \nor run `ErddapReader().check_variables("{variables}")'
        assert condition, assertion
        
        if condition and verbose:
            print('all variables are matches!')
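
# The reader above expects `approach`, `kw`, and (optionally) `variables` to be
# set by the calling code, which is not shown in this example; a rough sketch of
# a region search with placeholder values:
reader = ErddapReader(known_server='ioos', parallel=False)
reader.approach = 'region'
reader.variables = ['sea_water_temperature']
reader.kw = {'min_lon': -99.0, 'max_lon': -88.0,
             'min_lat': 20.0, 'max_lat': 31.0,
             'min_time': '2021-07-01', 'max_time': '2021-07-02'}
print(reader.dataset_ids)
meta = reader.meta
data = reader.data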
#%%

#tend = datetime(2019, 7, 27, 0, 0)
#tini = datetime(2019, 7, 28, 0, 0)

tini = datetime(2019, 9, 14, 0, 0)
tend = datetime(2019, 9, 15, 0, 0)


#%% Look for datasets in IOOS glider dac
print('Looking for glider data sets')
e = ERDDAP(server = url_glider)

# Grab every dataset available
datasets = pd.read_csv(e.get_search_url(response='csv', search_for='all'))

# Search constraints
kw = {
    'min_lon': lon_lim[0],
    'max_lon': lon_lim[1],
    'min_lat': lat_lim[0],
    'max_lat': lat_lim[1],
    'min_time': tini.strftime('%Y-%m-%dT%H:%M:%SZ'),
    'max_time': tend.strftime('%Y-%m-%dT%H:%M:%SZ'),
}

search_url = e.get_search_url(response='csv', **kw)
#print(search_url)

# Grab the results
Example #20
def GOFS_RTOFS_vs_Argo_floats(lon_forec_track, lat_forec_track, lon_forec_cone,
                              lat_forec_cone, lon_best_track, lat_best_track,
                              lon_lim, lat_lim, folder_fig):
    #%% User input

    #GOFS3.1 output model location
    url_GOFS_ts = 'http://tds.hycom.org/thredds/dodsC/GLBy0.08/expt_93.0/ts3z'

    # RTOFS files
    folder_RTOFS = '/home/coolgroup/RTOFS/forecasts/domains/hurricanes/RTOFS_6hourly_North_Atlantic/'

    nc_files_RTOFS = ['rtofs_glo_3dz_f006_6hrly_hvr_US_east.nc',\
                      'rtofs_glo_3dz_f012_6hrly_hvr_US_east.nc',\
                      'rtofs_glo_3dz_f018_6hrly_hvr_US_east.nc',\
                      'rtofs_glo_3dz_f024_6hrly_hvr_US_east.nc']

    # COPERNICUS MARINE ENVIRONMENT MONITORING SERVICE (CMEMS)
    url_cmems = 'http://nrt.cmems-du.eu/motu-web/Motu'
    service_id = 'GLOBAL_ANALYSIS_FORECAST_PHY_001_024-TDS'
    product_id = 'global-analysis-forecast-phy-001-024'
    depth_min = '0.493'
    out_dir = '/home/aristizabal/crontab_jobs'

    # Bathymetry file
    #bath_file = '/Users/aristizabal/Desktop/MARACOOS_project/Maria_scripts/nc_files/GEBCO_2014_2D_-100.0_0.0_-60.0_45.0.nc'
    bath_file = '/home/aristizabal/bathymetry_files/GEBCO_2014_2D_-100.0_0.0_-10.0_50.0.nc'

    # Argo floats
    url_Argo = 'http://www.ifremer.fr/erddap'

    #%%

    from matplotlib import pyplot as plt
    import numpy as np
    import xarray as xr
    import netCDF4
    from datetime import datetime, timedelta
    import cmocean
    import matplotlib.dates as mdates
    from erddapy import ERDDAP
    import pandas as pd
    import os

    # Do not produce figures on screen
    plt.switch_backend('agg')

    # Increase fontsize of labels globally
    plt.rc('xtick', labelsize=14)
    plt.rc('ytick', labelsize=14)
    plt.rc('legend', fontsize=14)

    #%% Reading bathymetry data

    ncbath = xr.open_dataset(bath_file)
    bath_lat = ncbath.variables['lat'][:]
    bath_lon = ncbath.variables['lon'][:]
    bath_elev = ncbath.variables['elevation'][:]

    oklatbath = np.logical_and(bath_lat >= lat_lim[0], bath_lat <= lat_lim[-1])
    oklonbath = np.logical_and(bath_lon >= lon_lim[0], bath_lon <= lon_lim[-1])

    bath_latsub = bath_lat[oklatbath]
    bath_lonsub = bath_lon[oklonbath]
    bath_elevs = bath_elev[oklatbath, :]
    bath_elevsub = bath_elevs[:, oklonbath]

    #%% Get time bounds for current day
    #ti = datetime.today()
    ti = datetime.today() - timedelta(1) - timedelta(hours=6)
    tini = datetime(ti.year, ti.month, ti.day)
    te = ti + timedelta(2)
    tend = datetime(te.year, te.month, te.day)

    #%% Look for Argo datasets

    e = ERDDAP(server=url_Argo)

    # Grab every dataset available
    #datasets = pd.read_csv(e.get_search_url(response='csv', search_for='all'))

    kw = {
        'min_lon': lon_lim[0],
        'max_lon': lon_lim[1],
        'min_lat': lat_lim[0],
        'max_lat': lat_lim[1],
        'min_time': str(tini),
        'max_time': str(tend),
    }

    search_url = e.get_search_url(response='csv', **kw)

    # Grab the results
    search = pd.read_csv(search_url)

    # Extract the IDs
    dataset = search['Dataset ID'].values

    msg = 'Found {} Datasets:\n\n{}'.format
    print(msg(len(dataset), '\n'.join(dataset)))

    dataset_type = dataset[0]

    constraints = {
        'time>=': str(tini),
        'time<=': str(tend),
        'latitude>=': lat_lim[0],
        'latitude<=': lat_lim[1],
        'longitude>=': lon_lim[0],
        'longitude<=': lon_lim[1],
    }

    variables = [
        'platform_number',
        'time',
        'pres',
        'longitude',
        'latitude',
        'temp',
        'psal',
    ]

    e = ERDDAP(server=url_Argo, protocol='tabledap', response='nc')

    e.dataset_id = dataset_type
    e.constraints = constraints
    e.variables = variables

    print(e.get_download_url())

    df = e.to_pandas(
        parse_dates=True,
        skiprows=(1, )  # units information can be dropped.
    ).dropna()

    argo_ids = np.asarray(df['platform_number'])
    argo_times = np.asarray(df['time (UTC)'])
    argo_press = np.asarray(df['pres (decibar)'])
    argo_lons = np.asarray(df['longitude (degrees_east)'])
    argo_lats = np.asarray(df['latitude (degrees_north)'])
    argo_temps = np.asarray(df['temp (degree_Celsius)'])
    argo_salts = np.asarray(df['psal (PSU)'])

    #%% GOFS 3.1

    try:
        GOFS_ts = xr.open_dataset(url_GOFS_ts, decode_times=False)

        lt_GOFS = np.asarray(GOFS_ts['lat'][:])
        ln_GOFS = np.asarray(GOFS_ts['lon'][:])
        tt = GOFS_ts['time']
        t_GOFS = netCDF4.num2date(tt[:], tt.units)
        depth_GOFS = np.asarray(GOFS_ts['depth'][:])
    except Exception as err:
        print(err)
        GOFS_ts = np.nan
        lt_GOFS = np.nan
        ln_GOFS = np.nan
        depth_GOFS = np.nan
        t_GOFS = ti

    #%% Map Argo floats

    lev = np.arange(-9000, 9100, 100)
    plt.figure()
    plt.contourf(bath_lonsub,
                 bath_latsub,
                 bath_elevsub,
                 lev,
                 cmap=cmocean.cm.topo)
    plt.plot(lon_forec_track, lat_forec_track, '.-', color='gold')
    plt.plot(lon_forec_cone, lat_forec_cone, '.-b', markersize=1)
    plt.plot(lon_best_track, lat_best_track, 'or', markersize=3)

    argo_idd = np.unique(argo_ids)
    for i, id in enumerate(argo_idd):
        okind = np.where(argo_ids == id)[0]
        plt.plot(np.unique(argo_lons[okind]),
                 np.unique(argo_lats[okind]),
                 's',
                 color='darkorange',
                 markersize=5,
                 markeredgecolor='k')

    plt.title('Argo Floats ' + str(tini)[0:13] + '-' + str(tend)[0:13],
              fontsize=16)
    plt.axis('scaled')
    plt.xlim(lon_lim[0], lon_lim[1])
    plt.ylim(lat_lim[0], lat_lim[1])

    file = folder_fig + 'ARGO_lat_lon'
    #file = folder_fig + 'ARGO_lat_lon_' + str(np.unique(argo_times)[0])[0:10]
    plt.savefig(file, bbox_inches='tight', pad_inches=0.1)

    #%% Figure argo float vs GOFS and vs RTOFS

    argo_idd = np.unique(argo_ids)

    for i, id in enumerate(argo_idd):
        print(id)
        okind = np.where(argo_ids == id)[0]
        argo_time = np.asarray([
            datetime.strptime(t, '%Y-%m-%dT%H:%M:%SZ')
            for t in argo_times[okind]
        ])

        argo_lon = argo_lons[okind]
        argo_lat = argo_lats[okind]
        argo_pres = argo_press[okind]
        argo_temp = argo_temps[okind]
        argo_salt = argo_salts[okind]

        # GOFS
        print('Retrieving variables from GOFS')
        if isinstance(GOFS_ts, float):
            temp_GOFS = np.nan
            salt_GOFS = np.nan
        else:
            #oktt_GOFS = np.where(t_GOFS >= argo_time[0])[0][0]
            ttGOFS = np.asarray([
                datetime(t_GOFS[i].year, t_GOFS[i].month, t_GOFS[i].day,
                         t_GOFS[i].hour) for i in np.arange(len(t_GOFS))
            ])
            tstamp_GOFS = [
                mdates.date2num(ttGOFS[i]) for i in np.arange(len(ttGOFS))
            ]
            oktt_GOFS = np.unique(
                np.round(
                    np.interp(mdates.date2num(argo_time[0]), tstamp_GOFS,
                              np.arange(len(tstamp_GOFS)))).astype(int))[0]
            oklat_GOFS = np.where(lt_GOFS >= argo_lat[0])[0][0]
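            # GOFS longitudes run from 0 to 360 degrees, so shift the Argo
            # longitude (given in [-180, 180]) by +360 before matching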
            oklon_GOFS = np.where(ln_GOFS >= argo_lon[0] + 360)[0][0]
            temp_GOFS = np.asarray(GOFS_ts['water_temp'][oktt_GOFS, :,
                                                         oklat_GOFS,
                                                         oklon_GOFS])
            salt_GOFS = np.asarray(GOFS_ts['salinity'][oktt_GOFS, :,
                                                       oklat_GOFS, oklon_GOFS])

        # RTOFS
        #Time window
        year = int(argo_time[0].year)
        month = int(argo_time[0].month)
        day = int(argo_time[0].day)
        tini = datetime(year, month, day)
        tend = tini + timedelta(days=1)

        # Read RTOFS grid and time
        print('Retrieving coordinates from RTOFS')

        # RTOFS folder name has the form rtofs.YYYYMMDD
        fol = 'rtofs.' + tini.strftime('%Y%m%d')

        ncRTOFS = xr.open_dataset(folder_RTOFS + fol + '/' + nc_files_RTOFS[0])
        latRTOFS = np.asarray(ncRTOFS.Latitude[:])
        lonRTOFS = np.asarray(ncRTOFS.Longitude[:])
        depth_RTOFS = np.asarray(ncRTOFS.Depth[:])

        tRTOFS = []
        for t in np.arange(len(nc_files_RTOFS)):
            ncRTOFS = xr.open_dataset(folder_RTOFS + fol + '/' +
                                      nc_files_RTOFS[t])
            tRTOFS.append(np.asarray(ncRTOFS.MT[:])[0])

        tRTOFS = np.asarray([mdates.num2date(mdates.date2num(tRTOFS[t])) \
                  for t in np.arange(len(nc_files_RTOFS))])

        oktt_RTOFS = np.where(
            mdates.date2num(tRTOFS) >= mdates.date2num(argo_time[0]))[0][0]
        oklat_RTOFS = np.where(latRTOFS[:, 0] >= argo_lat[0])[0][0]
        oklon_RTOFS = np.where(lonRTOFS[0, :] >= argo_lon[0])[0][0]
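        # latRTOFS/lonRTOFS are 2-D (curvilinear) arrays, so the first column
        # and first row are used to find the indices at or beyond the float
        # position.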

        nc_file = folder_RTOFS + fol + '/' + nc_files_RTOFS[oktt_RTOFS]
        ncRTOFS = xr.open_dataset(nc_file)
        #time_RTOFS = tRTOFS[oktt_RTOFS]
        temp_RTOFS = np.asarray(ncRTOFS.variables['temperature'][0, :,
                                                                 oklat_RTOFS,
                                                                 oklon_RTOFS])
        salt_RTOFS = np.asarray(ncRTOFS.variables['salinity'][0, :,
                                                              oklat_RTOFS,
                                                              oklon_RTOFS])
        #lon_RTOFS = lonRTOFS[0,oklon_RTOFS]
        #lat_RTOFS = latRTOFS[oklat_RTOFS,0]

        # Downloading and reading Copernicus output
        motuc = 'python -m motuclient --motu ' + url_cmems + \
        ' --service-id ' + service_id + \
        ' --product-id ' + product_id + \
        ' --longitude-min ' + str(argo_lon[0]-2/12) + \
        ' --longitude-max ' + str(argo_lon[0]+2/12) + \
        ' --latitude-min ' + str(argo_lat[0]-2/12) + \
        ' --latitude-max ' + str(argo_lat[0]+2/12) + \
        ' --date-min ' + '"' + str(tini-timedelta(0.5)) + '"' + \
        ' --date-max ' + '"' + str(tend+timedelta(0.5)) + '"' + \
        ' --depth-min ' + depth_min + \
        ' --depth-max ' + str(np.nanmax(argo_pres)+1000) + \
        ' --variable ' + 'thetao' + ' ' + \
        ' --variable ' + 'so'  + ' ' + \
        ' --out-dir ' + out_dir + \
        ' --out-name ' + str(id) + '.nc' + ' ' + \
        ' --user ' + 'maristizabalvar' + ' ' + \
        ' --pwd ' +  'MariaCMEMS2018'
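        # Note: the CMEMS credentials are hard-coded above; in practice they
        # would typically be read from the environment, e.g. (hypothetical
        # variable names):
        # ' --user ' + os.environ['CMEMS_USER'] + ' --pwd ' + os.environ['CMEMS_PWD']
        # subprocess.run(motuc, shell=True) would also be a drop-in alternative
        # to os.system(motuc) that exposes the return code directly.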

        os.system(motuc)
        # Check that the file was actually downloaded before trying to read it
        COP_file = out_dir + '/' + str(id) + '.nc'
        if os.path.exists(COP_file):
            COP = xr.open_dataset(COP_file)

            latCOP = np.asarray(COP.latitude[:])
            lonCOP = np.asarray(COP.longitude[:])
            depth_COP = np.asarray(COP.depth[:])
            tCOP = np.asarray(mdates.num2date(mdates.date2num(COP.time[:])))

            oktimeCOP = np.where(
                mdates.date2num(tCOP) >= mdates.date2num(tini))[0][0]
            oklonCOP = np.where(lonCOP >= argo_lon[0])[0][0]
            oklatCOP = np.where(latCOP >= argo_lat[0])[0][0]

            temp_COP = np.asarray(COP.variables['thetao'][oktimeCOP, :,
                                                          oklatCOP, oklonCOP])
            salt_COP = np.asarray(COP.variables['so'][oktimeCOP, :,
                                                      oklatCOP, oklonCOP])
        else:
            # Download failed: fall back to NaN profiles so the figures below
            # can still be produced
            depth_COP = np.array([np.nan])
            temp_COP = np.array([np.nan])
            salt_COP = np.array([np.nan])

        # Figure temp
        plt.figure(figsize=(5, 6))
        plt.plot(argo_temp,
                 -argo_pres,
                 '.-',
                 linewidth=2,
                 label='ARGO Float id ' + str(id))
        plt.plot(temp_GOFS,
                 -depth_GOFS,
                 '.-',
                 linewidth=2,
                 label='GOFS 3.1',
                 color='red')
        plt.plot(temp_RTOFS,
                 -depth_RTOFS,
                 '.-',
                 linewidth=2,
                 label='RTOFS',
                 color='g')
        plt.plot(temp_COP,
                 -depth_COP,
                 '.-',
                 linewidth=2,
                 label='Copernicus',
                 color='darkorchid')
        plt.ylim([-1000, 0])
        plt.title('Temperature Profile on ' + str(argo_time[0])[0:13] +
                  '\n [lon,lat] = [' + str(np.round(argo_lon[0], 3)) + ',' +
                  str(np.round(argo_lat[0], 3)) + ']',
                  fontsize=16)
        plt.ylabel('Depth (m)', fontsize=14)
        plt.xlabel('$^oC$', fontsize=14)
        plt.legend(loc='lower right', fontsize=14)

        file = folder_fig + 'ARGO_vs_GOFS_RTOFS_COP_temp_' + str(id)
        plt.savefig(file, bbox_inches='tight', pad_inches=0.1)

        # Figure salt
        plt.figure(figsize=(5, 6))
        plt.plot(argo_salt,
                 -argo_pres,
                 '.-',
                 linewidth=2,
                 label='ARGO Float id ' + str(id))
        plt.plot(salt_GOFS,
                 -depth_GOFS,
                 '.-',
                 linewidth=2,
                 label='GOFS 3.1',
                 color='red')
        plt.plot(salt_RTOFS,
                 -depth_RTOFS,
                 '.-',
                 linewidth=2,
                 label='RTOFS',
                 color='g')
        plt.plot(salt_COP,
                 -depth_COP,
                 '.-',
                 linewidth=2,
                 label='Copernicus',
                 color='darkorchid')
        plt.ylim([-1000, 0])
        plt.title('Salinity Profile on ' + str(argo_time[0])[0:13] +
                  '\n [lon,lat] = [' + str(np.round(argo_lon[0], 3)) + ',' +
                  str(np.round(argo_lat[0], 3)) + ']',
                  fontsize=16)
        plt.ylabel('Depth (m)', fontsize=14)
        plt.xlabel('Salinity', fontsize=14)
        plt.legend(loc='lower right', fontsize=14)

        file = folder_fig + 'ARGO_vs_GOFS_RTOFS_COP_salt_' + str(id)
        plt.savefig(file, bbox_inches='tight', pad_inches=0.1)
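        # Optional: close the figures once they are saved to keep matplotlib's
        # memory use bounded when looping over many floats, e.g.:
        # plt.close('all')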