Example #1
def dosta_ctdbp_datalogger(ds):
    """
    Takes data from DOSTAs connected to the CTDBP, recorded by the datalogger
    (used by some of the EA moorings), and cleans up the data set to make it
    more user-friendly. The primary task is renaming the alphabet soup
    parameter names and dropping some parameters that are of limited or no
    use/value.

    :param ds: initial dosta data set downloaded from OOI via the M2M system
    :return ds: cleaned up data set
    """
    # drop some of the variables:
    #   dcl_controller_timestamp == time, redundant so can remove
    #   date_time_string == internal_timestamp, redundant so can remove
    ds = ds.drop(['dcl_controller_timestamp', 'date_time_string'])

    # convert the time values from a datetime64[ns] object to a floating point number with the time in seconds
    ds['internal_timestamp'] = ('time', dt64_epoch(ds.internal_timestamp))
    ds['internal_timestamp'].attrs = {
        'long_name': 'Internal CTD Clock Time',
        'standard_name': 'time',
        'units': 'seconds since 1970-01-01 00:00:00 0:00',
        'calendar': 'gregorian',
        'comment': ('Comparing the instrument internal clock versus the GPS referenced sampling time will allow for '
                    'calculations of the instrument clock offset and drift. Useful when working with the '
                    'recovered instrument data where no external GPS referenced clock is available.')
    }

    # rename some of the variables for better clarity
    rename = {
        'dosta_ln_optode_oxygen': 'oxygen_concentration',
        'dosta_ln_optode_oxygen_qc_executed': 'oxygen_concentration_qc_executed',
        'dosta_ln_optode_oxygen_qc_results': 'oxygen_concentration_qc_results',
        'dissolved_oxygen': 'oxygen_concentration_corrected',
        'dissolved_oxygen_qc_executed': 'oxygen_concentration_corrected_qc_executed',
        'dissolved_oxygen_qc_results': 'oxygen_concentration_corrected_qc_results',
        'int_ctd_pressure': 'seawater_pressure',
        'temp': 'seawater_temperature',
    }
    ds = ds.rename(rename)

    # reset variable attributes using dictionary above
    for v in ds.variables:
        if v in ATTRS:
            ds[v].attrs = ATTRS[v]

    # add original OOINet variable name as an attribute if renamed
    for key, value in rename.items():
        ds[value].attrs['ooinet_variable_name'] = key

    return ds
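A minimal usage sketch for the function above (the file name is a hypothetical placeholder for a local copy of an M2M download; `dosta_ctdbp_datalogger` and its helpers, e.g. `dt64_epoch` and `ATTRS`, are assumed to live in the same module):

import xarray as xr

# hypothetical local copy of a DOSTA/CTDBP data set downloaded from OOI via M2M
dosta = xr.open_dataset('dosta_ctdbp_example.nc')
dosta = dosta_ctdbp_datalogger(dosta)  # rename the variables and drop the redundant ones
print(dosta['oxygen_concentration'].attrs['ooinet_variable_name'])  # 'dosta_ln_optode_oxygen'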
Example #2
def combine_datasets(tdata, rhdata, ridata, resample_time):
    """
    Load and merge data from telemetered, recovered host and recovered
    instrument data sets. Telemetered and recovered host data represent the
    same source of data, just different data delivery methods. These data files
    are concatenated together and only unique time records are kept. The
    recovered instrument data is concatenated onto the telemetered/recovered
    host data set and then the full data set is resampled to a common time
    record via median averaging. The resulting merged and resampled data set is
    returned for further analysis.

    :param tdata: telemetered data as an xarray data set or None if no data available
    :param rhdata: recovered host data as an xarray data set or None if no data available
    :param ridata: recovered instrument data as an xarray data set or None if no data available
    :param resample_time: The resampling time period in minutes
    :return ds: The combined and resampled data set
    """
    # combine the telemetered and recovered host datasets, which have the same variables
    if tdata and rhdata:
        # use concat to join the datasets and then select only unique time points
        ds = xr.concat([tdata, rhdata], 'time')
        _, index = np.unique(ds['time'], return_index=True)
        ds = ds.isel(time=index)
    elif tdata and not rhdata:
        # telemetered data, but no recovered host data
        ds = tdata
    elif rhdata and not tdata:
        # recovered host data, but no telemetered data
        ds = rhdata
    else:
        # no telemetered or recovered host data
        ds = None

    # combine the recovered instrument data with the telemetered/recovered host data, if both exist
    if ds and ridata:
        # first, identify any variables in ds that are not available in ridata
        for v in ds.variables:
            if v not in ridata.variables:
                # add an empty variable of the same type and dimensions to ridata
                if len(ds[v].dims) == 1:
                    ridata[v] = ds[v].broadcast_like(ridata['station'])
                else:
                    ridata[v] = ds[v].broadcast_like(ridata['time'])

        # next, identify any variables in ridata that are not available in ds
        for v in ridata.variables:
            if v not in ds.variables:
                # add an empty variable of the same type and dimensions to ds
                if len(ridata[v].dims) == 1:
                    ds[v] = ridata[v].broadcast_like(ridata['station'])
                else:
                    ds[v] = ridata[v].broadcast_like(ridata['time'])

        # finally, concat the datasets and remove any duplicate timestamps
        ds = xr.concat([ds, ridata], 'time')
    elif ds and not ridata:
        pass
    elif ridata and not ds:
        # no telemetered/recovered host data, just the recovered instrument data.
        ds = ridata
    else:
        return None

    # resample the dataset onto a common time record
    itime = '{:d}Min'.format(resample_time)
    gtime = '{:d}Min'.format(resample_time * 3)
    ds = ds.sortby('time')
    avg = ds.resample(time=itime, keep_attrs=True).median()
    avg = avg.interpolate_na(dim='time', max_gap=gtime)

    # reset the time record to seconds since 1970
    avg['time'] = dt64_epoch(avg.time)

    # add the attributes back into the data set
    avg.attrs = ds.attrs
    for v in avg.variables:
        if v != 'time':
            avg[v] = avg[v].astype(ds[v].dtype)
            avg[v].attrs = ds[v].attrs

    avg.time.attrs['long_name'] = 'Time'
    avg.time.attrs['standard_name'] = 'time'
    avg.time.attrs['axis'] = 'T'
    avg.time.attrs['units'] = 'seconds since 1970-01-01 00:00:00 0:00'
    avg.time.attrs['calendar'] = 'gregorian'

    return avg
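A short sketch of how the three delivery methods might be merged onto a common 30 minute time record (the file names are hypothetical placeholders; any of the three inputs may be None):

import xarray as xr

tdata = xr.open_dataset('dosta_telemetered.nc')      # hypothetical telemetered file
rhdata = xr.open_dataset('dosta_recovered_host.nc')  # hypothetical recovered host file
ridata = None                                        # no recovered instrument data available
merged = combine_datasets(tdata, rhdata, ridata, 30)  # median averaged into 30 minute bins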
Example #3
def phsen_streamed(ds):
    """
    Takes PHSEN data streamed from instruments deployed by the Regional Cabled
    Array and cleans up the data set to make it more user-friendly. Primary
    task is renaming parameters and dropping some that are of limited use.
    Additionally, re-organize some of the variables to permit better assessments
    of the data.

    :param ds: initial PHSEN data set recorded by the data logger system and
        downloaded from OOI via the M2M system
    :return: cleaned up and reorganized data set
    """
    # drop some of the variables:
    #   checksum == not used
    #   record_type == not used
    #   record_length == not used
    #   signal_intensity_434, part of the light measurements array, redundant so can remove
    #   signal_intensity_578, part of the light measurements array, redundant so can remove
    ds = ds.reset_coords()
    ds = ds.drop([
        'checksum', 'record_type', 'record_length', 'signal_intensity_434',
        'signal_intensity_578'
    ])

    # convert the internal_timestamp values from a datetime64[ns] object to a floating point number with the time in
    # seconds, replacing the internal_timestamp with the record_time (the internal_timestamp is incorrectly set in the
    # NetCDF file).
    ds['internal_timestamp'] = ('time', dt64_epoch(ds.record_time))
    ds['internal_timestamp'].attrs = {
        'long_name': 'Internal SAMI-pH Clock Time',
        'standard_name': 'time',
        'units': 'seconds since 1970-01-01 00:00:00 0:00',
        'calendar': 'gregorian',
        'comment': ('Comparing the instrument internal clock versus the GPS referenced sampling time will allow for '
                    'calculations of the instrument clock offset and drift. Useful when working with the '
                    'recovered instrument data where no external GPS referenced clock is available.')
    }
    ds = ds.drop(['record_time'])

    # rename some of the variables for better clarity
    rename = {
        'voltage_battery': 'raw_battery_voltage',
        'thermistor_start': 'raw_thermistor_start',
        'thermistor_end': 'raw_thermistor_end',
        'phsen_thermistor_temperature': 'thermistor_temperature',
        'phsen_battery_volts': 'battery_voltage',
        'ph_seawater': 'seawater_ph',
        'ph_seawater_qc_executed': 'seawater_ph_qc_executed',
        'ph_seawater_qc_results': 'seawater_ph_qc_results'
    }
    ds = ds.rename(rename)

    # now we need to reset the light and reference arrays to named variables that will be more meaningful and useful in
    # the final data files
    nrec = len(ds['time'].values)
    light = np.array(np.vstack(ds['ph_light_measurements'].values), dtype='int32')
    light = np.atleast_3d(light)
    light = np.reshape(light, (nrec, 23, 4))  # 4 sets of 23 seawater measurements
    reference_434 = light[:, :, 0]  # reference signal, 434 nm
    signal_434 = light[:, :, 1]  # signal intensity, 434 nm (PH434SI_L0)
    reference_578 = light[:, :, 2]  # reference signal, 578 nm
    signal_578 = light[:, :, 3]  # signal intensity, 578 nm (PH578SI_L0)

    refnc = np.array(np.vstack(ds['reference_light_measurements'].values), dtype='int32')
    refnc = np.atleast_3d(refnc)
    refnc = np.reshape(refnc, (nrec, 4, 4))  # 4 sets of 4 DI water measurements (blanks)
    blank_refrnc_434 = refnc[:, :, 0]  # DI blank reference, 434 nm
    blank_signal_434 = refnc[:, :, 1]  # DI blank signal, 434 nm
    blank_refrnc_578 = refnc[:, :, 2]  # DI blank reference, 578 nm
    blank_signal_578 = refnc[:, :, 3]  # DI blank signal, 578 nm

    # create a data set with the reference and light measurements
    ph = xr.Dataset({
        'blank_refrnc_434': (['time', 'blanks'], blank_refrnc_434.astype('int32')),
        'blank_signal_434': (['time', 'blanks'], blank_signal_434.astype('int32')),
        'blank_refrnc_578': (['time', 'blanks'], blank_refrnc_578.astype('int32')),
        'blank_signal_578': (['time', 'blanks'], blank_signal_578.astype('int32')),
        'reference_434': (['time', 'measurements'], reference_434.astype('int32')),
        'signal_434': (['time', 'measurements'], signal_434.astype('int32')),
        'reference_578': (['time', 'measurements'], reference_578.astype('int32')),
        'signal_578': (['time', 'measurements'], signal_578.astype('int32'))
    }, coords={
        'time': ds['time'],
        'measurements': np.arange(0, 23).astype('int32'),
        'blanks': np.arange(0, 4).astype('int32')
    })
    ds = ds.drop([
        'ph_light_measurements', 'reference_light_measurements',
        'ph_light_measurements_dim_0', 'reference_light_measurements_dim_0'
    ])

    # merge the data sets back together
    ds = ds.merge(ph)

    # reset some of the variable attributes, and ...
    for v in ds.variables:  # variable level attributes
        if v in PHSEN:
            ds[v].attrs = PHSEN[v]

    # ... add the renamed information
    for key, value in rename.items():
        ds[value].attrs['ooinet_variable_name'] = key

    # and reset some of the data types
    data_types = [
        'deployment', 'raw_thermistor_end', 'raw_thermistor_start',
        'unique_id', 'raw_battery_voltage'
    ]
    for v in data_types:
        ds[v] = ds[v].astype('int32')

    return ds
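A small synthetic check of the light-array indexing used in the pH functions (all values are fabricated; it only demonstrates that reshaping the 92 values per record to (nrec, 23, 4) makes column 0 the 434 nm reference, column 1 the 434 nm signal, and so on):

import numpy as np

nrec = 2
light = np.arange(nrec * 92, dtype='int32').reshape(nrec, 23, 4)  # two fake records of 92 values each
reference_434 = light[:, :, 0]  # every 4th value, starting at offset 0
signal_434 = light[:, :, 1]     # every 4th value, starting at offset 1
print(reference_434.shape)      # (2, 23) -> 23 measurements per record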
def phsen_datalogger(ds):
    """
    Takes PHSEN data recorded by the data loggers used in the CGSN/EA moorings
    and cleans up the data set to make it more user-friendly. Primary task is
    renaming parameters and dropping some that are of limited use. Additionally,
    re-organize some of the variables to permit better assessments of the data.

    :param ds: initial PHSEN data set recorded by the data logger system and
        downloaded from OOI via the M2M system
    :return: cleaned up and reorganized data set
    """
    # drop some of the variables:
    #   passed_checksum == not used
    #   record_type == not used
    #   record_time == not used
    #   dcl_controller_timestamp == time, redundant so can remove
    #   phsen_abcdef_signal_intensity_434, part of the light measurements array, redundant so can remove
    #   phsen_abcdef_signal_intensity_578, part of the light measurements array, redundant so can remove
    ds = ds.reset_coords()
    ds = ds.drop([
        'passed_checksum', 'record_type', 'record_time',
        'phsen_abcdef_signal_intensity_434',
        'phsen_abcdef_signal_intensity_578', 'dcl_controller_timestamp'
    ])

    # convert the time values from a datetime64[ns] object to a floating point number with the time in seconds
    ds['internal_timestamp'] = ('time', dt64_epoch(ds.internal_timestamp))
    ds['internal_timestamp'].attrs = {
        'long_name': 'Internal SAMI-pH Clock Time',
        'standard_name': 'time',
        'units': 'seconds since 1970-01-01 00:00:00 0:00',
        'calendar': 'gregorian',
        'comment': ('Comparing the instrument internal clock versus the GPS referenced sampling time will allow for '
                    'calculations of the instrument clock offset and drift. Useful when working with the '
                    'recovered instrument data where no external GPS referenced clock is available.')
    }

    # rename some of the variables for better clarity
    rename = {
        'voltage_battery': 'raw_battery_voltage',
        'thermistor_start': 'raw_thermistor_start',
        'thermistor_end': 'raw_thermistor_end',
        'phsen_thermistor_temperature': 'thermistor_temperature',
        'phsen_abcdef_ph_seawater': 'seawater_ph',
        'phsen_abcdef_ph_seawater_qc_executed': 'seawater_ph_qc_executed',
        'phsen_abcdef_ph_seawater_qc_results': 'seawater_ph_qc_results'
    }
    ds = ds.rename(rename)

    # now we need to reset the light and reference arrays to named variables that will be more meaningful and useful in
    # the final data files
    nrec = len(ds['time'].values)
    light = np.array(np.vstack(ds['light_measurements'].values), dtype='int32')
    light = np.atleast_3d(light)
    light = np.reshape(light, (nrec, 23, 4))  # 4 sets of 23 seawater measurements
    reference_434 = light[:, :, 0]  # reference signal, 434 nm
    signal_434 = light[:, :, 1]  # signal intensity, 434 nm (PH434SI_L0)
    reference_578 = light[:, :, 2]  # reference signal, 578 nm
    signal_578 = light[:, :, 3]  # signal intensity, 578 nm (PH578SI_L0)

    refnc = np.array(np.vstack(ds['reference_light_measurements'].values), dtype='int32')
    refnc = np.atleast_3d(refnc)
    refnc = np.reshape(refnc, (nrec, 4, 4))  # 4 sets of 4 DI water measurements (blanks)
    blank_refrnc_434 = refnc[:, :, 0]  # DI blank reference, 434 nm
    blank_signal_434 = refnc[:, :, 1]  # DI blank signal, 434 nm
    blank_refrnc_578 = refnc[:, :, 2]  # DI blank reference, 578 nm
    blank_signal_578 = refnc[:, :, 3]  # DI blank signal, 578 nm

    # create a data set with the reference and light measurements
    ph = xr.Dataset({
        'blank_refrnc_434': (['time', 'blanks'], blank_refrnc_434.astype('int32')),
        'blank_signal_434': (['time', 'blanks'], blank_signal_434.astype('int32')),
        'blank_refrnc_578': (['time', 'blanks'], blank_refrnc_578.astype('int32')),
        'blank_signal_578': (['time', 'blanks'], blank_signal_578.astype('int32')),
        'reference_434': (['time', 'measurements'], reference_434.astype('int32')),
        'signal_434': (['time', 'measurements'], signal_434.astype('int32')),
        'reference_578': (['time', 'measurements'], reference_578.astype('int32')),
        'signal_578': (['time', 'measurements'], signal_578.astype('int32'))
    }, coords={
        'time': ds['time'],
        'measurements': np.arange(0, 23).astype('int32'),
        'blanks': np.arange(0, 4).astype('int32')
    })
    ds = ds.drop([
        'reference_light_measurements_dim_0', 'spectrum', 'light_measurements',
        'reference_light_measurements'
    ])

    # these two dimensional variables may or may not be present depending on how the data was requested.
    # remove them if they do exist so we can merge different data sets together
    maybe = [
        'phsen_abcdef_signal_intensity_434_dim_0',
        'phsen_abcdef_signal_intensity_578_dim_0'
    ]
    for k in maybe:
        if k in ds.dims:
            ds = ds.drop(k)

    # merge the data sets back together
    ds = ds.merge(ph)

    # test data quality
    ds['seawater_ph_quality_flag'] = quality_checks(ds)

    # reset some attributes
    for key, value in ATTRS.items():
        for atk, atv in value.items():
            if key in ds.variables:
                ds[key].attrs[atk] = atv

    # add the original variable name as an attribute, if renamed
    for key, value in rename.items():
        ds[value].attrs['ooinet_variable_name'] = key

    # and reset some of the data types
    data_types = [
        'deployment', 'raw_thermistor_end', 'raw_thermistor_start',
        'unique_id', 'raw_battery_voltage'
    ]
    for v in data_types:
        ds[v] = ds[v].astype('int32')

    return ds
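A hedged usage sketch (the file name is a hypothetical placeholder; `quality_checks`, `ATTRS`, and `dt64_epoch` are assumed to come from the same module as the function):

import xarray as xr

phsen = xr.open_dataset('phsen_datalogger_example.nc')  # hypothetical M2M download
phsen = phsen_datalogger(phsen)
print(phsen['seawater_ph_quality_flag'].values[:5])  # flags assigned by quality_checks()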
def pco2a_datalogger(ds, burst=False):
    """
    Takes pco2a data recorded by the data loggers used in the CGSN/EA moorings
    and cleans up the data set to make it more user-friendly. Primary task is
    renaming parameters and dropping some that are of limited use. Additionally,
    re-organize some of the variables to permit better assessments of the data.

    :param ds: initial pco2a data set for the air measurements downloaded from
        OOI via the M2M system
    :param burst: resample the data to an hourly, burst averaged time interval
    :return ds: cleaned up data set
    """
    # drop some of the variables:
    #   ### OOI generated parameters
    #   date_time_string == internal_timestamp, redundant so can remove
    #   dcl_controller_timestamp == time, redundant so can remove
    #   supply_voltage == not used
    #   ### Data products from upstream processing used to calculate the normalized 10 m wind, but are not needed
    #   eastward_velocity
    #   northward_velocity
    #   air_temperature
    #   met_relwind_speed
    #   longwave_irradiance
    #   shortwave_irradiance
    #   relative_humidity
    #   barometric_pressure
    #   precipitation
    shared = ['date_time_string', 'dcl_controller_timestamp']
    if 'supply_voltage' in ds.variables:
        # supply_voltage is present in the telemetered stream, but not in the recovered_host stream
        shared.append('supply_voltage')

    ds = ds.drop(shared)

    # determine if the upstream parameters are present. delete them if needed, otherwise add the required ones to make
    # sure the NetCDF files are consistent
    upstream = [
        'eastward_velocity', 'northward_velocity', 'air_temperature',
        'met_relwind_speed', 'longwave_irradiance', 'shortwave_irradiance',
        'relative_humidity', 'barometric_pressure', 'precipitation'
    ]
    if 'eastward_velocity' in ds.variables:
        ds = ds.drop(upstream)
    else:
        # METBK data was missing, add variables below to keep data sets consistent
        ds['sea_surface_temperature'] = ('time', ds['deployment'] * np.nan)
        ds['sea_surface_temperature'].attrs = {
            'long_name': 'Sea Surface Temperature',
            'standard_name': 'sea_surface_temperature',
            'comment': ('Normally this would be sea surface temperature data from a co-located CTD. However, data '
                        'from that sensor is unavailable. This value has been filled with NaNs to preserve the '
                        'structure of the data set.'),
            'units': 'degree_Celsius',
            'data_product_identifier': 'TEMPSRF_L1',
            'instrument': (ds.attrs['subsite'] + '-SBD11-06-METBKA000'),
            'stream': 'metbk_a_dcl_instrument'
        }

        ds['met_salsurf'] = ('time', ds['deployment'] * np.nan)
        ds['met_salsurf'].attrs = {
            'long_name': 'Sea Surface Practical Salinity',
            'standard_name': 'sea_surface_salinity',
            'units': '1e-3',
            'comment': ('Normally this would be sea surface salinity data from a co-located CTD. However, data from '
                        'that sensor is unavailable. This value has been filled with NaNs to preserve the structure '
                        'of the data set.'),
            'data_product_identifier': 'SALSURF_L2',
            'instrument': (ds.attrs['subsite'] + '-SBD11-06-METBKA000'),
            'stream': 'metbk_a_dcl_instrument'
        }

        ds['met_wind10m'] = ('time', ds['deployment'] * np.nan)
        ds['met_wind10m'].attrs = {
            'long_name': 'Normalized Wind Speed at 10 m',
            'standard_name': 'wind_speed',
            'units': 'm s-1',
            'comment': ('Normally this would be the modelled wind speed at a reference height of 10 m from a '
                        'co-located wind sensor. However, data from that sensor is unavailable. This value has been '
                        'filled with NaNs to preserve the structure of the data set.'),
            'data_product_identifier': 'WIND10M_L2',
            'instrument': (ds.attrs['subsite'] + '-SBD11-06-METBKA000'),
            'stream': 'metbk_hourly'
        }

    # drop the two QC tests applied to the L0 values (these tests should not have been applied to L0 data)
    if re.match(r'.*_air.*', ds.attrs['stream']):
        # air stream
        ds = ds.drop(['measured_air_co2_qc_executed', 'measured_air_co2_qc_results'])
    else:
        # water stream
        ds = ds.drop(['measured_water_co2_qc_executed', 'measured_water_co2_qc_results'])

    # convert the internal timestamp values from a datetime64[ns] object to a floating point number, time in seconds
    ds['internal_timestamp'] = ('time', dt64_epoch(ds.internal_timestamp))
    ds['internal_timestamp'].attrs = {
        'long_name': 'Internal pCO2-Pro Clock Time',
        'standard_name': 'time',
        'units': 'seconds since 1970-01-01 00:00:00 0:00',
        'calendar': 'gregorian',
        'comment': ('Comparing the instrument internal clock versus the GPS referenced sampling time will allow for '
                    'calculations of the instrument clock offset and drift.')
    }

    # rename variables to get a cleaner set of variables and attributes
    rename = {
        'met_salsurf': 'sea_surface_salinity',
        'met_wind10m': 'normalized_10m_wind',
        'pco2_co2flux': 'sea_air_co2_flux',
        'pco2_co2flux_qc_executed': 'sea_air_co2_flux_qc_executed',
        'pco2_co2flux_qc_results': 'sea_air_co2_flux_qc_results'
    }
    ds = ds.rename(rename)
    for key, value in rename.items():  # bulk attribute update...
        ds[value].attrs['ooinet_variable_name'] = key

    ds['sea_air_co2_flux'].attrs['ancillary_variables'] = (
        'partial_pressure_co2_atm partial_pressure_co2_ssw '
        'sea_surface_temperature sea_surface_salinity '
        'normalized_10m_wind sea_air_co2_flux_qc_executed '
        'sea_air_co2_flux_qc_results')

    # reset incorrectly formatted temperature units
    temp_vars = [
        'sea_surface_temperature', 'avg_irga_temperature',
        'humidity_temperature', 'irga_detector_temperature',
        'irga_source_temperature'
    ]
    for var in temp_vars:
        ds[var].attrs['units'] = 'degree_Celsius'

    # reset incorrectly set attributes for salinity and wind speed
    ds['sea_surface_salinity'].attrs['standard_name'] = 'sea_surface_salinity'
    ds['sea_surface_salinity'].attrs['long_name'] = 'Sea Surface Practical Salinity'
    ds['sea_surface_salinity'].attrs['units'] = '1e-3'
    ds['normalized_10m_wind'].attrs['standard_name'] = 'wind_speed'
    ds['normalized_10m_wind'].attrs['long_name'] = 'Normalized Wind Speed at 10 m'

    if burst:  # re-sample the data to a defined time interval using a median average
        burst = ds.copy()  # make a copy of the original dataset
        burst['time'] = burst['time'].dt.round('H')  # reset the time values to the nearest hour
        burst = burst.resample(time='1H', keep_attrs=True, skipna=True).median()  # median average the hourly bursts
        burst = burst.where(~np.isnan(burst.deployment), drop=True)

        # reset the attributes, which keep_attrs should have preserved
        burst.attrs = ds.attrs
        for v in burst.variables:
            burst[v].attrs = ds[v].attrs

        # save the newly averaged data
        ds = burst

    return ds
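A usage sketch for the burst option (the file name is a hypothetical placeholder; with burst=True the returned data set is hourly, median averaged):

import xarray as xr

pco2a = xr.open_dataset('pco2a_air_example.nc')  # hypothetical M2M download of the air stream
hourly = pco2a_datalogger(pco2a, burst=True)     # median average the hourly bursts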
def ctdbp_datalogger(ds, burst=False):
    """
    Takes ctdbp data recorded by the data loggers used in the CGSN/EA moorings
    and cleans up the data set to make it more user-friendly. Primary task is
    renaming the alphabet soup parameter names and dropping some parameters
    that are of no use/value.

    :param ds: initial ctdbp data set downloaded from OOI via the M2M system
    :param burst: resample the data to the defined time interval
    :return ds: cleaned up data set
    """
    # drop some of the variables:
    #   dcl_controller_timestamp == time, redundant so can remove
    #   date_time_string == internal_timestamp, redundant so can remove
    ds = ds.reset_coords()
    ds = ds.drop(['dcl_controller_timestamp', 'date_time_string'])

    # convert the time values from a datetime64[ns] object to a floating point number with the time in seconds
    ds['internal_timestamp'] = ('time', dt64_epoch(ds.internal_timestamp))
    ds['internal_timestamp'].attrs = {
        'long_name': 'Internal CTD Clock Time',
        'standard_name': 'time',
        'units': 'seconds since 1970-01-01 00:00:00 0:00',
        'calendar': 'gregorian',
        'comment': ('Comparing the instrument internal clock versus the GPS referenced sampling time will allow for '
                    'calculations of the instrument clock offset and drift. Useful when working with the '
                    'recovered instrument data where no external GPS referenced clock is available.')
    }

    # rename some of the variables for better clarity
    rename = {
        'temp': 'temperature',
        'temp_qc_executed': 'temperature_qc_executed',
        'temp_qc_results': 'temperature_qc_results'
    }
    ds = ds.rename(rename)
    for key, value in rename.items():
        ds[value].attrs['ooinet_variable_name'] = key

    # correct incorrect units
    ds['temperature'].attrs['units'] = 'degree_Celsius'

    # ancillary_variables attribute set incorrectly (should be a space separated list) for certain variables
    ds['temperature'].attrs['ancillary_variables'] = 'temperature_qc_executed temperature_qc_results'
    ds['conductivity'].attrs['ancillary_variables'] = 'conductivity_qc_executed conductivity_qc_results'
    ds['pressure'].attrs['ancillary_variables'] = 'pressure_qc_executed pressure_qc_results'
    ds['practical_salinity'].attrs['ancillary_variables'] = ('conductivity temperature pressure '
                                                             'practical_salinity_qc_executed '
                                                             'practical_salinity_qc_results')
    ds['density'].attrs['ancillary_variables'] = ('conductivity temperature pressure lat lon '
                                                  'density_qc_executed density_qc_results')

    if burst:  # re-sample the data to a defined time interval using a median average
        # create the burst averaging
        burst = ds.copy()  # make a copy of the original dataset
        burst['time'] = burst['time'] - np.timedelta64(450, 's')  # center the time windows for the 15 minute bursts
        burst = burst.resample(time='15Min', keep_attrs=True, skipna=True).median()
        burst = burst.where(~np.isnan(burst.deployment), drop=True)

        # reset the attributes, which keep_attrs should have preserved
        burst.attrs = ds.attrs
        for v in burst.variables:
            burst[v].attrs = ds[v].attrs

        # save the newly averaged data
        ds = burst

    return ds
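And a matching sketch for the CTDBP (file name hypothetical; burst=True yields 15 minute, median averaged records centered on the burst window):

import xarray as xr

ctdbp = xr.open_dataset('ctdbp_datalogger_example.nc')  # hypothetical M2M download
ctdbp = ctdbp_datalogger(ctdbp, burst=True)             # 15 minute median averaged bursts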
def pco2w_datalogger(ds):
    """
    Takes PCO2W data recorded by the data loggers used in the CGSN/EA moorings
    and cleans up the data set to make it more user-friendly. Primary task is
    renaming parameters and dropping some that are of limited use. Additionally,
    re-organize some of the variables to permit better assessments of the data.

    :param ds: initial PCO2W data set recorded by the data logger system and
        downloaded from OOI via the M2M system
    :return: cleaned up and reorganized data set
    """
    # drop some of the variables:
    #   passed_checksum == not used
    #   record_type == not used
    #   record_time == not used
    #   dcl_controller_timestamp == time, redundant so can remove
    #   absorbance_ratio_*_qc_results == incorrectly set tests, ignoring
    #   absorbance_ratio_*_qc_executed == incorrectly set tests, ignoring
    ds = ds.reset_coords()
    ds = ds.drop([
        'passed_checksum', 'record_type', 'record_time',
        'dcl_controller_timestamp', 'absorbance_ratio_434_qc_results',
        'absorbance_ratio_434_qc_executed', 'absorbance_ratio_620_qc_results',
        'absorbance_ratio_620_qc_executed'
    ])

    # convert the time values from a datetime64[ns] object to a floating point number with the time in seconds
    ds['internal_timestamp'] = ('time', dt64_epoch(ds.internal_timestamp))
    ds['internal_timestamp'].attrs = {
        'long_name': 'Internal SAMI-pCO2 Clock Time',
        'standard_name': 'time',
        'units': 'seconds since 1970-01-01 00:00:00 0:00',
        'calendar': 'gregorian',
        'comment': ('Comparing the instrument internal clock versus the GPS referenced sampling time will allow for '
                    'calculations of the instrument clock offset and drift. Useful when working with the '
                    'recovered instrument data where no external GPS referenced clock is available.')
    }

    # check for missing blank data, stripped from the record and treated as a co-located sensor.
    if 'absorbance_blank_434' not in ds.variables:
        ds['absorbance_blank_434'] = ('time', ds['deployment'] * 0 - 9999999)
        ds['absorbance_blank_620'] = ('time', ds['deployment'] * 0 - 9999999)

    # rename some of the variables for better clarity
    rename = {
        'voltage_battery': 'raw_battery_voltage',
        'thermistor_raw': 'raw_thermistor',
        'pco2w_thermistor_temperature': 'thermistor_temperature',
        'pco2w_thermistor_temperature_qc_executed': 'thermistor_temperature_qc_executed',
        'pco2w_thermistor_temperature_qc_results': 'thermistor_temperature_qc_results',
    }
    ds = ds.rename(rename)

    # now we need to reset the light array to named variables that will be more meaningful and useful in
    # the final data files
    light = ds.light_measurements.astype('int32')
    dark_reference = light[:, [0, 8]].values  # dark reference
    dark_signal = light[:, [1, 9]].values  # dark signal
    reference_434 = light[:, [2, 10]].values  # reference signal, 434 nm
    signal_434 = light[:, [3, 11]].values  # signal intensity, 434 nm
    reference_620 = light[:, [4, 12]].values  # reference signal, 620 nm
    signal_620 = light[:, [5, 13]].values  # signal intensity, 620 nm

    # create a data set with the duplicate measurements for each variable
    data = xr.Dataset(
        {
            'dark_reference': (['time', 'duplicates'], dark_reference),
            'dark_signal': (['time', 'duplicates'], dark_signal),
            'reference_434': (['time', 'duplicates'], reference_434),
            'signal_434': (['time', 'duplicates'], signal_434),
            'reference_620': (['time', 'duplicates'], reference_620),
            'signal_620': (['time', 'duplicates'], signal_620)
        },
        coords={
            'time': ds['time'],
            'duplicates': np.arange(0, 2).astype('int32')
        })
    ds = ds.drop(['spectrum', 'light_measurements'])

    # merge the data sets back together
    ds = ds.merge(data)

    # calculate the battery voltage
    ds['battery_voltage'] = ds['raw_battery_voltage'] * 15. / 4096.

    # reset some of the data types
    data_types = [
        'deployment', 'raw_thermistor', 'unique_id', 'raw_battery_voltage',
        'absorbance_blank_434', 'absorbance_blank_620', 'absorbance_ratio_434',
        'absorbance_ratio_620'
    ]
    for v in data_types:
        ds[v] = ds[v].astype('int32')

    data_types = ['thermistor_temperature', 'pco2_seawater']
    for v in data_types:
        ds[v] = ds[v].astype('float32')

    # test the data quality
    ds['pco2_seawater_quality_flag'] = quality_checks(ds)

    # reset some attributes
    for key, value in ATTRS.items():
        for atk, atv in value.items():
            if key in ds.variables:
                ds[key].attrs[atk] = atv

    # add the original variable name as an attribute, if renamed
    for key, value in rename.items():
        ds[value].attrs['ooinet_variable_name'] = key

    return ds
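Finally, a usage sketch for the PCO2W (file name hypothetical; battery_voltage is derived from raw_battery_voltage using the 15 V / 4096 count scaling shown above):

import xarray as xr

pco2w = xr.open_dataset('pco2w_datalogger_example.nc')  # hypothetical M2M download
pco2w = pco2w_datalogger(pco2w)
print(pco2w['battery_voltage'].values[:3])  # raw counts scaled by 15. / 4096.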