def despike(var, window_size, spike_method="median"): """ Return a smooth baseline of data and the anomalous spikes This script is copied from Nathan Briggs' MATLAB script as described in Briggs et al (2011). It returns the baseline of the data using a rolling window method (min-max or median) and the residuals of [measurements - baseline]. Parameters ---------- var: numpy.ndarray or pandas.Series Array of data variable for cleaning to be performed on. window_size: int the length of the rolling window spike_method: str A string with `minmax` or `median`. 'minmax' first applies a rolling minimum to the dataset thereafter a rolling maximum is applied. This forms the baseline, where the spikes are the difference from the baseline. 'median' first applies a rolling median to the dataset, which forms the baseline. The spikes are the difference between the measurements and the baseline, and can thus also be negative (unlike the min-max method). Returns ------- baseline: numpy.ndarray or pandas.Series The baseline from which outliers are determined. spikes: numpy.ndarray or pandas.Series Spikes are the residual of [measurements - baseline]. """ from numpy import array, isnan, nan, nanmax, nanmedian, nanmin, ndarray # convert to array arr = array(var) # create empty array for baseline baseline = ndarray(arr.shape) * nan # mask with existing nans masked out mask = ~isnan(arr) # if min-max method then get the rolling minimum and # then the rolling maximum if spike_method.startswith("min"): base_min = rolling_window(arr[mask], nanmin, window_size) base = rolling_window(base_min, nanmax, window_size) else: base = rolling_window(arr[mask], nanmedian, window_size) baseline[mask] = base spikes = arr - baseline baseline = transfer_nc_attrs(getframe(), var, baseline, "_baseline") spikes = transfer_nc_attrs(getframe(), var, spikes, "_spikes") return baseline, spikes
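# Illustrative sketch (not part of the library): the baseline/spike split that
# despike() implements can be reproduced for a quick check with a plain pandas
# rolling median. The window size (7) and the synthetic signal are arbitrary.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
signal = np.sin(np.linspace(0, 6, 200)) + rng.normal(0, 0.02, 200)
signal[50] += 1.5  # inject an artificial spike

baseline_sketch = pd.Series(signal).rolling(7, center=True, min_periods=1).median().to_numpy()
spikes_sketch = signal - baseline_sketch  # residuals; the injected spike stands out clearly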
def par_scaling(par_uV, scale_factor_wet_uEm2s, sensor_output_mV): """ Scaling correction for par with factory calibration coefficients. The function subtracts the dark sensor output from the raw sensor voltage and divides by the scale factor. The factory calibrations are unique for each deployment and should be taken from the calibration file for that deployment. Parameters ---------- par_uV: numpy.ndarray or pandas.Series The raw par data with units uV. scale_factor_wet_uEm2s: float The scale factor from the factory calibration file in units uE/m2/sec. sensor_output_mV: float The sensor output in the dark from the factory calibration file in units mV. Returns ------- par_uEm2s: numpy.ndarray or pandas.Series The par data corrected for the sensor output and scale factor from the factory calibration file in units uE/m2/sec. """ sensor_output_uV = sensor_output_mV / 1000.0 par_uEm2s = (par_uV - sensor_output_uV) / scale_factor_wet_uEm2s par_uEm2s = transfer_nc_attrs(getframe(), par_uV, par_uEm2s, 'par_uEm2s') return par_uEm2s
def potential_density(salt_PSU, temp_C, pres_db, lat, lon, pres_ref=0): """ Calculate density from glider measurements of salinity and temperature. The Basestation calculates density from absolute salinity and potential temperature. This function is a wrapper for this functionality, where potential temperature and absolute salinity are calculated first. Note that a reference pressure of 0 is used by default. Parameters ---------- salt_PSU : array, dtype=float, shape=[n, ] practical salinity temp_C : array, dtype=float, shape=[n, ] temperature in deg C pres_db : array, dtype=float, shape=[n, ] pressure in decibar lat : array, dtype=float, shape=[n, ] latitude in degrees north lon : array, dtype=float, shape=[n, ] longitude in degrees east pres_ref : float, default=0 reference pressure in decibar for the potential density Returns ------- potential_density : array, dtype=float, shape=[n, ] Note ---- Using seawater.dens does not yield the same results as this function. We get very close results to what the SeaGlider Basestation returns with this function. The difference of this function with the basestation is on average ~ 0.003 kg/m3 """ try: import gsw salt_abs = gsw.SA_from_SP(salt_PSU, pres_db, lon, lat) temp_pot = gsw.t_from_CT(salt_abs, temp_C, pres_db) pot_dens = gsw.pot_rho_t_exact(salt_abs, temp_pot, pres_db, pres_ref) except ImportError: import seawater as sw pot_dens = sw.pden(salt_PSU, temp_C, pres_db, pres_ref) pot_dens = transfer_nc_attrs( getframe(), temp_C, pot_dens, 'potential_density', units='kg/m3', comment='', standard_name='potential_density', ) return pot_dens
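# Minimal usage sketch of the TEOS-10 calls wrapped above (requires the gsw
# package; the salinity/temperature/pressure/position values are made up).
import gsw

salt_abs = gsw.SA_from_SP(35.1, 250.0, 8.0, -43.0)       # practical -> absolute salinity
rho_pot = gsw.pot_rho_t_exact(salt_abs, 12.5, 250.0, 0)  # potential density, 0 dbar reference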
def brunt_vaisala(salt, temp, pres, lat=None): r""" Calculate the square of the buoyancy frequency. This is a copy from GSW package, with the exception that the array maintains the same shape as the input. Note that it only works on ungridded data at the moment. .. math:: N^{2} = \frac{-g}{\sigma_{\theta}} \frac{d\sigma_{\theta}}{dz} Parameters ---------- salt : array-like Absolute Salinity, g/kg temp : array-like Conservative Temperature (ITS-90), degrees C pres : array-like Sea pressure (absolute pressure minus 10.1325 dbar), dbar lat : array-like, 1-D, optional Latitude, degrees. Returns ------- N2 : array Buoyancy frequency-squared at pressure midpoints, 1/s^2. The output is padded with a trailing NaN so that it has the same shape as the inputs. """ from gsw import Nsquared from numpy import nan, r_ def pad_nan(a): return r_[a, nan] n2 = pad_nan(Nsquared(salt, temp, pres, lat=lat)[0]) n2 = transfer_nc_attrs( getframe(), temp, n2, 'N_squared', units='1/s2', comment='', standard_name='brunt_vaisala_freq', ) return n2
def find_signature(sign): path = Path(inspect.getfile(inspect.currentframe())).parent / 'signatures.json' with path.open('r') as data_file: data = json.load(data_file) list_name = [name for name, hexa in data.items() if hexa == sign] if len(list_name) > 1: logging.warning('function signatures collision: %s', list_name) return '_or_'.join(list_name) elif list_name: return list_name[0] else: return None
def mask_bad_dive_fraction(mask, dives, var, mask_frac=0.2): """ Find bad dives - where more than a fraction of the dive is masked Parameters ---------- mask : array, dtype=bool, shape=[n, ] boolean 1D array with masked values dives : array, dtype=float, shape=[n, ] discrete dive numbers (down = d.0; up = d.5) var : array, dtype=float, shape=[n, ] series or array containing data that will be masked with NaNs mask_frac : float=0.2 fraction of the dive that is masked for the whole dive to be bad Returns ------- var : array, dtype=float, shape=[n, ] the same as the input, but has been masked mask_dives : array, dtype=bool a mask array that has full dives that are deemed "bad" masked out """ from numpy import NaN, array from pandas import Series # catch dives where the majority of the data is masked # and return a fully masked dive dives = array(dives) arr = array(var) grp = Series(mask).groupby(dives) masked_frac_per_dive = grp.sum() / grp.count() > mask_frac majority_masked = masked_frac_per_dive[masked_frac_per_dive].index.values # create a mask that masks ungridded data mask_dives = mask.copy() for d in majority_masked: i = array(dives) == d mask_dives[i] = True arr[mask_dives] = NaN baddive = arr baddive = transfer_nc_attrs(getframe(), var, baddive, None) return baddive, mask_dives
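# Stand-alone sketch of the "mask the whole dive if too much of it is already
# masked" logic above, using only pandas groupby. Dive numbers and the 20 %
# threshold are arbitrary example values.
import numpy as np
import pandas as pd

dives = np.repeat([1.0, 1.5, 2.0], 100)
mask = np.zeros(dives.size, dtype=bool)
mask[:30] = True                                      # 30 % of dive 1.0 masked -> whole dive flagged

masked_frac = pd.Series(mask).groupby(dives).mean()   # fraction masked per dive
bad_dives = masked_frac.index[masked_frac > 0.2]
full_mask = np.isin(dives, bad_dives) | mask          # original mask plus fully flagged dives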
def spice0(salt_PSU, temp_C, pres_db, lat, lon): """ Calculate spiciness from glider measurements of salinity and temperature. Parameters ---------- salt_PSU : array, dtype=float, shape=[n, ] practical salinity temp_C : array, dtype=float, shape=[n, ] temperature in deg C pres_db : array, dtype=float, shape=[n, ] pressure in decibar lat : array, dtype=float, shape=[n, ] latitude in degrees north lon : array, dtype=float, shape=[n, ] longitude in degrees east Returns ------- spice0 : array, dtype=float, shape=[n, ] spiciness referenced to a pressure of 0 dbar Note ---- Spiciness is calculated with the TEOS-10 functions gsw.SA_from_SP, gsw.CT_from_t and gsw.spiciness0. """ import gsw salt_abs = gsw.SA_from_SP(salt_PSU, pres_db, lon, lat) cons_temp = gsw.CT_from_t(salt_abs, temp_C, pres_db) spice0 = gsw.spiciness0(salt_abs, cons_temp) spice0 = transfer_nc_attrs( getframe(), temp_C, spice0, "spiciness0", units=" ", comment="", standard_name="spiciness0", ) return spice0
def par_dark_count(par, dives, depth, time): """ Calculates an in situ dark count from the PAR sensor. The in situ dark count for the PAR sensor is calculated as the median of the deep night-time values: only samples taken between 23:00 and 01:59 and deeper than the 90th percentile of depth are used. Parameters ---------- par: numpy.ndarray or pandas.Series The par array after factory calibration in units uE/m2/sec. dives: numpy.ndarray or pandas.Series The dive count (round is down dives, 0.5 is up dives). depth: numpy.ndarray or pandas.Series The depth array in metres. time: numpy.ndarray or pandas.Series The date & time array in a numpy.datetime64 format. Returns ------- par_dark: numpy.ndarray or pandas.Series The par data corrected for the in situ dark value in units uE/m2/sec. """ from numpy import array, ma, nanmedian, isnan, nanpercentile par_arr = array(par) dives = array(dives) depth = array(depth) time = array(time) # DARK CORRECTION FOR PAR hrs = time.astype('datetime64[h]') - time.astype('datetime64[D]') xi = ma.masked_inside(hrs.astype(int), 22, 2) # keep only hours 23:00 to 01:59 yi = ma.masked_outside(depth, *nanpercentile(depth[~isnan(par)], [90, 100])) # keep only the deepest 10 % of observations i = ~(xi.mask | yi.mask) dark = nanmedian(par_arr[i]) par_dark = par_arr - dark par_dark[par_dark < 0] = 0 par_dark = transfer_nc_attrs(getframe(), par, par_dark, '_dark') return par_dark
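# Illustrative sketch (not the library code) of the selection performed above:
# keep only deep night-time samples and use their median as the in situ dark
# value. The synthetic time/depth/par arrays are arbitrary.
import numpy as np

time = np.arange(np.datetime64("2019-01-01"), np.datetime64("2019-01-03"), np.timedelta64(10, "m"))
depth = np.abs(np.sin(np.linspace(0, 20, time.size))) * 500
par = np.exp(-depth / 40) * 100 + 0.05

hour = (time.astype("datetime64[h]") - time.astype("datetime64[D]")).astype(int)
night = (hour >= 23) | (hour <= 1)           # 23:00 to 01:59
deep = depth > np.nanpercentile(depth, 90)   # deepest 10 % of samples
dark = np.nanmedian(par[night & deep])
par_dark_sketch = np.clip(par - dark, 0, None)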
def outlier_bounds_iqr(arr, multiplier=1.5): r""" Mask values outside the upper/lower outlier limits by interquartile range: .. math:: lim_{low} = Q_1 - 1.5\cdot(Q_3 - Q_1)\\ lim_{up} = Q_3 + 1.5\cdot(Q_3 - Q_1) The multiplier [1.5] can be adjusted by the user. Parameters ---------- arr : np.array|xr.DataArray, dtype=float, shape=[n, ] the full timeseries of the entire dataset multiplier : float=1.5 sets the interquartile range Returns ------- arr : array | xarray.DataArray A data object where values outside the limits are masked. Metadata will be preserved if the original input array is xr.DataArray """ from numpy import array, nan, nanpercentile var = arr.copy() arr = array(arr) q1, q3 = nanpercentile(arr, [25, 75]) iqr = q3 - q1 ll = q1 - iqr * multiplier ul = q3 + iqr * multiplier mask = (arr < ll) | (arr > ul) arr[mask] = nan attrs = dict(outlier_lims=[ll, ul]) out = transfer_nc_attrs(getframe(), var, arr, "_outlierIQR", **attrs) return out
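# Compact stand-alone version of the IQR limits used above (numpy only); the
# sample data are arbitrary.
import numpy as np

def iqr_outlier_mask(a, multiplier=1.5):
    q1, q3 = np.nanpercentile(a, [25, 75])
    iqr = q3 - q1
    return (a < q1 - multiplier * iqr) | (a > q3 + multiplier * iqr)

x = np.array([1.0, 1.1, 0.9, 1.05, 15.0])   # 15.0 is the obvious outlier
outliers = iqr_outlier_mask(x)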
def outlier_bounds_std(arr, multiplier=3): r""" Mask values outside the upper and lower outlier limits by standard deviation :math:`\mu \pm 3\sigma` The multiplier [3] can be adjusted by the user. Parameters ---------- arr : np.array|xr.DataArray, dtype=float, shape=[n, ] the full timeseries of the entire dataset multiplier : float=3 sets the standard deviation multiplier Returns ------- arr : array | xarray.DataArray A data object where values outside the limits are masked. Metadata will be preserved if the original input array is xr.DataArray """ from numpy import array, nan, nanmean, nanstd var = arr.copy() arr = array(arr) mean = nanmean(arr) std = nanstd(arr) ll = mean - std * multiplier ul = mean + std * multiplier mask = (arr < ll) | (arr > ul) arr[mask] = nan attrs = dict(outlier_lims=[ll, ul]) out = transfer_nc_attrs(getframe(), var, arr, "_outlierSTD", **attrs) return out
def predict(self, x): """ A wrapper around the normal predict function that takes nans into account. An extra dimension is also added if needed. """ from xarray import DataArray var = x.copy() x = _np.array(x) out = _np.ndarray(x.size) * _np.NaN i = ~_np.isnan(x) x = x[i].reshape(-1, 1) out[i.squeeze()] = self._predict(x).squeeze() out = transfer_nc_attrs(getframe(), var, out, "_calibrated") if hasattr(self, "info") & isinstance(out, DataArray): out.attrs["model_info"] = str(self.info) return out
def fluorescence_dark_count(flr, depth, percentile=5): """ Calculates an in situ dark count from the fluorescence sensor. The in situ dark count for the fluorescence sensor is calculated from the user-defined percentile between 300 and 400m. Parameters ---------- flr: numpy.ndarray or pandas.Series The fluorescence array after factory calibration. depth: numpy.ndarray or pandas.Series The depth array in metres. percentile: int=5 The percentile of the 300-400 m values used as the in situ dark count. Returns ------- flr: numpy.ndarray or pandas.Series The fluorescence data corrected for the in situ dark value. """ from numpy import array, isnan, nanpercentile import warnings mask = (depth > 300) & (depth < 400) flr_dark = array(flr) if (~isnan(flr_dark[mask])).sum() == 0: warnings.warn( "\nThere are no fluorescence measurements between " "300 and 400 metres.\nThe dark count correction " "cannot be made and fluorescence data can't be processed.", UserWarning, ) dark_pctl = nanpercentile(flr_dark[mask], percentile) flr_dark -= dark_pctl flr_dark[flr_dark < 0] = 0 flr_dark = transfer_nc_attrs(getframe(), flr, flr_dark, "_dark") return flr_dark
def backscatter_dark_count(bbp, depth, percentile=5): """ Calculates an in situ dark count from the backscatter sensor. The in situ dark count for the backscatter sensor is calculated from the user-defined percentile between 200 and 400m. Parameters ---------- bbp: numpy.ndarray or pandas.Series The total backscatter array after factory calibration in m-1. depth: numpy.ndarray or pandas.Series The depth array in metres. percentile: int=5 The percentile of the 200-400 m values used as the in situ dark count. Returns ------- bbp: numpy.ndarray or pandas.Series The total backscatter data corrected for the in situ dark value. """ from numpy import array, isnan, nanpercentile import warnings bbp_dark = array(bbp) mask = (depth > 200) & (depth < 400) if (~isnan(bbp[mask])).sum() == 0: warnings.warn( "There are no backscatter measurements between 200 " "and 400 metres. The dark count correction cannot be " "made and backscatter data can't be processed.", UserWarning, ) dark_pctl = nanpercentile(bbp_dark[mask], percentile) bbp_dark -= dark_pctl bbp_dark[bbp_dark < 0] = 0 bbp_dark = transfer_nc_attrs(getframe(), bbp, bbp_dark, "_dark") return bbp_dark
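# Generic sketch of the percentile dark-count correction shared by the
# fluorescence and backscatter functions above; the depth window and the
# percentile are the tunable parameters.
import numpy as np

def dark_count_sketch(values, depth, zmin=200, zmax=400, percentile=5):
    window = (depth > zmin) & (depth < zmax)
    dark = np.nanpercentile(values[window], percentile)   # in situ dark value
    return np.clip(values - dark, 0, None)                # no negative output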
def rolling_window(var, func, window): """ A rolling window function that is nan-resilient Parameters ---------- var : array, dtype=float, shape=[n, ] array that you want to pass the rolling window over func : callable an aggregating function. e.g. mean, std, median window : int the size of the rolling window that will be applied Returns ------- arr : array, dtype=float, shape=[n, ] the same as the input array, but the rolling window has been applied """ from numpy import array, nan, ndarray, r_ n = window # create an empty 2D array with shape (window, len(arr)) arr = array(var) mat = ndarray([n, len(arr) - n]) * nan # create a vector for each window for i in range(n): mat[i, :] = arr[i:i - n] # get the mean or median or any other function of the matrix out = func(mat, axis=0) # the array will be shorter than the original # pad the output with the rolling average of the values left out i0 = n // 2 i1 = n - i0 seg0 = array([func(arr[:i + 1]) for i in range(i0)]) seg1 = array([func(arr[-i - 1:]) for i in range(i1)]) rolwin = r_[seg0, out, seg1] rolwin = transfer_nc_attrs(getframe(), var, rolwin, "_rollwin") return rolwin
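# For comparison only: a similar nan-tolerant rolling median is available
# directly from pandas. This is not a drop-in replacement - the edge padding
# above differs from pandas' min_periods behaviour.
import numpy as np
import pandas as pd

arr = np.array([1.0, np.nan, 2.0, 40.0, 2.5, 2.4, np.nan, 2.2])
rolled = pd.Series(arr).rolling(3, center=True, min_periods=1).median().to_numpy()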
def fluorescence_dark_count(flr, depth): """ Calculates an in situ dark count from the fluorescence sensor. The in situ dark count for the fluorescence sensor is calculated from the 5th percentile between 300 and 400m. Parameters ---------- flr: numpy.ndarray or pandas.Series The fluorescence array after factory calibration. depth: numpy.ndarray or pandas.Series The depth array in metres. Returns ------- flr: numpy.ndarray or pandas.Series The fluorescence data corrected for the in situ dark value. """ from numpy import nanpercentile, isnan, array mask = (depth > 300) & (depth < 400) flr_dark = array(flr) if (~isnan(flr_dark[mask])).sum() == 0: raise UserWarning( '\nThere are no fluorescence measurements between ' '300 and 400 metres.\nThe dark count correction ' "cannot be made and fluorescence data can't be processed.") dark_pctl5 = nanpercentile(flr_dark[mask], 5) flr_dark -= dark_pctl5 flr_dark[flr_dark < 0] = 0 flr_dark = transfer_nc_attrs(getframe(), flr, flr_dark, '_dark') return flr_dark
def time_average_per_dive(dives, time): """ Gets the average time stamp per dive. This is used to create pseudo discrete time steps per dive for plotting data (using time as x-axis variable). Parameters ---------- dives : np.array, dtype=float, shape=[n, ] discrete dive numbers (down = d.0; up = d.5) that matches time length time : np.array, dtype=datetime64, shape=[n, ] time stamp for each observed measurement Returns ------- time_average_per_dive : np.array, dtype=datetime64, shape=[n, ] each dive will have the average time stamp of that dive. Can be used for plotting where time_average_per_dive is set as the x-axis. """ from numpy import array, datetime64, nanmean from pandas import Series atime = array(time) dives = array(dives) if isinstance(atime[0], datetime64): t = atime.astype("datetime64[s]").astype(float) else: t = atime t_grp = Series(t).groupby(dives) t_mid = nanmean([t_grp.max(), t_grp.min()], axis=0) t_ser = Series(t_mid, index=t_grp.mean().index.values) diveavg = t_ser.reindex(index=dives).values diveavg = diveavg.astype("datetime64[s]") diveavg = transfer_nc_attrs(getframe(), time, diveavg, "_diveavg") return diveavg
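# Stand-alone sketch of the per-dive mid-time computed above, using pandas
# groupby on synthetic timestamps (the midpoint is (min + max) / 2 per dive).
import numpy as np
import pandas as pd

time = pd.date_range("2019-03-01", periods=6, freq="60min").values
dives = np.array([1.0, 1.0, 1.5, 1.5, 2.0, 2.0])

seconds = pd.Series(time.astype("datetime64[s]").astype("int64"), index=dives)
grp = seconds.groupby(level=0)
mid = pd.to_datetime((grp.min() + grp.max()) // 2, unit="s")   # one timestamp per dive
per_sample = mid.reindex(dives).to_numpy()                     # broadcast back to sample level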
def backscatter_dark_count(bbp, depth): """ Calculates an in situ dark count from the backscatter sensor. The in situ dark count for the backscatter sensor is calculated from the 5th percentile between 200 and 400m. Parameters ---------- bbp: numpy.ndarray or pandas.Series The total backscatter array after factory calibration in m-1. depth: numpy.ndarray or pandas.Series The depth array in metres. Returns ------- bbp: numpy.ndarray or pandas.Series The total backscatter data corrected for the in situ dark value. """ from numpy import nanpercentile, isnan, array bbp_dark = array(bbp) mask = (depth > 200) & (depth < 400) if (~isnan(bbp[mask])).sum() == 0: raise UserWarning('There are no backscatter measurements between 200 ' 'and 400 metres. The dark count correction cannot be ' "made and backscatter data can't be processed.") dark_pctl5 = nanpercentile(bbp_dark[mask], 5) bbp_dark -= dark_pctl5 bbp_dark[bbp_dark < 0] = 0 bbp_dark = transfer_nc_attrs(getframe(), bbp, bbp_dark, '_dark') return bbp_dark
def quenching_correction( flr, bbp, dives, depth, time, lat, lon, max_photic_depth=100, night_day_group=True, surface_layer=5, sunrise_sunset_offset=1, ): """ Corrects the fluorescence data based upon Thomalla et al. (2017). The function calculates the quenching depth and performs the quenching correction based on the fluorescence to backscatter ratio. The quenching depth is calculated based upon the different between night and daytime fluorescence. The default setting is for the preceding night to be used to correct the following day's quenching (`night_day_group=True`). This can be changed so that the following night is used to correct the preceding day. The quenching depth is then found from the difference between the night and daytime fluorescence, using the steepest gradient of the {5 minimum differences and the points the difference changes sign (+ve/-ve)}. The function gets the backscatter/fluorescence ratio between from the quenching depth to the surface, and then calculates a mean nighttime ratio for each night. The quenching ratio is calculated from the nighttime ratio and the daytime ratio, which is then applied to fluorescence to correct for quenching. If the corrected value is less than raw, then the function will return the original raw data. Parameters ---------- flr: numpy.ndarray or pandas.Series fluorescence data after cleaning and factory calibration conversion bbp: numpy.ndarray or pandas.Series Total backscatter after cleaning and factory calibration conversion dives: numpy.ndarray or pandas.Series The dive count (round is down dives, 0.5 is up dives). depth: numpy.ndarray or pandas.Series The depth array in metres. time: numpy.ndarray or pandas.Series The date & time array in a numpy.datetime64 format. lat: numpy.ndarray or pandas.Series The latitude of the glider position. lon: numpy.ndarray or pandas.Series The longitude of the glider position. max_photic_depth: int Limit the quenching correction to depth less than a given value [100]. night_day_group: bool If True, use preceding night otherwise use following night for calculating the flr:bbp ratio. surface_layer: int The surface depth that is omitted from the correction calculations (metres) sunrise_sunset_offset: int The delayed onset and recovery of quenching in hours [1] (assumes symmetrical). Returns ------- flr_corrected: numpy.ndarray or pandas.Series The fluorescence data corrected for quenching. quenching layer: bool A boolean mask of where the fluorescence is quenched. """ import numpy as np import pandas as pd from scipy.interpolate import Rbf from .cleaning import rolling_window def grad_min(depth, fluor_diff, surface_layer=5): """ TODO: need to refine this function. 
Doesn't always correct to the deepest quenching point Quenching depth for a day/night fluorescence difference INPUT: depth and fluorescence as pd.Series or np.ndarray surface_layer [5] is the depth to search for the reference in the gradient OUPUT: Quenching layer as a boolean mask """ if depth.size <= surface_layer: return np.zeros(depth.size).astype(bool) x = np.array(depth) y = rolling_window(np.array(fluor_diff), np.nanmean, 5) s = x < surface_layer # surface data to the top 5 metres mask = np.zeros(depth.size).astype(bool) # get the smallest 5 points and where the difference crosses 0 small5 = np.argsort(np.abs(y))[:5] cross0 = np.where(np.r_[False, np.diff((y) > 0)])[0] # combine the indicies i = np.unique(np.r_[small5, cross0]) # the max in the surface as a reference if not s.sum(): return mask j = y[s].argmax() # calculate the gradient of the selected points to the reference grad = (y[s][j] - y[i]) / (x[s][j] - x[i]) # If there are only nans in the gradient return only nans if np.isnan(grad).all(): return mask # get the index of the steepest gradient (min) grad_min_i = i[np.nanargmin(grad)] # fill the mask with True values above the quenching depth mask[0:grad_min_i] = True # on up dives the array is backwards so reverse the mask if x[-1] < x[0]: mask = ~mask # If the majority of the points in the selected region are # negative (night < day) then return an empty mask return mask var = flr.copy() # create a copy for netCDF attrs preservation flr = np.array(flr) bbp = np.array(bbp) dives = np.array(dives) depth = np.array(depth) time = np.array(time) lat = np.array(lat) lon = np.array(lon) # ############################ # # GENERATE DAY/NIGHT BATCHES # # ############################ # sunrise, sunset = sunset_sunrise(time, lat, lon) offset = np.timedelta64(sunrise_sunset_offset, 'h') # creating quenching correction batches, where a batch is a night and the # following day day = (time > (sunrise + offset)) & (time < (sunset + offset)) # find day and night transitions daynight_transitions = np.abs(np.diff(day.astype(int))) # get the cumulative sum of daynight to generate separate batches for day # and night daynight_batches = daynight_transitions.cumsum() # now get the batches with padded 0 to account for the diff # also add a bool that makes night_day or day_night batches batch = np.r_[0, (daynight_batches + night_day_group) // 2] isday = (np.r_[0, daynight_batches] / 2 % 1) == 0 # ######################## # # GET NIGHTTIME AVERAGES # # ######################## # # blank arrays to be filled flr_night, bbp_night = flr.copy(), bbp.copy() # create a dataframe with fluorescence and backscatter df = pd.DataFrame(np.c_[flr, bbp], columns=['flr', 'bbp']) # get the binned averages for each batch and select the night night_ave = df.groupby([day, batch, np.around(depth)]).mean() night_ave = night_ave.dropna().loc[False] # A second group where only batches are grouped grp_batch = df.groupby(batch) # GETTING NIGHTTIME AVERAGE FOR NONGRIDDED DATA - USE RBF INTERPOLATION for b in np.unique(night_ave.index.labels[0]): i = grp_batch.groups[b].values # batch index j = i[~np.isnan(flr[i]) & (depth[i] < 400)] # index without nans x = night_ave.loc[b].index.values # batch depth y = night_ave.loc[b] # batch flr and bbp if y.flr.isna().all() | y.bbp.isna().all(): continue elif y.flr.size <= 2: continue # radial basis functions with a smoothing factor f1 = Rbf(x, y.flr.values, function='linear', smooth=20) f2 = Rbf(x, y.bbp.values, function='linear', smooth=20) # interpolation function is used to find flr 
and bbp for all # nighttime fluorescence flr_night[j] = f1(depth[j]) bbp_night[j] = f2(depth[j]) # calculate the difference between average nighttime - and fluorescence fluor_diff = flr_night - flr # ################################ # # FIND THE QUENCHING DEPTH LAYER # # ################################ # # create a "photic layer" mask to which calc will be limited daytime, # shalower than [100m] and fluoresence is quenched relative to night photic_layer = isday & (depth < max_photic_depth) & (fluor_diff > 0) # blank array to be filled quenching_layer = np.zeros(depth.size).astype(bool) # create a grouped dataset by dives to find the depth of quenching cols = np.c_[depth, fluor_diff, dives][photic_layer] grp = pd.DataFrame(cols, columns=['depth', 'flr_dif', 'dives']) grp = grp.groupby('dives') # apply the minimum gradient algorithm to each dive quench_mask = grp.apply(lambda df: grad_min(df.depth, df.flr_dif)) # fill the quench_layer subscripted to the photic layer quenching_layer[photic_layer] = np.concatenate([l for l in quench_mask]) # ################################### # # DO THE QUENCHING CORRECTION MAGIC # # ################################### # # a copy of fluorescence to be filled with quenching corrected data flr_corrected = flr.copy() # nighttime backscatter to fluorescence ratio flr_bb_night = flr_night / bbp_night # quenching ratio for nighttime quench_ratio = flr_bb_night * bbp / flr # apply the quenching ratio to the fluorescence quench_corrected = flr * quench_ratio # if unquenched data is corrected return the original data mask = quench_corrected < flr quench_corrected[mask] = flr[mask] # fill the array with queching corrected data in the quenching layer only flr_corrected[quenching_layer] = quench_corrected[quenching_layer] flr_corrected = transfer_nc_attrs(getframe(), var, flr_corrected, 'flr_quench_corrected', units='RFU') quenching_layer = transfer_nc_attrs(getframe(), var, quenching_layer, 'quench_layer', units='') return flr_corrected, quenching_layer
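# Simplified stand-alone illustration of the batching trick used in
# quenching_correction(): the cumulative sum of day/night transitions labels
# each contiguous block, and integer division pairs adjacent blocks into
# night+day batches. The `day` flags here are synthetic; the real function
# derives them from sunrise/sunset times and an offset.
import numpy as np

day = np.array([False, False, True, True, False, False, True, True])
blocks = np.r_[0, np.abs(np.diff(day.astype(int))).cumsum()]   # 0,0,1,1,2,2,3,3
batches = (blocks + 1) // 2                                    # pairs adjacent blocks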
def par_fill_surface(par, dives, depth, max_curve_depth=100): """ Algebraically calculates the top 5 metres of the par profile. The function removes the top 5 metres of par data, and then using an exponential equation calculates the complete profile. Parameters ---------- par: numpy.ndarray or pandas.Series The par data with units uE/m2/sec. dives: numpy.ndarray or pandas.Series The dive count (round is down dives, 0.5 is up dives). depth: numpy.ndarray or pandas.Series The depth array in metres. max_curve_depth: int The maximum depth of which to fit the exponential function. Returns ------- par_filled: numpy.ndarray or pandas.Series The par data with the algebraically calculated top 5 metres. """ from scipy.optimize import curve_fit import numpy as np def dive_par_fit(depth, par): def exp_func(x, a, b): return a * np.exp(b * x) xj, yj = depth, par mask = ~(np.isnan(xj) | np.isnan(yj)) & (xj < max_curve_depth) xm, ym = xj[mask], yj[mask] if all(ym == 0) | (mask.sum() <= 2): yj_hat = np.ones_like(depth) * np.nan else: try: [a, b], _ = curve_fit(exp_func, xm, ym, p0=(500, -0.03), maxfev=1000) yj_hat = exp_func(xj, a, b) except RuntimeError: yj_hat = np.ones_like(depth) * np.nan return yj_hat var = par.copy() par = np.array(par) dives = np.array(dives) depth = np.array(depth) par_filled = np.ones_like(depth) * np.nan for d in np.unique(dives): i = dives == d par_fit = dive_par_fit(depth[i], par[i]) par_filled[i] = par_fit par_filled = transfer_nc_attrs(getframe(), var, par_filled, 'par_expfill') return par_filled
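# Stand-alone sketch of the exponential fit used above to rebuild the
# near-surface PAR profile: scipy.optimize.curve_fit with the same
# a * exp(b * z) form. Profile values and the initial guess are illustrative.
import numpy as np
from scipy.optimize import curve_fit

def exp_func(z, a, b):
    return a * np.exp(b * z)

z = np.linspace(0, 100, 60)
par = 600 * np.exp(-0.04 * z) + np.random.default_rng(1).normal(0, 5, z.size)

(a, b), _ = curve_fit(exp_func, z, par, p0=(500, -0.03), maxfev=1000)
par_theoretical = exp_func(z, a, b)   # replaces the contaminated surface values with the fitted curve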
def bottle_matchup( gld_dives, gld_depth, gld_time, btl_depth, btl_time, btl_values, min_depth_diff_metres=5, min_time_diff_minutes=120, ): """ Performs a matchup between glider and bottle samples based on time and depth (or density). Parameters ---------- gld_depth : np.array, dtype=float glider depth at time of measurement gld_dives : np.array, dtype=float dive index of the glider (given by glider toolbox) gld_time : np.array, dtype=datetime64 glider time that will be used as primary indexing variable btl_time: np.array, dtype=datetime64 in-situ bottle sample's time btl_depth : np.array, dtype=float depth of in-situ sample btl_values : np.array, dtype=float the value that will be interpolated onto the glider time and depth coordinates (time, depth/dens) min_depth_diff_metres : float, default=5 the maximum depth difference allowed between a bottle and a glider sample for a match min_time_diff_minutes : float, default=120 the maximum time difference allowed between a bottle station and a glider dive for a match Returns ------- array : float Returns the bottle values in the format of the glider i.e. the length of the output will be the same as gld_* """ from pandas import Series # metadata preservation var = gld_depth.copy() if isinstance(btl_values, Series): var_name = btl_values.name + "_bottle_matchups" else: var_name = "bottle_matchups" # make all input variables np.arrays args = gld_time, gld_depth, gld_dives, btl_time, btl_depth, btl_values gld_time, gld_depth, gld_dives, btl_time, btl_depth, btl_values = map( _np.array, args ) # create a blank array that matches glider data # (placeholder for calibration bottle values) gld_cal = _np.ones_like(gld_depth) * _np.nan # loop through each ship based CTD station stations = _np.unique(btl_time) for c, t in enumerate(stations): # index of station from ship CTD btl_idx = t == btl_time # number of samples per station btl_num = btl_idx.sum() # string representation of station time t_str = str(t.astype("datetime64[m]")).replace("T", " ") t_dif = abs(gld_time - t).astype("timedelta64[m]").astype(float) # loop through depths for the station if t_dif.min() < min_time_diff_minutes: # index of dive where minimum difference occurs i = _np.where(gld_dives[_np.nanargmin(t_dif)] == gld_dives)[0] n_depths = 0 for depth in btl_depth[btl_idx]: # an index for bottle where depth and station match j = btl_idx & (depth == btl_depth) # depth difference for glider profile d_dif = abs(gld_depth - depth)[i] # only match depth if diff is less than given threshold if _np.nanmin(d_dif) < min_depth_diff_metres: # index of min diff for this dive k = i[_np.nanargmin(d_dif)] # assign the bottle values to the calibration output gld_cal[k] = btl_values[j] n_depths += 1 print( ( "[stn {}/{}] SUCCESS: {} ({} of {} samples) match-up " "within {} minutes" ).format(c, stations.size, t_str, n_depths, btl_num, t_dif.min()) ) else: print( ( "[stn {}/{}] FAILED: {} Couldn't find samples within " "constraints" ).format(c, stations.size, t_str) ) attrs = dict(units="", positive="", comment="", standard_name="", axis="") gld_cal = transfer_nc_attrs(getframe(), var, gld_cal, var_name, **attrs) return gld_cal
def horizontal_diff_outliers(dives, depth, arr, multiplier=1.5, depth_threshold=450, mask_frac=0.2): """ Find z-score outliers on the horizontal, where the threshold is set by ``multiplier``. Can be limited below a certain depth. The function uses the horizontal gradient as a threshold, below a defined depth threshold to find outliers. Useful to identify when a variable at depth is not the same as neighbouring values. Parameters ---------- dives: numpy.ndarray or pandas.Series The dive count (round is down dives, 0.5 is up dives) depth: numpy.ndarray or pandas.Series The depth array in metres arr: numpy.ndarray or pandas.Series Array of data variable for cleaning to be performed on. multiplier: float A z-score threshold depth_threshold: int Outliers will be identified below this depth value to the max depth value of the dive. mask_frac: float When the ratio of bad values per dive is greater than this value, then the dive will be masked. Returns ------- mask A mask of dives where the bad values per dive ratio is greater than mask_frac. """ from numpy import abs, arange, array, inf, nanmean, nanstd from .mapping import grid_data var = arr.copy() dives = array(dives) depth = array(depth) arr = array(arr) # grid data so that the horizontal rolling median can be calculated # we use a window of 3 to find only "horizontal spikes" gridded = grid_data( dives, depth, array(arr), bins=arange(0, depth.max(), 1), verbose=False, return_xarray=False, ) median = gridded.rolling(3, axis=1, center=True, min_periods=2).median() # get zscore of the difference between the median and the raw data diff = gridded - median zdiff = abs(diff - nanmean(diff)) / nanstd(diff) # flag values where the z-score exceeds the multiplier threshold # note that this is based on the global horizontal diff # but is only applied below the depth threshold # this means that the surface data sets a higher limit deep_outlier = zdiff.loc[depth_threshold:] >= multiplier # get the ratio of bad values per dive and mask if it # exceeds a user defined fraction deep_outlier_count = deep_outlier.sum() deep_obs_num = gridded.shape[0] - depth_threshold # assumes bin of 1m deep_outlier_ratio = deep_outlier_count / deep_obs_num # finds the index where dives exceed the mask_frac threshold i = deep_outlier_ratio > mask_frac deep_outlier_dives = i[i].index.values mask = arr < -inf # create a dummy mask for d in deep_outlier_dives: i = dives == d mask[i] = True baddives = mask_bad_dive_fraction(mask, dives, arr, mask_frac=mask_frac)[0] out = transfer_nc_attrs(getframe(), var, baddives, "_horzOutlierSTD") return out
def calc_physics( variable, dives, depth, spike_window=3, spike_method="minmax", iqr=1.5, depth_threshold=400, mask_frac=0.2, savitzky_golay_window=11, savitzky_golay_order=2, verbose=True, name="Physics Variable", ): """ A standard setup for processing physics variables (temperature, salinity). The function applies a neighbourhood interquartile range (IQR) outlier filter, the Briggs et al. (2011) spike filter followed by a Savitzky-Golay smoothing function. The Savitzky-Golay filter is demonstrated well on wikipedia: https://en.wikipedia.org/wiki/Savitzky-Golay_filter """ from numpy import array, isnan from .cleaning import ( despike, horizontal_diff_outliers, outlier_bounds_iqr, savitzky_golay, ) # an interpolation step is added so that no nans are created. # Note that this interpolates on the flattened series var = variable.copy() # attribute preservation x = array(dives) y = array(depth) z = array(variable) printv(verbose, "\n" + "=" * 50 + "\n{}:".format(name)) if iqr: nans_before = isnan(z).sum() z = outlier_bounds_iqr(z, multiplier=iqr) nans_after = isnan(z).sum() n_masked = nans_after - nans_before printv( verbose, "\tRemoving outliers with IQR * {}: {} obs".format(iqr, n_masked), ) if spike_window: z = despike(z, spike_window, spike_method)[0] printv( verbose, "\tRemoving spikes with rolling median (spike window={})".format( spike_window), ) if depth_threshold: z = horizontal_diff_outliers(x, y, z, iqr, depth_threshold, mask_frac) printv( verbose, ("\tRemoving horizontal outliers " "(fraction={}, multiplier={})").format(mask_frac, iqr), ) if savitzky_golay_window: printv( verbose, ("\tSmoothing with Savitzky-Golay filter " "(window={}, order={})").format(savitzky_golay_window, savitzky_golay_order), ) z = savitzky_golay(z, savitzky_golay_window, savitzky_golay_order) z = transfer_nc_attrs(getframe(), var, z, "_processed") return z
def savitzky_golay(var, window_size, order, deriv=0, rate=1, interpolate=True): """ Smooth (and optionally differentiate) data with a Savitzky-Golay filter. The Savitzky-Golay filter removes high frequency noise from data [1]_. It has the advantage of preserving the original shape and features of the signal better than other types of filtering approaches, such as moving averages techniques. By default, nans in the array are interpolated with a limit set to the window size of the dataset before smoothing. The nans are inserted back into the dataset after the convolution. This limits the loss of data over blocks where there are nans. This can be switched off with the `interpolate` keyword argument. Parameters ---------- var : array, dtype=float, shape=[n, ] the values of the time history of the signal. window_size : int the length of the window. Must be an odd integer number. order : int the order of the polynomial used in the filtering. Must be less than `window_size` - 1. deriv : int the order of the derivative to compute (default = 0 means only smoothing) rate : int=1 the sampling rate of the signal; only used to scale the output when a derivative is computed (deriv > 0). interpolate : bool=True By default, nans in the array are interpolated with a limit set to the window size of the dataset before smoothing. The nans are inserted back into the dataset after the convolution. This limits the loss of data over blocks where there are nans. This can be switched off with the `interpolate` keyword argument. Returns ------- ys : ndarray, shape (N) the smoothed signal (or its n-th derivative). Notes ----- The Savitzky-Golay is a type of low-pass filter, particularly suited for smoothing noisy data. The main idea behind this approach is to make for each point a least-square fit with a polynomial of high order over an odd-sized window centered at the point [2]_. Examples -------- >>> t = linspace(-4, 4, 500) >>> y = exp(-t**2) + random.normal(0, 0.05, t.shape) >>> ysg = savitzky_golay(y, window_size=31, order=4) >>> import matplotlib.pyplot as plt >>> plt.plot(t, y, label='Noisy signal') >>> plt.plot(t, exp(-t**2), 'k', lw=1.5, label='Original signal') >>> plt.plot(t, ysg, 'r', label='Filtered signal') >>> plt.legend() >>> plt.show() References ---------- .. [1] A. Savitzky, M. J. E. Golay, Smoothing and Differentiation of Data by Simplified Least Squares Procedures. Analytical Chemistry, 1964, 36 (8), pp 1627-1639. .. [2] Numerical Recipes 3rd Edition: The Art of Scientific Computing W.H. Press, S.A. Teukolsky, W.T. Vetterling, B.P. Flannery Cambridge University Press ISBN-13: 9780521880688 """ from math import factorial from numpy import abs, array, concatenate, convolve, isnan, linalg, mat, nan from pandas import Series # sorting out window stuff arr = array(var) try: window_size = abs(int(window_size)) order = abs(int(order)) except ValueError: raise ValueError("window_size and order have to be of type int") if window_size % 2 != 1 or window_size < 1: raise TypeError("window_size must be a positive odd number") if window_size < order + 2: raise TypeError("window_size is too small for the polynomial order") order_range = range(order + 1) half_window = (window_size - 1) // 2 # allow to interpolate for the window size if interpolate: ser = Series(arr).interpolate() y = array(ser) else: y = array(arr) # precompute coefficients b = mat([[k**i for i in order_range] for k in range(-half_window, half_window + 1)]) m = linalg.pinv(b).A[deriv] * rate**deriv * factorial(deriv) # pad the signal at the extremes with # values taken from the signal itself firstvals = y[0] - abs(y[1:half_window + 1][::-1] - y[0]) lastvals = y[-1] + abs(y[-half_window - 1:-1][::-1] - y[-1]) y = concatenate((firstvals, y, lastvals)) savgol = convolve(m[::-1], y, mode="valid") oldnans = isnan(arr) savgol[oldnans] = nan savgol = transfer_nc_attrs(getframe(), var, savgol, "_savgolay") return savgol
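# For reference: scipy ships an equivalent filter. A nan-aware wrapper in the
# spirit of the function above could look like this sketch (edge handling
# differs from the manual convolution used above; long gaps remain NaN).
import numpy as np
import pandas as pd
from scipy.signal import savgol_filter

def savgol_nan_sketch(arr, window, order):
    filled = pd.Series(arr).interpolate(limit=window).to_numpy()   # bridge short gaps
    out = savgol_filter(filled, window, order)                     # scipy's Savitzky-Golay
    out[np.isnan(arr)] = np.nan                                    # re-insert the original gaps
    return out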
def calc_backscatter( bb_raw, tempC, salt, dives, depth, wavelength, dark_count, scale_factor, spike_window=7, spike_method="median", iqr=3, profiles_ref_depth=300, deep_multiplier=1, deep_method="median", return_figure=False, verbose=True, ): r""" The function processes the raw backscattering data in counts into total backscatter (bbp) in units of m-1. The function uses a series of steps to clean the data before applying the Zhang et al. (2009) functions to convert the data into total backscatter (bbp, m-1). The function uses functions from the flo_functions toolkit [1]_. The theta angle of sensors (124deg) and xfactor for theta 124 (1.076) are set values that should be updated if you are not using a WetLabs ECO BB2FL The following standard sequence is applied: 1. find IQR outliers (i.e. data values outside of the lower and upper limits calculated by cleaning.outlier_bounds_iqr) 2. find_bad_profiles (e.g. high values below 300 m are counted as bad profiles) 3. flo_scale_and_offset (factory scale and offset) 4. flo_bback_total (total backscatter based on Zhang et al. 2009) [2]_ 5. backscatter_dark_count (based on Briggs et al. 2011) [3]_ 6. despike (using Briggs et al. 2011 - rolling min--max) [3]_ Parameters ---------- bb_raw: np.array / pd.Series, dtype=float, shape=[n, ] The raw output from the backscatter channel in counts. tempC: np.array / pd.Series, dtype=float, shape=[n, ] The QC'd temperature data in degC. salt: np.array / pd.Series, dtype=float, shape=[n, ] The QC'd salinity in PSU. dives: np.array / pd.Series, dtype=float, shape=[n, ] The dive count (round is down dives, 0.5 is up dives). depth: np.array / pd.Series, dtype=float, shape=[n, ] The depth array in metres. wavelength: int The wavelength of the backscatter channel, e.g. 700 nm. dark_count: float The dark count factory values from the calibration sheet. scale_factor: float The scale factor factory values from the calibration sheet. spike_window: int The window size over which to run the despiking method. spike_method: str Whether to use a rolling median or combination of min+max filter as the despiking method. iqr: int Multiplier to determine the lower and upper limits of the interquartile range for outlier detection. profiles_ref_depth: int The depth threshold for optics.find_bad_profiles below which the median or mean is calculated for identifying outliers. deep_multiplier: int=1 The standard deviation multiplier for calculating outliers, i.e. :math:`\mu \pm \sigma \cdot[1]`. deep_method: str Whether to use the deep median or deep mean to determine bad profiles for optics.find_bad_profiles. return_figure: bool If True, will return a figure object that shows before and after the despiking was applied. verbose: bool If True, will print the progress of the processing function. Returns ------- baseline: np.array / pd.Series, dtype=float, shape=[n, ] The despiked backscatter baseline as defined in Briggs et al. (2011), with bad profiles masked. spikes: np.array / pd.Series, dtype=float, shape=[n, ] The backscatter spikes as defined in Briggs et al. (2011). figs: object The figure reporting the despiking and bad profiles. References ---------- .. [1] https://github.com/ooici/ion-functions Copyright (c) 2010, 2011 The Regents of the University of California .. [2] Zhang, X., Hu, L., & He, M. (2009). Scattering by pure seawater: Effect of salinity. Optics Express, 17(7), 5698. https://doi.org/10.1364/OE.17.005698 .. [3] Briggs, N., Perry, M. J., Cetinic, I., Lee, C., D'Asaro, E., Gray, A. M., & Rehm, E. (2011). High-resolution observations of aggregate flux during a sub-polar North Atlantic spring bloom. Deep-Sea Research Part I: Oceanographic Research Papers, 58(10), 1031–1039. https://doi.org/10.1016/j.dsr.2011.07.007 """ from numpy import array, count_nonzero, isnan, nan, unique from pandas import Series from . import flo_functions as ff from . import optics as op from .cleaning import despike, despiking_report, outlier_bounds_iqr var = bb_raw.copy() # metadata preservation bb_raw = Series(bb_raw.copy()) dives = array(dives) depth = array(depth) tempC = array(tempC) salt = array(salt) name = "bb{:.0f}".format(wavelength) theta = 124 # factory set angle of optical sensors xfactor = 1.076 # for theta 124 # Values taken from Sullivan et al. (2013) & Slade and Boss (2015) ref_depth = profiles_ref_depth stdev_multiplier = deep_multiplier method = deep_method dive_count = count_nonzero(unique(dives)) printv(verbose, "\n" + "=" * 50 + "\n{}:".format(name)) if iqr: nans_before = isnan(bb_raw).sum() bb_raw = outlier_bounds_iqr(bb_raw, multiplier=iqr) nans_after = isnan(bb_raw).sum() n_masked = nans_after - nans_before printv( verbose, "\tRemoving outliers with IQR * {}: {} obs".format(iqr, n_masked), ) printv( verbose, "\tMask bad profiles based on deep values (depth={}m)".format( ref_depth), ) bad_profiles = op.find_bad_profiles(dives, depth, bb_raw, ref_depth, stdev_multiplier, method) bb_raw[bad_profiles[0]] = nan bad_count = count_nonzero(bad_profiles[1]) printv( verbose, "\tNumber of bad profiles = {}/{}".format(bad_count, dive_count), ) printv(verbose, "\tZhang et al. (2009) correction") beta = ff.flo_scale_and_offset(bb_raw, dark_count, scale_factor) bbp = ff.flo_bback_total(beta, tempC, salt, theta, wavelength, xfactor) # This is from Briggs et al. (2011) printv(verbose, "\tDark count correction") bbp = op.backscatter_dark_count(bbp, depth) printv( verbose, "\tSpike identification (spike window={})".format(spike_window), ) baseline, spikes = despike(bbp, spike_window, spike_method=spike_method) baseline = Series(baseline, name="bb{:.0f}".format(wavelength)) baseline = transfer_nc_attrs( getframe(), var, baseline, name + "_baseline", units="units", standard_name="backscatter", ) spikes = transfer_nc_attrs( getframe(), var, spikes, name + "_spikes", units="units", standard_name="backscatter", ) if not return_figure: return baseline, spikes else: printv(verbose, "\tGenerating figure for despiking report") fig = despiking_report(dives, depth, bbp, baseline, spikes, name=name) return baseline, spikes, fig
def calc_fluorescence( flr_raw, bbp, dives, depth, time, lat, lon, dark_count, scale_factor, spike_window=7, spike_method="median", night_day_group=True, sunrise_sunset_offset=1, profiles_ref_depth=300, deep_multiplier=1, deep_method="median", return_figure=False, verbose=True, ): r""" This function processes raw fluorescence and corrects for quenching using the Thomalla et al. (2018) approach [1]_. The following standard sequence is applied: 1. find_bad_profiles (e.g. high Fluorescence in > 300 m water signals bad profile) 2. fluorescence_dark_count & scale factor (i.e. factory correction) 3. despike (using Briggs et al. 2011 - rolling min--max) 4. quenching_correction (corrects for quenching with Thomalla et al. 2017) Parameters ---------- flr_raw: np.array / pd.Series, dtype=float, shape=[n, ] The raw output of fluorescence data in instrument counts. bbp: np.array / pd.Series, dtype=float, shape=[n, ] The processed backscatter data from the less noisy channel, i.e. the one dataset with less spikes or bad profiles. dives: np.array / pd.Series, dtype=float, shape=[n, ] The dive count (round is down dives, 0.5 is up dives). depth: np.array / pd.Series, dtype=float, shape=[n, ] The depth array in metres. time: np.array / pd.Series, dtype=float, shape=[n, ] The date & time array in a numpy.datetime64 format. lat: np.array / pd.Series, dtype=float, shape=[n, ] The latitude of the glider position. lon: np.array / pd.Series, dtype=float, shape=[n, ] The longitude of the glider position. dark_count: float The dark count factory values from the calibration sheet. scale_factor: float The scale factor factory values from the calibration sheet. spike_window: int=7 The window size over which to run the despiking method. spike_method: str=median Whether to use a rolling median or combination of min+max filter as the despiking method. night_day_group: bool=True If True, use preceding night otherwise use following night for calculating the flr:bbp ratio. sunrise_sunset_offset: int=1 The delayed onset and recovery of quenching in hours [1] (assumes symmetrical). profiles_ref_depth: int=300 The depth threshold for optics.find_bad_profiles below which the median or mean is calculated for identifying outliers. deep_multiplier: int=1 The standard deviation multiplier for calculating outliers, i.e. mean ± std x [1]. deep_method: str='median' Whether to use the deep median or deep mean to determine bad profiles for optics.find_bad_profiles. return_figure: bool=False If True, will return a figure object that shows before and after the quenching correction was applied. verbose: bool=True If True, will print the progress of the processing function. Returns ------- baseline: array, dtype=float, shape=[n, ] The despiked + bad profile identified fluorescence that has not had the quenching correction applied. quench_corrected: array, dtype=float, shape=[n, ] The fluorescence data corrected for quenching. quench_layer: array, dtype=bool, shape=[n, ] The quenching layer as a mask. figs: object The figures reporting the despiking, bad profiles and quenching correction. References ---------- .. [1] Thomalla, S. J., Moutier, W., Ryan-Keogh, T. J., Gregor, L., & Schutt, J. (2018). An optimized method for correcting fluorescence quenching using optical backscattering on autonomous platforms. Limnology and Oceanography: Methods, 16(2), 132–144. https://doi.org/10.1002/lom3.10234 """ from numpy import array, count_nonzero, nan, unique from . 
import optics as op from .cleaning import despike, despiking_report var = flr_raw.copy() # metadata preservation flr_raw = array(flr_raw) bbp = array(bbp) dives = array(dives) depth = array(depth) time = array(time) lat = array(lat) lon = array(lon) ref_depth = profiles_ref_depth stdev_multiplier = deep_multiplier method = deep_method printv( verbose, ("\n" + "=" * 50 + "\nFluorescence\n\tMask bad profiles based on " "deep values (ref depth={}m)").format(ref_depth), ) bad_profiles = op.find_bad_profiles(dives, depth, flr_raw, ref_depth, stdev_multiplier, method) flr_raw[bad_profiles[0]] = nan bad_count = count_nonzero(bad_profiles[1]) dive_count = count_nonzero(unique(dives)) printv( verbose, "\tNumber of bad profiles = {}/{}".format(bad_count, dive_count), ) printv(verbose, "\tDark count correction") flr_raw -= dark_count flr_dark = op.fluorescence_dark_count(flr_raw, depth) flr_dark[flr_dark < 0] = nan baseline, spikes = despike(flr_dark, spike_window, spike_method=spike_method) printv(verbose, "\tQuenching correction") quench_corrected, quench_layer = op.quenching_correction( baseline, bbp, dives, depth, time, lat, lon, sunrise_sunset_offset=sunrise_sunset_offset, night_day_group=night_day_group, ) printv( verbose, "\tSpike identification (spike window={})".format(spike_window), ) baseline = transfer_nc_attrs( getframe(), var, baseline, "FLR_baseline", units="RFU", standard_name="", ) quench_corrected = transfer_nc_attrs( getframe(), var, quench_corrected, "FLR_quench_corrected", units="RFU", standard_name="fluorescence", ) quench_layer = transfer_nc_attrs( getframe(), var, quench_layer, "quenching_layer", units="", standard_name="", comment="", ) if return_figure: printv(verbose, "\tGenerating figures for despiking and quenching report") figs = (despiking_report( dives, depth, flr_raw, baseline.data, spikes, name="Fluorescence", ), ) figs += (op.quenching_report( baseline.data, quench_corrected.data, quench_layer, dives, depth, ), ) return baseline, quench_corrected, quench_layer, figs else: return baseline, quench_corrected, quench_layer
def calc_par( par_raw, dives, depth, time, scale_factor_wet_uEm2s, sensor_output_mV, curve_max_depth=80, verbose=True, ): """ Calculates the theoretical PAR based on an exponential curve fit. The processing steps are: 1. par_scaling (factory cal sheet scaling) 2. par_dark_count (correct deep par values to 0 using the median of deep night-time values) 3. par_fill_surface (return the theoretical curve of par based exponential fit) Parameters ---------- All inputs must be ungridded np.ndarray or pd.Series data par_raw : array, dtype=float, shape=[n, ] raw PAR dives : array, dtype=float, shape=[n, ] the dive count (round is down dives, 0.5 up dives) depth : array, dtype=float, shape=[n, ] in metres time : array, dtype=float, shape=[n, ] as a np.datetime64 array Returns ------- par_filled : array, dtype=float, shape=[n, ] PAR with filled surface values. """ from numpy import array from . import optics as op var = par_raw.copy() # metadata preservation par_raw = array(par_raw) dives = array(dives) depth = array(depth) time = array(time) printv(verbose, "\n" + "=" * 50 + "\nPAR\n\tDark correction") # dark correction for par par_scaled = op.par_scaling(par_raw, scale_factor_wet_uEm2s, sensor_output_mV) par_dark = op.par_dark_count(par_scaled, dives, depth, time) printv(verbose, "\tFitting exponential curve to data") par_filled = op.par_fill_surface(par_dark, dives, depth, max_curve_depth=curve_max_depth) par_filled[par_filled < 0] = 0 attrs = dict( standard_name="photosynthetically_available_radiation", units="uE/m2/s", comment="", ) par_filled = transfer_nc_attrs(getframe(), var, par_filled, "PAR_processed", **attrs) par_filled = par_filled.fillna(0) return par_filled
def interp_obj( # noqa: C901 x, y, z, xi, yi, partial_sill=0.1, nugget=0.01, lenscale_x=20, lenscale_y=20, detrend=True, max_points_per_quad=55, min_points_per_quad=8, return_error=False, n_cpus=None, verbose=True, parallel_chunk_size=512, ): """ Performs objective interpolation (or Kriging) of a 2D field. The objective interpolation breaks the problem into smaller fields by iteratively breaking the problem into quadrants. Each quadrant is then interpolated (also using intformation from its neighbours). The interpolation is inverse distance weighted using a gaussian kernel (or radial basis function). The kernel has a width of 12 hours if the x-dimension is time, otherwise scaled by the x-variable unit. The kernel is in meters assuming that depth is the y-coord. This can be changed with keyword arguements. An error estimate can also be calculated if requested. The following link provides good background on the Kriging procedure: http://desktop.arcgis.com/en/arcmap/10.3/tools/3d-analyst-toolbox/how-kriging-works.htm Parameters ---------- x : np.array | pd.series horizontal coordinates of the input data (same length as y, z) can be types float or datetime64 y : np.array | pd.series vertical coordinates of the input data (same length as x, z) z : np.array | pd.series values to be interoplated (same length as x, y) xi : np.array horizontal coordinates of the interpolation grid (must be 1D) can be types float or datetime64 yi : np.array | pd.series vertical coordinates of the interpolation grid (must be 1D) nugget : float [0.01] the error estimate due to sampling inaccuracy also known as the nugget in Kriging literature. This should be taken from the semivariogram partial_sill : float [0.1] represents the spatial covariance of the variable being interpolated. Should be estimated from the semivariogram. See Kriging literature for more information lenscale_x : float [20] horizontal length scale horizontal coordinate variable If dtype(x) is np.datetime64 (any format) then lenscale units is in hours. Otherwise if type(x). lenscale_y : float [20] horizontal length scale horizontal coordinate variable. max_points_per_quad : int [55] the data is divided into quadrants using a quadtree approach - iteratively dividing data into smaller quadrants using x and y coordinates. The algorithm stops splitting the data into quadrants when there are no quadrants exceeding the limit set with max_points_per_quad is. This is done to reduce the computational cost of the function. min_points_per_quad : int [8] sets the minimum number of points allowed in a neighbouring quadrant when creating the interpolation function for a particular quadrant. If the number of points is less than specified, the algorithm looks for neighbours of the neighbours to include more points in the interpolation. n_cpus : int [n - 1] use parallel computing. The quadrant calculations are spread across CPUs. Must be positive and > 0 parallel_chunk_size : int [512] the number of leaves that will be processed in parallel in one go. This is a memory saving feature. If your dataset is very large, parallel processing will use up a lot of memmory. Increasing the chunk size increases the memory requirements. 
verbose : bool [True] will print out information about the interpolation Returns ------- xr.Dataset Contains the following arrays: - z: interpolated values - variance: error estimate of the interpolation - weights: the quadtree weighting used to calculate the estimates - nugget: the nugget used in the interpolation - partial_sill: value used for the interpolation Note ---- The data may have semi-discrete artifacts. This is also present in the MATLAB output. Example ------- >>> xi = np.arange(time.values.min(), time.values.max(), 30, dtype='datetime64[m]') >>> yi = np.linspace(depth.min(), depth.max(), 1.) >>> interpolated = gt.mapping.interp_obj( time, depth, var, xi, yi, nugget=.0035, partial_sill=0.02, lenscale_x=80, lenscale_y=80, detrend=True) """ def get_detrend_model(x, y, z): model = linear_model.LinearRegression() model.fit(np.c_[x, y], z) return model import multiprocessing as mp from functools import partial from time import perf_counter as timer import xarray as xr from sklearn import linear_model if (n_cpus is None) | (n_cpus == 0): n_cpus = mp.cpu_count() - 1 if verbose: print("Starting Interpolation with quadtree optimal interpolation") print("----------------------------------------------------------") print("\nPreparing for interpolations:") zvar = z.copy() yvar = y.copy() xvar = x.copy() is_time_x = np.issubdtype(x.dtype, np.datetime64) is_time_xi = np.issubdtype(xi.dtype, np.datetime64) ymessage = "y-coordinates are not the same type (x={}, xi={})".format( y.dtype, yi.dtype) xmessage = "x-coordinates are not the same type (x={}, xi={})".format( x.dtype, xi.dtype) assert y.dtype == yi.dtype, ymessage assert (is_time_x + is_time_xi) != 1, xmessage if is_time_x: # convert data to hours if verbose: print("\tTime conversion") x = np.array(x).astype("datetime64[s]").astype(float) / 3600 xi = np.array(xi).astype("datetime64[s]").astype(float) / 3600 units_x = "hrs" else: units_x = "" if verbose: print("\tFinding and removing nans") nans = np.isnan(z) | np.isnan(x) | np.isnan(y) x, y, z = [np.array(a)[~nans] for a in [x, y, z]] # detrend data using linear regression if detrend: if verbose: print("\tRemoving data trend with linear regression") model = get_detrend_model(x, y, z) z_hat = model.predict(np.c_[x, y]) z -= z_hat else: if verbose: print("\tRemoving data mean") z_avg = np.nanmean(z) z -= z_avg if verbose: print("\tBuilding QuadTree") quad_tree = QuadTree(np.c_[x, y], max_points_per_quad=max_points_per_quad) xx, yy = np.array(np.meshgrid(xi, yi)).reshape(2, -1) leaves = quad_tree.leaves n = len(leaves) interp_info = "\n".join([ "\nInterpolation information:", "\tbasis points: {}".format(x.size), "\tinterp grid: {}, {}".format(xi.size, yi.size), "\tmax_points_per_quad: {}".format(max_points_per_quad), "\tmin_points_per_quad: {}".format(min_points_per_quad), "\tnumber of quads: {}".format(n), "\tdetrend_method: {}".format( "linear_regression" if detrend else "mean"), "\tpartial_sill: {}".format(partial_sill), "\tnugget: {}".format(nugget), "\tlengthscales: X = {} {}".format(lenscale_x, units_x), "\t Y = {} m".format(lenscale_y), ]) if verbose: print(interp_info) pool = mp.Pool(n_cpus) props = dict( z=z, xi=xx, yi=yy, nugget=nugget, partial_sill=partial_sill, lenscale_x=lenscale_x, lenscale_y=lenscale_y, min_points_per_quad=min_points_per_quad, return_error=return_error, verbose=verbose, ) func = partial(interp_leaf, **props) # predifining matricies for interpolation errors = np.ndarray(xx.size) * 0 weights = np.ndarray(xx.size) * 0 variable = np.ndarray(xx.size) * 0 # 
    # creating a timer to inform the user
    t0 = timer()
    # getting the indices used to split the data up into chunks
    chunk_idx = np.arange(0, n, parallel_chunk_size, dtype=int)
    n_chunks = chunk_idx.size
    if verbose:
        print("\nProcessing interpolation in {} parts over {} CPUs:".format(
            n_chunks, n_cpus))
    for c, i0 in enumerate(chunk_idx):
        i1 = i0 + parallel_chunk_size
        chunk_leaves = leaves[i0:i1]
        # do the parallel processing
        chunk_output = pool.map(func, chunk_leaves)
        # add the parallel chunk output to the output arrays
        for w, zi, er, ii in chunk_output:
            weights[ii] += w
            variable[ii] += zi
            errors[ii] += er
        # create info for the user
        t1 = timer()
        if verbose:
            print("\tchunk {}/{} completed in {:.0f}s".format(
                c + 1, n_chunks, t1 - t0))
        t0 = timer()
    # release the worker processes once all chunks are done
    pool.close()
    pool.join()

    # completing the interpolation
    if verbose:
        print("\nFinishing off interpolation")
    if detrend:
        if verbose:
            print("\tAdding back the trend")
        zi = (variable / weights) + model.predict(np.c_[xx, yy])
    else:
        if verbose:
            print("\tAdding back the average")
        zi = (variable / weights) + z_avg
    errors = errors / weights

    if verbose and is_time_x:
        print("\tTime conversion")
    xi = (xi * 3600).astype("datetime64[s]") if is_time_x else xi

    if verbose:
        print("\tCreating xarray dataset for output")
    xds = xr.Dataset(
        attrs={
            "description": (
                "interpolation output from the GliderTools.interp_obj "
                "function. Print out mapping_info for more details"),
            "mapping_info": interp_info,
        })
    props = dict(dims=["y", "x"], coords={"y": yi, "x": xi})
    xds["z"] = xr.DataArray(zi.reshape(yi.size, xi.size), **props)
    xds["weights"] = xr.DataArray(weights.reshape(yi.size, xi.size), **props)
    xds["variance"] = xr.DataArray(errors.reshape(yi.size, xi.size), **props)
    xds.attrs["nugget"] = nugget
    xds.attrs["partial_sill"] = partial_sill

    dummy = transfer_nc_attrs(getframe(), zvar, zvar, "_interp")
    if isinstance(zvar, xr.DataArray):
        xds["z"].attrs = dummy.attrs
        # xds = xds.rename({'z': dummy.name})
    if isinstance(yvar, xr.DataArray):
        xds["y"].attrs = yvar.attrs
        xds = xds.rename({"y": yvar.name})
    if isinstance(xvar, xr.DataArray):
        xds["x"].attrs = xvar.attrs
        xds = xds.rename({"x": xvar.name})

    return xds
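# ---------------------------------------------------------------------------
# The helper below is purely illustrative and not part of the GliderTools
# API. It is a minimal sketch of how interp_obj might be called on a glider
# section, assuming hypothetical `time`, `depth` and `temperature` columns in
# a pandas DataFrame; the nugget and partial_sill values are placeholders
# that should come from a semivariogram of your own data.
def _example_interp_obj(df):
    """Minimal usage sketch for interp_obj (illustrative only)."""
    import numpy as np

    time = df["time"].values
    depth = df["depth"].values
    temp = df["temperature"].values

    # target grid: 30-minute time steps and 1 m depth intervals
    xi = np.arange(time.min(), time.max(), 30, dtype="datetime64[m]")
    yi = np.arange(np.nanmin(depth), np.nanmax(depth), 1.0)

    return interp_obj(
        time, depth, temp, xi, yi,
        nugget=0.0035, partial_sill=0.02,  # placeholder semivariogram values
        lenscale_x=80, lenscale_y=80,
        detrend=True, verbose=False,
    )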
def grid_data(
    x,
    y,
    var,
    bins=None,
    how='mean',
    interp_lim=6,
    verbose=True,
    return_xarray=True,
):
    """
    Grids the input variable to bins for depth/dens (y) and time/dive (x).

    The bins can be specified to be non-uniform to adapt to variable
    sampling intervals of the profile. It is useful to use the
    ``gt.plot.bin_size`` function to identify the sampling intervals. The
    bins are averaged (mean) by default, but ``median``, ``std`` or
    ``count`` can also be used (see ``how``).

    Parameters
    ----------
    x : np.array, dtype=float, shape=[n, ]
        The horizontal values by which to bin; these need to be in a
        pseudo-discrete format already. Dive number or
        ``time_average_per_dive`` are the standard inputs for this
        variable. Has ``p`` unique values.
    y : np.array, dtype=float, shape=[n, ]
        The vertical values that will be binned; typically depth, but can
        also be density or any other variable.
    bins : np.array, dtype=float; shape=[q, ], default=[0 : 1 : max_depth]
        The bin edges for y. If not defined, bin edges are estimated from
        the vertical sampling frequency of the data (roughly one metre
        bins).
    how : str, default='mean'
        the string form of a function that can be applied to
        pandas.Groupby objects. These include ``mean, median, std, count``.
    interp_lim : int, default=6
        sets the maximum extent to which NaNs will be filled.

    Returns
    -------
    glider_section : xarray.DataArray, shape=[p, q]
        A 2D section, returned as an ``xarray.DataArray`` if
        ``return_xarray`` is True, otherwise as a ``pandas.DataFrame``.

    Raises
    ------
    UserWarning
        Triggers when ``x`` does not have pseudo-discrete values.
    """
    from pandas import cut, Series
    from xarray import DataArray
    from numpy import array, c_, unique, diff

    xvar, yvar = x.copy(), y.copy()
    z = Series(var)
    y = array(y)
    x = array(x)

    u = unique(x).size
    s = x.size
    if (u / s) > 0.2:
        raise UserWarning(
            'The x input array must be pseudo-discrete (dives or '
            'dive_time). {:.0f}% of x is unique (max 20% unique)'.format(
                u / s * 100))

    # note that the 50 m chunk depth assumes that y is depth; if y is
    # density (or another variable with a much smaller range) the automatic
    # bin estimate may not be appropriate
    chunk_depth = 50
    optimal_bins, avg_sample_freq = get_optimal_bins(y, chunk_depth)
    if bins is None:
        bins = optimal_bins

    # inform the user about the mean bin size relative to the vertical
    # sampling frequency (only meaningful when y is depth)
    if verbose:
        avg_bin_size = diff(bins).mean()
        print(('Mean bin size = {:.2f}\n'
               'Mean depth binned ({} m) vertical sampling frequency = '
               '{:.2f}').format(avg_bin_size, chunk_depth, avg_sample_freq))

    labels = c_[bins[:-1], bins[1:]].mean(axis=1)
    bins = cut(y, bins, labels=labels)

    grp = Series(z).groupby([x, bins])
    grp_agg = getattr(grp, how)()
    gridded = grp_agg.unstack(level=0)
    gridded = gridded.reindex(labels.astype(float))

    if interp_lim > 0:
        gridded = gridded.interpolate(limit=interp_lim).bfill(
            limit=interp_lim)

    if not return_xarray:
        return gridded

    if return_xarray:
        dummy = transfer_nc_attrs(getframe(), var, var, '_vert_binned')
        xda = gridded.stack().to_xarray()
        if isinstance(var, DataArray):
            xda.attrs = dummy.attrs
            xda.name = dummy.name
        if isinstance(yvar, DataArray):
            y = xda.dims[0]
            xda[y].attrs = yvar.attrs
            xda = xda.rename({y: yvar.name})
        if isinstance(xvar, DataArray):
            x = xda.dims[1]
            xda[x].attrs = xvar.attrs
            xda = xda.rename({x: xvar.name})

        return xda
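# ---------------------------------------------------------------------------
# The helper below is purely illustrative and not part of the GliderTools
# API. It sketches a typical call to grid_data, assuming hypothetical
# `dives`, `depth` and `temperature` arrays from a glider mission, where
# `dives` is pseudo-discrete (e.g. dive number or time averaged per dive).
def _example_grid_data(dives, depth, temperature):
    """Minimal usage sketch for grid_data (illustrative only)."""
    import numpy as np

    # explicit 1 m bin edges from the surface to the deepest measurement;
    # pass bins=None instead to let grid_data estimate bins from the data
    bins = np.arange(0, np.nanmax(depth) + 1, 1.0)

    return grid_data(
        dives, depth, temperature,
        bins=bins, how="mean",
        interp_lim=6, verbose=False,
    )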
def calc_oxygen(
    o2raw,
    pressure,
    salinity,
    temperature,
    auto_conversion=True,
    spike_window=7,
    spike_method="median",
    savitzky_golay_window=0,
    savitzky_golay_order=2,
    verbose=True,
):
    """
    This function processes oxygen.

    It is assumed that either mL/L or umol/kg are passed as input. The
    units are automatically detected from the median ratio of the raw data
    to the theoretical oxygen saturation. Below are some conversions to
    help with the oxygen units:

        µmol/l = µmol/kg * 1.025
        µmol/l = ml/l * 44.66
        µmol/l = mg/l * 31.25

    Parameters
    ----------
    o2raw : array, dtype=float, shape=[n, ]
        raw oxygen without unit conversion
    pressure : array, dtype=float, shape=[n, ]
    salinity : array, dtype=float, shape=[n, ]
    temperature : array, dtype=float, shape=[n, ]
    auto_conversion : bool=True
        tries to determine the unit of oxygen based on ``o2raw`` values.
        The user needs to do a manual conversion if False.
    spike_window : int=7
        rolling window size to apply for the ``cleaning.despike`` function.
    spike_method : string='median'
        can be 'median' or 'minmax'. See ``cleaning.despike`` for more info.
    savitzky_golay_window : int=0
        rolling window size for the ``cleaning.savitzky_golay`` function
    savitzky_golay_order : int=2
        polynomial order for the ``cleaning.savitzky_golay`` function
    verbose : bool=True
        print information about the processing steps

    Returns
    -------
    o2mll : array, dtype=float, shape=[n, ]
        oxygen concentration in mL/L (if unit auto_conversion is set True)
    o2pct : array, dtype=float, shape=[n, ]
        theoretical oxygen saturation percentage
    o2aou : array, dtype=float, shape=[n, ]
        apparent oxygen utilisation based on measured oxygen and oxygen
        saturation.

    Note
    ----
    To Do: Oxygen processing should have its own section to be consistent
    """
    import seawater as sw
    from numpy import abs, array, c_, isnan, median, ones
    from pandas import Series

    from .cleaning import despike, outlier_bounds_iqr, savitzky_golay

    var = o2raw.copy()  # metadata preservation
    if isinstance(o2raw, Series):
        name = o2raw.name
    else:
        name = "Oxygen"

    o2raw = array(o2raw)
    pressure = array(pressure)
    temperature = array(temperature)
    salinity = array(salinity)

    if spike_window:
        o2raw, _ = despike(o2raw, spike_window, spike_method)
        printv(
            verbose,
            "\n" + "=" * 50 + "\n{}:\n"
            "\tSmoothing data with despiking algorithm:\n\t"
            "    spike identification (spike window={})"
            "".format(name, spike_window),
        )

    if savitzky_golay_window:
        printv(
            verbose,
            ("\tSmoothing with Savitzky-Golay filter "
             "(window={}, order={})").format(savitzky_golay_window,
                                             savitzky_golay_order),
        )
        o2raw = savitzky_golay(o2raw, savitzky_golay_window,
                               savitzky_golay_order)

    o2sat = sw.satO2(salinity, temperature)
    density = sw.dens(salinity, temperature, pressure)

    if auto_conversion:
        # determine the oxygen unit from the ratio of raw near-surface
        # (< 20 dbar) oxygen to the theoretical saturation, which is
        # always in mL/L. The minimum difference between the observed
        # ratio and the known conversion factors is used to pick the
        # appropriate conversion.
        # clean the data first with basic cleaning
        surf = (pressure < 20) & ~isnan(o2raw) & ~isnan(o2sat)
        # prepare the data for the unit comparison
        Y = o2raw[surf].copy()
        X = c_[ones(surf.sum()), o2sat[surf]]

        # remove outliers according to the interquartile range
        ll, ul = outlier_bounds_iqr(Y, multiplier=1.5)
        m = (Y > ll) & (Y < ul)
        ratios = Y[m] / X[m, 1]

        # compare the observed ratio with the theoretical ratios
        observed_ratio = median(ratios)
        # the theoretical values have been divided by 1.025 to account for
        # the density of seawater
        theoretic_ratio = array([1, 43.5])
        ratio_diffs = abs(observed_ratio - theoretic_ratio)

        # catch if the difference is too big
        if ratio_diffs.min() > 10:
            printv(
                verbose,
                ("Oxygen unit could not be estimated automatically. "
                 "Do the unit conversion on the raw data before "
                 "passing it to the function. \n"
                 "Below is some info to help you\n"
                 "    µmol/l = µmol/kg * 1.025\n"
                 "    µmol/l = ml/l * 44.66\n"
                 "    µmol/l = mg/l * 31.25"),
            )
        # otherwise do the conversion
        else:
            unit_idx = ratio_diffs.argmin()
            if unit_idx == 0:
                unit = "mL/L"
                o2mll = array(o2raw)
            elif unit_idx == 2:
                unit = "mg/L"
                o2mll = array(o2raw) / 31.25 * (density / 1000)
            elif unit_idx == 1:
                unit = "umol/kg"
                o2mll = array(o2raw) / 44.66 * (density / 1000)
            else:
                printv(verbose, "Difference is {}".format(ratio_diffs))
            printv(verbose, "\tUnits automatically detected {}".format(unit))

            if ratio_diffs.min() > 5:
                print("\tWARNING: Confirm units manually as the value is "
                      "near the confidence threshold")

            o2aou = o2sat - o2mll
            o2pct = o2mll / o2sat * 100

            o2mll = transfer_nc_attrs(
                getframe(),
                var,
                o2mll,
                "o2mll",
                units="mL/L",
                comment="",
                standard_name="dissolved_oxygen",
            )
            o2aou = transfer_nc_attrs(
                getframe(),
                var,
                o2aou,
                "o2aou",
                units="mL/L",
                comment="",
                standard_name="apparent_oxygen_utilisation",
            )
            o2pct = transfer_nc_attrs(
                getframe(),
                var,
                o2pct,
                "o2pct",
                units="percent",
                comment="",
                standard_name="theoretical_oxygen_saturation",
            )

            return o2mll, o2pct, o2aou
    else:
        print("No oxygen conversion applied - the user must do the unit "
              "conversion before or after running the cleaning functions.")
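# ---------------------------------------------------------------------------
# The helper below is purely illustrative and not part of the GliderTools
# API. It sketches a typical call to calc_oxygen, assuming hypothetical
# `oxygen`, `pressure`, `salinity` and `temperature` arrays from a glider
# dataset, with oxygen in mL/L or umol/kg so that unit auto-detection works.
def _example_calc_oxygen(oxygen, pressure, salinity, temperature):
    """Minimal usage sketch for calc_oxygen (illustrative only)."""
    o2mll, o2pct, o2aou = calc_oxygen(
        oxygen, pressure, salinity, temperature,
        auto_conversion=True,        # try to detect the input units
        spike_window=7,              # despiking window (number of samples)
        spike_method="median",
        savitzky_golay_window=11,    # optional extra smoothing (0 disables)
        savitzky_golay_order=2,
        verbose=False,
    )
    return o2mll, o2pct, o2aou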