/
SeaGlider_utils.py
493 lines (400 loc) · 16.3 KB
/
SeaGlider_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
USAGE: from SeaGlider_utils import SeaGlider
sg = SeaGlider('/path/to/folder/')
For more info see help(SeaGlider)
written by Luke Gregor (lukegre@gmail.com)
"""
from __future__ import print_function as _pf
from netCDF4 import Dataset
from pandas import DataFrame, Series
import numpy as np
import warnings as _warnings
_warnings.filterwarnings("ignore", category=FutureWarning)
_warnings.filterwarnings("ignore", category=RuntimeWarning)
class SeaGlider:
    """Lazy-loading accessor for a directory of SeaGlider netCDF dive files.

    Parameters
    ----------
    expidition_directory : str
        A glob pattern that matches the per-dive netCDF files
        (e.g. ``'/path/to/folder/p54*.nc'``).  [sic: parameter name kept
        for backward compatibility with existing callers]
    config_dict : dict, optional
        Currently unused; accepted for interface compatibility.

    Every netCDF variable becomes an attribute of this object (press TAB to
    autocomplete); the shared per-dimension DataFrames live in ``.data``.
    """

    def __init__(self, expidition_directory, config_dict=None):
        from glob import glob

        dt64 = np.datetime64

        # setting up standard locs
        self.directory = expidition_directory
        self.files = np.sort(glob(self.directory))

        # first and last files bound the expedition's date coverage
        nc0 = Dataset(self.files[0])
        nc1 = Dataset(self.files[-1])

        # creating dimensions where data is stored; 'string_n' entries are
        # netCDF character-padding dimensions, not real data dimensions
        dims = np.array(list(nc0.dimensions.keys()))
        dims = dims[np.array([not d.startswith('string') for d in dims])]
        self.data = {}
        for key in dims:
            self.data[key] = DataFrame()
            self.data[key].var_count = 0
            self.data[key].dimension = key

        for key in nc0.variables:
            dim = nc0.variables[key].dimensions
            # dims can be a valid dimension, () or string_n
            if dim:  # catch empty tuples
                # there are dimensions that are just `string_n` placeholders
                if dim[0].startswith('string'):  # SeaGliderPointVariable
                    var_obj = _SeaGliderPointVariable(key, self.files)
                else:  # SeaGliderDiveVariable
                    dim_df = self.data[dim[0]]
                    dim_df.var_count += 1
                    var_obj = _SeaGliderDiveVariable(key, self.files, dim_df)
            else:  # SeaGliderPointVariable
                var_obj = _SeaGliderPointVariable(key, self.files)
            # expose the variable as an attribute for tab completion
            setattr(self, key, var_obj)

        t0 = dt64(nc0.getncattr('time_coverage_start').replace('Z', ''))
        t1 = dt64(nc1.getncattr('time_coverage_end').replace('Z', ''))
        self.date_range = np.array([t0, t1], dtype='datetime64[s]')

        nc0.close()
        nc1.close()

    def __getitem__(self, key):
        """Index the expedition.

        int  -> the key-th dive file opened as an ``xarray.Dataset``
        str  -> the variable accessor (same as attribute access)
        list -> load several variables at once (all entries must be str)

        Non-matching keys return an explanatory message string rather than
        raising (original design choice, preserved for compatibility).
        """
        if isinstance(key, int):
            # xarray is only needed for whole-file access, so import lazily
            # here instead of on every call
            from xarray import open_dataset
            return open_dataset(self.files[key])
        elif isinstance(key, list):
            if all(isinstance(k, str) for k in key):
                return self._load_multiple_vars(key)
            else:
                return "All arguments must be strings"
        elif isinstance(key, str):
            return getattr(self, key)
        else:
            return "Indexing with {} does not yet exist".format(key)

    def _load_multiple_vars(self, keys):
        # TO DO
        # This will load multple keys.
        # 1. find the dimensions for each of the keys and group accordingly
        # 2. iterate through groups and import with _read_nc_files
        # 3. assign each group to the appropriate dimension_dict
        pass

    def __repr__(self):
        """Human-readable summary: dive count, date range and dimensions."""
        dims = self.data
        dim_keys = list(dims.keys())

        string = ""
        string += "Number of dives: {}\n".format(self.files.size)
        string += "Date range: {} to {}".format(
            self.date_range[0], self.date_range[1]).replace('T', ' ')
        string += "\n"
        string += "Dimensions: access these DataFrames in `.data[dim_name]`\n"
        for key in dim_keys:
            string += "    • {} ({})\n".format(key, dims[key].var_count)
        string += "\n"
        string += "Access all variables directly from this object `.variable_name`\n"
        string += "[press TAB to autocomplete]"
        string += '\n\n'
        return string
class _SeaGliderDiveVariable:
def __init__(self, name, files, dimension_df):
nco = Dataset(files[0])
self.__data__ = dimension_df
self.__gridded__ = None
self.__files__ = files
self.name = name
self.attrs = dict(nco.variables[name].__dict__)
if 'coordinates' in self.attrs:
coordinates = self.attrs['coordinates'].split() + ['dives']
self.__data__.coordinates = coordinates
elif not hasattr(self.__data__, 'coordinates'):
self.__data__.coordinates = []
self.__data__.bins = np.arange(1000.)
nco.close()
def __getitem__(self, key):
data = self.load(return_data=True)
return data.loc[key]
@property
def gridded(self):
if self.__gridded__ is None:
self.__gridded__ = self.bin_depths()
return self.__gridded__
@property
def data(self):
return self.load(return_data=True)
@property
def series(self):
self.load()
return self.__data__.loc[:, self.name]
@property
def values(self):
self.load()
return self.__data__.loc[:, self.name].values
def load(self, return_data=False):
# neaten up the script by creating labels
data = self.__data__
keys = np.unique(data.coordinates + [self.name])
files = self.__files__
# get keys not in dataframe
missing = [k for k in filter(lambda k: k not in data, keys)]
if any(missing):
df = self._read_nc_files(files, missing)
# process coordinates - if no coordinates, just loops through
df = self._process_coords(df, files[0])
for col in df:
# inplace column creation
data[col] = df[col]
if return_data:
return data.loc[:, keys]
def __repr__(self):
string = ""
string += "Variable: {}\n".format(self.name)
string += "Number of dives: {}\n".format(self.__files__.size)
string += "Dimension: {}\n".format(self.__data__.dimension)
string += "Attributes:\n"
for key in self.attrs:
string += ' {0: <18}{1}\n'.format(key.capitalize() + ":", self.attrs[key])
string += '\n'
if self.name in self.__data__:
string += "Data is not interpolated/binned \n"
string += self.data.head().__repr__()
string += "\n{} measurements".format(self.data.shape[0])
else:
string += 'Data not loaded yet [.data]'
return string
def _read_nc_files(self, files, keys):
from tqdm import trange
if 'dives' in keys:
dives = True
keys.remove('dives')
else:
dives = False
data = []
for i in trange(files.size):
fname = files[i]
nc = Dataset(fname)
if any([k not in nc.variables for k in keys]):
message = "{} is missing some variables".format(fname)
print(message)
continue
arr = np.r_[[nc.variables[k][:] for k in keys]]
if dives:
arr = np.r_[arr, np.ones([1, arr.shape[1]]) * i]
nc.close()
data += arr.T,
data = np.concatenate(data)
cols = list(keys) + (['dives'] if dives else [])
df = DataFrame(data, columns=cols)
return df
def _process_coords(self, df, reference_file_name):
# TRY TO GET DEPTH AND TIME COORDS AUTOMATICALLY
for col in df.columns:
# DECODING TIMES IF PRESENT
if ('time' in col.lower()) | ('_secs' in col.lower()):
time = col
self.__data__.time_name = time
nco = Dataset(reference_file_name)
units = nco.variables[time].getncattr('units')
df[time + '_raw'] = df.loc[:, time].copy()
if 'seconds since 1970' in units:
df[time] = df.loc[:, time].astype('datetime64[s]')
else:
from xarray.conventions import decode_cf_datetime
df[time] = decode_cf_datetime(df.loc[:, time], units)
nco.close()
# INDEXING DIVES IF DEPTH PRESENT
if 'depth' in col.lower():
depth = col
self.__data__.depth_name = col
# INDEX UP AND DOWN DIVES
grp = df.groupby('dives')
dmax = grp[depth].apply(np.nanargmax)
idx = [grp.groups[i][dmax[i]] for i in grp.groups]
# create a dummy index 'up' that marks max depth
df['up'] = 0
df.loc[idx, 'up'] = 1
df['up'] = df.up.cumsum()
# average 'dives' and 'up' for a dive index
df['dives'] = (df['dives'] + df['up']) / 2.
df = df.drop('up', axis=1)
return df
def bin_depths(self, bins=None, how='mean', depth=None):
"""
This function bins the variable to set depths.
If no depth is specified it defaults to:
start : step : end
------------------
0 : 0.5 : 100
100 : 1.0 : 400
400 : 2.0 : max
------------------
This is the sampling method typically used by the CSIR.
This can be easily changed by specifying an array of increasing values.
The function used to bin the data can also be set, where the default
is the mean. This can be changed to 'median', 'std', 'count', etc...
Lastly, the data is stored as SeaGliderVariable.gridded for future
easy access. If you would like to regrid the data to another grid
use SeaGliderVariable.bin_depths().
"""
from pandas import cut
from scipy.stats import mode
# load the data if this has not been done
self.load()
data = self.__data__
if depth is None:
if hasattr(data, 'depth_name'):
depth = data.depth_name
else:
print('please provide depth column name as a kwarg')
return None
# assigning a bin value and make that the default for dimension
bins = data.bins if bins is None else bins
data.bins = bins
# The default bins only go to 1000m.
# Deepen with step same as last 10 values if needed
if bins.max() < self.data[depth].max():
step = mode(np.diff(bins[-10:])).mode
start = bins[-1] + step
stop = self.data[depth].max() + step
extended_bins = np.arange(start, stop, step)
bins = np.r_[bins, extended_bins]
#
labels = (bins[:-1] + bins[1:]) / 2.
dives = self.data.dives.values
bins = cut(self.data[depth], bins, labels=labels)
grp_a = self.data.groupby([dives, bins])
grp_agg = getattr(grp_a[self.name], how)()
gridded = grp_agg.unstack(level=0)
if hasattr(data, 'time_name'):
grp_b = self.data[depth].groupby(dives)
idx = grp_b.apply(lambda s: s.index[np.nanargmin(s)])
gridded.columns = data.loc[idx, data.time_name]
gridded.columns.name = 'surface_time'
if gridded.index.dtype.name == 'category':
index = gridded.index.astype(float)
gridded = gridded.set_index(index)
gridded.index.name = 'depth_bin_centre'
self.__gridded__ = gridded
return gridded
def scatter(self, **kwargs):
"""
Plot a scatter plot of the dives with x-time and y-depth and
c-variable.
The **kwargs can be anything that gets passed to plt.scatter.
Note that the colour is scaled to 1 and 99% of z.
"""
from matplotlib.pyplot import scatter, colorbar, subplots
self.load()
time = getattr(self.__data__, 'time_name', None)
if time is not None:
x = self.data.set_index(time).index.to_pydatetime()
else:
x = self.data.dives
y = self.data[self.__data__.depth_name]
z = np.ma.masked_invalid(self.data[self.name])
if "vmin" not in kwargs:
kwargs['vmin'] = np.percentile(z[~z.mask], 1)
if "vmax" not in kwargs:
kwargs['vmax'] = np.percentile(z[~z.mask], 99)
if 'linewidths' not in kwargs:
kwargs['linewidths'] = 0
fig, ax = subplots(1, 1, figsize=[11, 4])
im = scatter(x, y, 20, z, **kwargs)
cb = colorbar(mappable=im, pad=0.02, ax=ax)
ax.set_xlim(x.min(), x.max())
ax.set_ylim(y.max(), y.min())
ax.set_ylabel('Depth (m)')
ax.set_xlabel('Date')
vname = self.name.capitalize()
units = '({})'.format(
self.attrs['units']) if 'units' in self.attrs else ''
clabel = '{} {}'.format(vname, units)
cb.set_label(clabel, rotation=-90, va='bottom')
[tick.set_rotation(45) for tick in ax.get_xticklabels()]
fig.tight_layout()
return ax
def pcolormesh(self, interpolate_dist=0, **kwargs):
"""
Plot a section plot of the dives with x-time and y-depth and
z-variable. The data can be linearly interpolated to fill missing
depth values. The number of points to interpolate can be set with
interpolate_dist.
The **kwargs can be anything that gets passed to plt.pcolormesh.
Note that the colour is scaled to 1 and 99% of z.
"""
from matplotlib.pyplot import pcolormesh, colorbar, subplots
if interpolate_dist:
gridded = self.gridded.interpolate(limit=interpolate_dist)
else:
gridded = self.gridded
y = gridded.index.values
x = gridded.columns.values
z = np.ma.masked_invalid(gridded)
if "vmin" not in kwargs:
kwargs['vmin'] = np.percentile(z[~z.mask], 1)
if "vmax" not in kwargs:
kwargs['vmax'] = np.percentile(z[~z.mask], 99)
fig, ax = subplots(1, 1, figsize=[11, 4])
im = pcolormesh(x, y, z, **kwargs)
cb = colorbar(mappable=im, pad=0.02, ax=ax)
ax.set_xlim(x.min(), x.max())
ax.set_ylim(y.max(), y.min())
ax.set_ylabel('Depth (m)')
ax.set_xlabel('Date')
vname = self.name.capitalize()
units = '({})'.format(
self.attrs['units']) if 'units' in self.attrs else ''
clabel = '{} {}'.format(vname, units)
cb.set_label(clabel, rotation=-90, va='bottom')
[tick.set_rotation(45) for tick in ax.get_xticklabels()]
fig.tight_layout()
return ax
class _SeaGliderPointVariable:
def __init__(self, name, files):
nco = Dataset(files[0])
self.__files__ = files
self.__data__ = None
self.name = name
self.attrs = dict(nco.variables[name].__dict__)
nco.close()
@property
def data(self):
if self.__data__ is None:
self.load()
return self.__data__
def load(self):
if self.__data__ is None:
df = self._read_nc_files(self.__files__, self.name)
try:
self.__data__ = df.astype(float)
except ValueError:
self.__data__ = df
def __repr__(self):
string = ""
string += "Variable: {}\n".format(self.name)
string += "Number of dives: {}\n".format(self.__files__.size)
string += "Attributes:\n"
if self.attrs:
for key in self.attrs:
string += ' {0: <18}{1}\n'.format(key.capitalize() + ":", self.attrs[key])
else:
string += " No attributes for this variable\n"
string += '\n'
if self.__data__ is not None:
string += "Data is not interpolated/binned \n"
string += self.data.head().__repr__()
string += "\n{} measurements".format(self.data.shape[0])
else:
string += 'Data not loaded yet [.data]'
return string
def _read_nc_files(self, files, key):
from tqdm import trange
data = []
for i in trange(files.size):
fname = files[i]
nc = Dataset(fname)
arr = nc.variables[key][:].squeeze()
if arr.size == 1:
pass
else:
arr = ''.join(arr.astype(str))
nc.close()
data += arr,
df = Series(np.array(data), name=key)
return df
# Import-only module: no command-line behaviour is defined.
if __name__ == '__main__':
    pass