def align_datetime(data, dim='time', plev='plev', times=(0, 12), span=6, freq='12h', **kwargs):
    """Standardize datetimes to fixed launch times per day and resolve duplicates.

    Timestamps along ``dim`` are mapped to standard sounding hours
    (``times``) via ``ff.cal.fix_datetime`` (presumably snapping each
    timestamp to the nearest standard hour within ``+/- span`` hours —
    contract not visible here, confirm against ``ff.cal.fix_datetime``).
    When several original timestamps collapse onto the same standard time,
    the candidate closest in time (or, under certain conditions, the one
    with the most data counted over ``plev``) keeps the slot; the others
    are either shifted by +1 hour (exact-duplicate case) or reverted to
    their original timestamp. Two diagnostic variables are attached:
    ``delay`` (hours: original minus standardized time) and
    ``flag_stdtime`` (conflict-resolution flag).

    Args:
        data (DataArray, Dataset): Input data
        dim (str): datetime dimension name
        plev (str): pressure level dimension (used to count available data
            per profile when resolving conflicts)
        times (tuple): standard sounding hours, e.g. (0, 12)
        span (int): plus/minus window in hours (must be < freq/2)
        freq (str): frequency of output times, e.g. '12h'
        **kwargs: forwarded to ff.message (verbosity control, presumably)

    Returns:
        xarray.DataArray or xarray.Dataset: input with standardized
        datetime coordinate, added 'delay' and 'flag_stdtime' variables,
        sorted by ``dim`` if needed.

    Raises:
        ValueError: if the input is not a DataArray/Dataset, ``dim`` is
            missing, or ``times``/``span``/``freq`` are inconsistent.
    """
    import numpy as np
    from pandas import DatetimeIndex
    from xarray import DataArray, Dataset
    from .. import fun as ff

    # --- input validation --------------------------------------------------
    if not isinstance(data, (DataArray, Dataset)):
        raise ValueError('Requires a DataArray or Dataset', type(data))

    if dim not in data.dims:
        raise ValueError('Requires a datetime dimension', dim)

    # windows around adjacent standard times must not overlap
    if int(24 / (len(times) * 2)) < span:
        raise ValueError("Times and span do not agree!?", times, span)

    # number of standard times per day must match the output frequency
    # NOTE(review): freq[:-1] assumes a single trailing unit character
    # (e.g. '12h'); a freq like '12min' would break this — confirm callers.
    if int(24 / int(freq[:-1])) != len(times):
        raise ValueError("Times and freq do not match:", times, freq)

    if span > int(freq[:-1]) // 2:
        raise ValueError(
            "Frequency and Span need to be consistent (span < freq/2): ", freq, span)

    # keep the original timestamps; needed later for delays and reverts
    dates = data[dim].values.copy()
    #
    # Map every timestamp to a standard launch time
    # (vectorized wrapper around the scalar project helper)
    #
    _fix_datetime = np.vectorize(ff.cal.fix_datetime)
    newdates = _fix_datetime(dates, span=span)  # (profiled: ~33% of runtime)
    # per-timestamp conflict-resolution flag (see 'flag_stdtime' attrs below)
    resolution = np.zeros(newdates.size)
    #
    # check for duplicates in standard launch times
    #
    u, c = np.unique(newdates, return_counts=True)
    conflicts = u[c > 1]
    if conflicts.size > 0:
        # data counts per timestamp (over plev), used to rank candidates
        counts = _count_data(data, dim=dim, plev=plev)
        ff.message("Conflicts", conflicts.size, newdates.size, **kwargs)
        for i in conflicts:
            indices = np.where(newdates == i)[0]  # positions of this conflict (profiled: ~45%)
            #
            # Count available data (DataArray or Dataset)
            #
            # slow (kept for reference: per-conflict counting was too slow,
            # replaced by the single _count_data call above)
            # counts = data.isel(**{dim: indices}).count(plev).values
            # counts = _count_data(data.isel(**{dim: indices}), dim=dim, plev=plev)
            # slow end
            icounts = counts[indices]
            #
            # offsets (hours) of each candidate to the standard launch time
            #
            offset = np.abs((dates[indices] - i) / np.timedelta64(1, 'h'))
            j = np.argsort(offset)  # sort by time offset (closest first)
            jmax = np.argmax(icounts[j])  # sorted position of the candidate with most data
            if jmax != 0:
                #
                # a candidate other than the closest one has more level data
                #
                if (offset[j][0] + 1) <= offset[j][jmax]:
                    # ok close enough
                    # NOTE(review): this writes the *values* 0/jmax into jj at
                    # the positions where j equals them, i.e. it swaps entries
                    # of the inverse permutation, not positions 0 and jmax of
                    # j itself — verify this is the intended swap.
                    jj = j.copy()
                    jj[j == 0] = jmax  # first pos is now at the position of the maximum
                    jj[j == jmax] = 0  # maximum is now first
                    j = jj
                #
                # the other candidate has at least 2 more levels
                #
                elif (icounts[j][0] + 2) <= icounts[j][jmax]:
                    # a lot more data: same swap as above (same NOTE applies)
                    jj = j.copy()
                    jj[j == 0] = jmax  # first pos is now at the position of the maximum
                    jj[j == jmax] = 0  # maximum is now first
                    j = jj
                else:
                    pass  # keep time-offset ordering
            for m, k in enumerate(offset[j]):
                if m == 0:
                    continue  # this is the preferred candidate; keep as-is
                # demote the remaining candidates: either delay exact
                # duplicates by +1h or revert them to their original time
                if k == 0:
                    newdates[indices[j][m]] += np.timedelta64(
                        1, 'h')  # exact duplicate: push one hour later
                    resolution[indices[j][m]] = 1  # flag: delayed duplicate
                else:
                    newdates[indices[j][m]] = dates[indices[j]
                                                    [m]]  # revert to original time
                    resolution[indices[j][m]] = -1  # flag: reverted candidate
    #
    # recheck for remaining duplicates among standard times only
    #
    idx_std = DatetimeIndex(newdates).hour.isin(times)
    u, c = np.unique(newdates[idx_std], return_counts=True)  # check only standard times
    conflicts = u[c > 1]
    ff.message("Conflicts remain:", conflicts.size, idx_std.sum(), newdates.size)
    #
    # attach the standardized dates as the new coordinate
    #
    data = data.assign_coords({dim: newdates})
    #
    # diagnostics: delays and conflict flags
    #
    nn = (resolution > 0).sum()  # number of delayed duplicates
    nx = (~idx_std).sum()        # number of timestamps not on a standard hour
    data['delay'] = (dim, ((dates - newdates) / np.timedelta64(1, 'h')).astype(int)
                     )  # new coordinate for delays (hours, original - standardized)
    data.attrs['std_times'] = str(times)
    data['delay'].attrs['updated'] = nn
    # NOTE(review): 'delay' is an int array, so isnull() can never be True
    # here — this attribute is presumably always 0; confirm intent.
    data['delay'].attrs['missing'] = data['delay'].isnull().sum().values
    data['delay'].attrs['times'] = str(times)
    data['flag_stdtime'] = (dim, resolution.astype(int))
    data['flag_stdtime'].attrs.update({
        'units': '1',
        'standard_name': 'flag_standard_time_conflict_resolution',
        'info': '0: preferred, -1: lesser candidate, 1: duplicate, less data'
    })
    ff.message('Updated [', nn, "] No Standard [", nx, "] [", newdates.size, "]", **kwargs)
    # re-sort only if the conflict resolution broke monotonic time ordering
    if not all(data[dim].values == np.sort(data[dim].values)):
        ff.message("Sorting by", dim, **kwargs)
        data = data.sortby(dim)
    return data