def time_corr(
        date: Union[pd.Series, pd.Index, np.ndarray],
        cfg_in: Mapping[str, Any],
        sort: Union[str, bool, None] = None,
        path_save_image='time_corr'):
    """
    Correct and optionally sort time values.

    :param date: numpy np.ndarray whose elements may be datetime64 or text in ISO 8601 format
    :param cfg_in: dict with fields:
        - dt_from_utc: offset of the input time from UTC; it is subtracted to correct the time
        - fs: sampling frequency
        - sort: same as :param sort:, used only if :param sort: is None
        - keep_input_nans: NaNs in date remain unchanged
        - path: where to save images of corrected bad time
        - min_date, max_date: optional limits - time beyond them is set to constants slightly beyond the limits
    :param sort:
        - 'increase' or 'True' or True: increase duplicated time values (increases time resolution),
        - 'False' or False: do not check time inversions,
        - 'delete_inversions'
    :return: (tim, b_ok) where
        - tim: pandas time series, same size as date input
        - b_ok: mask of not decreasing elements
    Note: converts to UTC time if ``date`` is in text format, properly formatted for conversion.
    todo: use Kalman filter?
    """
    if not date.size:
        return pd.DatetimeIndex([], tz='UTC'), np.bool_([])
    if sort is None:
        sort = cfg_in.get('sort')
    if sort == 'False':
        sort = False
    elif sort == 'True' or sort == 'increase':
        sort = True
    if __debug__:
        lf.debug('time_corr (time correction) started')

    if cfg_in.get('dt_from_utc'):
        if isinstance(date[0], str):
            # add a zone suffix that compensates the time shift
            hours_from_utc_f = cfg_in['dt_from_utc'].total_seconds() / 3600
            Hours_from_UTC = int(hours_from_utc_f)
            hours_from_utc_f -= Hours_from_UTC
            if abs(hours_from_utc_f) > 0.0001:
                print('For string data can add only a whole number of hours! Adding', Hours_from_UTC, 'hours')
            tim = pd.to_datetime(
                (date.astype(object) + '{:+03d}'.format(Hours_from_UTC)).astype('datetime64[ns]'), utc=True)
        elif isinstance(date, pd.Index):
            tim = date
            tim -= cfg_in['dt_from_utc']
            tim = tim.tz_localize('UTC')
            # if Hours_from_UTC != 0:
            #     tim.tz = tzoffset(None, -Hours_from_UTC*3600)  # invert localize
            #     tim = tim.tz_localize(None).tz_localize('UTC')  # correct
        else:
            try:
                if isinstance(date, pd.Series):
                    tim = pd.to_datetime(date - np.timedelta64(cfg_in['dt_from_utc']), utc=True)
                else:
                    tim = pd.to_datetime(
                        date.astype('datetime64[ns]') - np.timedelta64(pd.Timedelta(cfg_in['dt_from_utc'])),
                        utc=True)  # hours=Hours_from_UTC
            except OverflowError:  # still needed?
                tim = pd.to_datetime(datetime_fun(
                    np.subtract, date.astype('datetime64[ns]'),  # was ``tim.values``, unbound here
                    np.timedelta64(cfg_in['dt_from_utc']), type_of_operation='<M8[ms]'
                ), utc=True)
        # tim += np.timedelta64(pd.Timedelta(hours=hours_from_utc_f))  # ?
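        # Added aside (illustration, not part of the original logic): the string branch
        # above relies on ISO 8601 zone suffixes. Conceptually, appending '+HH' makes the
        # text parse as local time, so the resulting UTC values shift by -HH hours, e.g.:
        #   pd.to_datetime(np.array(['2020-01-01T12:00'], object) + '+03', utc=True)
        #   # -> DatetimeIndex(['2020-01-01 09:00:00+00:00'], dtype='datetime64[ns, UTC]')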
        lf.info('Time constant: {} {:s}', abs(cfg_in['dt_from_utc']),
                'subtracted' if cfg_in['dt_from_utc'] > timedelta(0) else 'added')
    else:
        if not isinstance(date, (pd.Series, np.datetime64)):
            date = date.astype('datetime64[ns]')
        tim = pd.to_datetime(date, utc=True)  # .tz_localize('UTC').tz_convert(None)
        # hours_from_utc_f = 0

    cfg_min_date = cfg_in.get('min_date')
    if cfg_min_date:
        cfg_min_date = pd.Timestamp(cfg_in['min_date'], tz='UTC')

        # Skip processing if data is out of the filtering range
        global tim_min_save, tim_max_save
        tim_min = tim.min(skipna=True)
        tim_max = tim.max(skipna=True)
        # also collect min & max statistics for messages:
        tim_min_save = min(tim_min_save, tim_min)
        tim_max_save = max(tim_max_save, tim_max)

        # set time beyond limits to special values, keeping it sorted for dask, and mark out of range as good values
        if tim_max < cfg_min_date:
            tim[:] = cfg_min_date - np.timedelta64(1, 'ns')  # pd.NaT
            return tim, np.ones_like(tim, dtype=bool)
        else:
            cfg_max_date = cfg_in.get('max_date')
            if cfg_max_date:
                cfg_max_date = pd.Timestamp(cfg_in['max_date'], tz='UTC')
                if tim_min > cfg_max_date:
                    tim[:] = pd.Timestamp(cfg_in['max_date'], tz='UTC') + np.timedelta64(1, 'ns')
                    # pd.Timestamp('2262-01-01')  # ns-resolution maximum year
                    return tim, np.ones_like(tim, dtype=bool)
            b_ok_in = tim >= cfg_min_date
            if cfg_max_date:
                b_ok_in &= (tim <= cfg_max_date)
            it_se = np.flatnonzero(b_ok_in)[[0, -1]]
            it_se[1] += 1
            tim = tim[slice(*it_se)]

    b_ok_in = tim.notna()
    n_bad_in = b_ok_in.size - b_ok_in.sum()
    if n_bad_in:
        if cfg_in.get('keep_input_nans'):
            tim = tim[b_ok_in]
    try:
        b_ok_in = b_ok_in.to_numpy()
    except AttributeError:
        pass  # we already have a numpy array

    t = tim.to_numpy(np.int64)
    if sort and tim.size > 1:
        # Check time resolution and increase it if needed to avoid duplicates
        if n_bad_in and not cfg_in.get('keep_input_nans'):
            t = np.int64(rep2mean(t, bOk=b_ok_in))
            b_ok_in[:] = True
        freq, n_same, n_decrease, i_different = find_sampling_frequency(t, precision=6, b_show=False)
        if freq:
            cfg_in['fs_last'] = freq  # fallback freq, to have a value for next files on fail
        elif cfg_in.get('fs_last'):  # .get() to avoid KeyError if never set
            lf.warning('Using fallback (last) sampling frequency fs = {}', cfg_in['fs_last'])
            freq = cfg_in['fs_last']
        elif cfg_in.get('fs'):
            lf.warning('Ready to use specified sampling frequency fs = {}', cfg_in['fs'])
            freq = cfg_in['fs']
        elif cfg_in.get('fs_old_method'):
            lf.warning('Ready to use specified sampling frequency fs_old_method = {}',
                       cfg_in['fs_old_method'])
            freq = cfg_in['fs_old_method']
        else:
            lf.warning('Ready to set sampling frequency to default value: fs = 1 Hz')
            freq = 1

        # # show linearity of time
        # plt.plot(date)
        # fig, axes = plt.subplots(1, 1, figsize=(18, 12))
        # t = date.values.view(np.int64)
        # t_lin = (t - np.linspace(t[0], t[-1], len(t)))
        # axes.plot(date, t_lin / dt64_1s)
        # fig.savefig(os_path.join(cfg_in['dir'], cfg_in['file_stem'] + 'time-time_linear,s' + '.png'))
        # plt.close(fig)

        b_ok = None
        idel = None
        msg = ''
        if n_decrease > 0:
            # Exclude elements
            # if True:  # try fast method
            #     b_bad_new = True
            #     k = 10
            #     while np.any(b_bad_new):
            #         k -= 1
            #         if k > 0:
            #             b_bad_new = b1spike(t[b_ok], max_spike=2 * np.int64(dt64_1s / freq))
            #             b_ok[np.flatnonzero(b_ok)[b_bad_new]] = False
            #             print('step {}: {} spikes found, deleted {}'.format(
            #                 k, np.sum(b_bad_new), np.sum(np.logical_not(b_ok))))
            #         else:
            #             break
            #     if k > 0:  # success?
            #         t = rep2mean(t, bOk=b_ok)
            #         freq, n_same, n_decrease, b_same_prev = find_sampling_frequency(t, precision=6, b_show=False)
            #         # print(np.flatnonzero(b_bad))
            #     else:
            #         t = tim.values.view(np.int64)
            # if n_decrease > 0:  # fast method did not succeed - take time:
            #     lf.warning('Fast method did not succeed')

            # Excluding inversions
            # find increased elements (i_different is i_inc only if single spikes):
            i_inc = i_different[longest_increasing_subsequence_i(t[i_different])]
            # try trusting repeated values, keeping them to avoid interpolating near holes (else use np.zeros):
            dt = np.ediff1d(t, to_end=True)
            b_ok = dt == 0
            b_ok[i_inc] = True
            # b_ok = nondecreasing_b(t, )
            # t = t[b_ok]
            t_ok = t[b_ok]
            i_dec = np.flatnonzero(np.ediff1d(t_ok, to_end=True) < 0)
            n_decrease_remains = len(i_dec)
            if n_decrease_remains:
                lf.warning('Decreased time among duplicates ({:d} times). Not trusting repeated values...',
                           n_decrease_remains)
                b_ok = np.zeros_like(t, dtype=np.bool_)
                b_ok[i_inc] = True

                if sort == 'delete_inversions':
                    # select one of the two time values around each bad diff element and mask these elements
                    for s, e in i_dec[:, None] + np.int32([0, 1]):  # index pairs around each inversion
                        b_ok[t == (t_ok[e if b_ok[s] else s])] = False
                    if cfg_in.get('keep_input_nans'):
                        # assign through indices: chained fancy indexing on the left would write to a copy
                        b_ok_in[np.flatnonzero(b_ok_in)[~b_ok]] = False
                    else:
                        b_ok_in[~b_ok] = False
            else:  # decreased time not in duplicates
                i_dec = np.delete(i_different, np.searchsorted(i_different, i_inc))
                assert np.all(i_dec == i_different[~np.isin(i_different, i_inc)])  # same results
                # assert np.all(i_dec == np.setdiff1d(i_different, i_inc[:-1]))  # same results
                if sort == 'delete_inversions':
                    b_ok_in[np.flatnonzero(b_ok_in)[i_dec] if cfg_in.get('keep_input_nans') else i_dec] = False

            b_ok[b_ok] = np.ediff1d(t[b_ok], to_end=True) > 0  # adaptation for the next step
            idel = np.flatnonzero(~b_ok)
            n_del = len(idel)
            msg = (
                f"Filtered time: {n_del}/{t.size} values "
                f"{'masked' if sort == 'delete_inversions' else 'interpolated'} (1st and last: "
                f"{pd.to_datetime(t[idel[[0, -1]]], utc=True)})"
            )
            if n_decrease:
                lf.warning('decreased time ({}) was detected! {}', n_decrease, msg)
            else:
                lf.warning(msg)

        if n_same > 0 and cfg_in.get('fs') and not cfg_in.get('fs_old_method'):
            # This is the simplest operation that should usually be done for CTD
            t = repeated2increased(t, cfg_in['fs'], b_ok if n_decrease else None)  # if n_decrease, b_ok was calculated above
            tim = pd.to_datetime(t, utc=True)
        elif n_same > 0 or n_decrease > 0:  # message with original t
            # Replace t by linearly increasing values using a constant frequency, excluding big holes
            if cfg_in.get('fs_old_method'):
                lf.warning('Linearize time interval using provided freq = {:f} Hz (determined: {:f})',
                           cfg_in.get('fs_old_method'), freq)
                freq = cfg_in.get('fs_old_method')
            else:  # constant freq = filtered mean
                lf.warning('Linearize time interval using determined median* freq = {:f} Hz', freq)
            t = np.int64(rep2mean(t, bOk=b_ok))  # interp to be able to use as pandas index even if any bad
            b_show = n_decrease > 0
            if freq <= 1:  # Skip: typically data resolution is sufficient for this frequency
                lf.warning('Not linearizing for frequency < 1')
            else:
                # Increase time resolution by recalculating all values
                tim_before = pd.to_datetime(t, utc=True)
                make_linear(t, freq)  # changes t (and tim?)
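                # Added sketch of the assumed effect of make_linear (illustrative only):
                # with freq = 10 Hz, nanosecond stamps such as
                #   [0, 1e8, 1e8, 1e8, 4e8]   # duplicates from coarse device resolution
                # would become a constant-step ramp
                #   [0, 1e8, 2e8, 3e8, 4e8]
                # so every sample gets a unique, strictly increasing time.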
                # Check whether we can use the new values
                bbad = check_time_diff(tim_before, t.view('M8[ns]'), dt_warn=pd.Timedelta(minutes=2),
                                       mesage='Big time diff after corr: difference [min]:')
                if np.any(bbad):
                    b_ok = ~bbad
                    b_show = True

            # Show what was done
            if b_show:
                if b_ok is None:
                    dt = np.ediff1d(t, to_begin=1)
                    b_ok = dt > 0
                plot_bad_time_in_thread(cfg_in, t, b_ok, idel, tim,
                                        (tim_min, tim_max) if cfg_in.get('min_date') else None,
                                        path_save_image, msg)

        # Check that all is ok
        dt = np.ediff1d(t, to_begin=1)
        b_ok = dt > 0  # tim.is_unique, len(np.flatnonzero(tim.duplicated()))
        b_decrease = dt < 0  # the first element is set as increasing
        n_decrease = b_decrease.sum()
        if n_decrease > 0:
            lf.warning(
                'Decreased remaining time values ({:d}) are masked!{:s}{:s}',
                n_decrease,
                '\n'.join(' < '.join('{:%y.%m.%d %H:%M:%S.%f%z}'.format(_) for _ in tim[se].to_numpy())
                          for se in np.flatnonzero(b_decrease)[:3, None] + np.int32([-1, 0])),
                '...' if n_decrease > 3 else ''
            )
            b_ok &= ~b_decrease

        b_same_prev = np.ediff1d(t, to_begin=1) == 0  # the first element is set as changing
        n_same = b_same_prev.sum()

        if cfg_in.get('keep_input_nans'):
            if n_same > 0:
                lf.warning('nonincreased time ({:d} times) is detected! - interp', n_same)
        else:
            # prepare to interp all nonincreased (including NaNs)
            if n_bad_in:
                b_same_prev &= ~b_ok_in
            msg = ', '.join(
                f'{fault} time ({n} times)' for (n, fault) in ((n_same, 'nonincreased'), (n_bad_in, 'NaN'))
                if n > 0
            )
            if msg:
                lf.warning('{:s} is detected! - interp', msg)

        if n_same > 0 or n_decrease > 0:
            # rep2mean(t, bOk=np.logical_not(b_same_prev if n_decrease == 0 else (b_same_prev | b_decrease)))
            b_bad = b_same_prev if n_decrease == 0 else (b_same_prev | b_decrease)
            t = rep2mean_with_const_freq_ends(t, ~b_bad, freq)

    else:
        lf.debug('time does not need to be sorted')
        b_ok = np.ones(tim.size, np.bool_)

    # make initial shape: paste back NaNs
    if n_bad_in and cfg_in.get('keep_input_nans'):
        # place the initially bad elements back
        t, t_in = (np.nan + np.empty_like(b_ok_in)), t
        t[b_ok_in] = t_in
        b_ok_in[b_ok_in] = b_ok
        b_ok = b_ok_in
    elif sort == 'delete_inversions':
        b_ok &= b_ok_in

    # make initial shape: pad with constants of config limits where data was removed because input is beyond these limits
    if cfg_in.get('min_date') and np.any(it_se != np.int64([0, date.size])):
        pad_width = (it_se[0], date.size - it_se[1])
        t = np.pad(t, pad_width, constant_values=np.array((cfg_in['min_date'], cfg_in['max_date']), 'M8[ns]'))
        b_ok = np.pad(b_ok, pad_width, constant_values=True)
        assert t.size == b_ok.size
    return pd.to_datetime(t, utc=True), b_ok
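# Minimal usage sketch for time_corr (added illustration with hypothetical values,
# not part of the original module):
#
#   dates = np.array(['2020-01-01T12:00:00', '2020-01-01T12:00:00',
#                     '2020-01-01T12:00:01'], 'datetime64[ns]')
#   tim, b_ok = time_corr(dates, {'dt_from_utc': timedelta(hours=2), 'fs': 1}, sort=True)
#   # tim:  UTC DatetimeIndex of the input size with duplicates increased to be unique
#   # b_ok: boolean mask of elements that did not have to be masked out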
def CTDrunsExtract(P: np.ndarray, dnT: np.ndarray, cfg_extract_runs: Dict[str, Any]) -> np.ndarray:
    """
    Find profiles ("Mainas"). Uses extractRuns()
    :param P: Pressure/Depth
    :param dnT: Time
    :param cfg_extract_runs: settings dict with fields:
        - dt_between_min
        - min_dp
        - min_samples
        - dt_hole_max - split runs where dt between adjacent samples is bigger. If not specified,
          it is automatically set equal to 'dt_between_min'
        - b_do - if set to False, interpret all data as one run
        - b_keep_minmax_of_bad_files, optional - keep 1 minimum before the maximum, and the maximum,
          of separated data parts where movement is insufficient to be a run
    :return: iminmax: 2D numpy array np.int64([[minimums], [maximums]])
    """
    if ('b_do' not in cfg_extract_runs) or cfg_extract_runs['b_do']:  # skip only if b_do is set to False
        P = np.abs(rep2mean(P))
        if 'dt_hole_max' not in cfg_extract_runs:
            cfg_extract_runs['dt_hole_max'] = cfg_extract_runs['dt_between_min']
        dt64_hole_max = np.timedelta64(cfg_extract_runs['dt_hole_max'], 'ns')
        # time_holes = np.flatnonzero(np.ediff1d(dnT, dt64_hole_max, dt64_hole_max) >= dt64_hole_max)  # bug in numpy
        time_holes = np.hstack((0, np.flatnonzero(np.diff(dnT) >= dt64_hole_max), len(dnT)))
        imin = []
        imax = []
        i_keep_bad_runs = []
        for ist, ien in zip(time_holes[:-1], time_holes[1:]):
            islice = slice(ist, ien)
            if (ien - ist) < cfg_extract_runs['min_samples']:
                continue
            if (P[islice].max() - P[islice].min()) < cfg_extract_runs['min_dp']:
                if cfg_extract_runs.get('b_keep_minmax_of_bad_files'):
                    i_keep_bad_runs.append(len(imax))
                    imax.append(ist + P[islice].argmax())  # absolute index (argmax is relative to ist)
                    imin.append(ist + P[ist:imax[-1]].argmin())
            else:
                if 'path_images' in cfg_extract_runs:
                    cfg_extract_runs['path_image'] = os_path.join(
                        cfg_extract_runs['path_images'],
                        'extract_runs{:%y%m%d_%H%M%S}'.format(
                            np.datetime64(dnT[ist], 's').astype(datetime))) + '.png'
                [it, il] = extractRuns(-P[islice], cfg_extract_runs)
                # Correct extractRuns output (mins and maxs must alternate):
                # make 1st min be less than 1st max
                if it and il:
                    if il[0] < it[0]:
                        del il[0]
                # make lengths of min and max equal
                if len(it) > len(il):
                    del it[-1]
                    il.append(ien - ist - 1)
                elif len(it) < len(il):
                    if it and it[0] > il[0]:
                        del il[0]
                    else:
                        it.append(ien - ist - 1)
                imin.extend([i + ist for i in it])
                imax.extend([i + ist for i in il])

        # Filter run-down intervals:
        if len(imin):
            iminmax = np.vstack((imin, imax))
            bok = np.logical_and(
                np.diff(iminmax, 1, 0) >= cfg_extract_runs['min_samples'],
                np.diff(P[iminmax], 1, 0) >= cfg_extract_runs['min_dp']).flatten()
            bok[i_keep_bad_runs] = True
            if not np.all(bok):
                iminmax = iminmax[:, bok]
        else:
            l.warning('no runs!')
            return np.int64([[], []])
        # N = min(len(imax), len(imin))
        # iminMax = [imin, imax]
    else:
        # N = 0
        iminmax = np.int64([[0], [len(P)]])

    # # make mask with ends set to -1
    # b_maina = np.zeros(len(P), 'int8')
    # for k in range(N):
    #     b_maina[imin[k]:imax[k]] = 1
    # b_maina[imax] = -1
    # Runs.PMax = P(imax)
    return iminmax  # , b_maina
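# Hypothetical example for CTDrunsExtract on a synthetic down-up-down cast
# (added illustration; the settings values are arbitrary, not recommendations):
#
#   P = np.concatenate([np.linspace(0, 50, 300), np.linspace(50, 0, 300),
#                       np.linspace(0, 60, 360)])
#   dnT = np.arange(P.size).astype('M8[s]')
#   iminmax = CTDrunsExtract(P, dnT, {'dt_between_min': np.timedelta64(10, 's'),
#                                     'min_dp': 10, 'min_samples': 50})
#   # iminmax[0]: indices where runs start (pressure minima)
#   # iminmax[1]: indices where runs end (pressure maxima)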
            # end = ' '
        return b_bad
    else:
        return None


for col in cols_int16:
    b_bad = bad_and_message(data=df[col], fun_bad=np.isnan, msg_bad='nan values')
    b_bad2 = bad_and_message(data=df[col], fun_bad=np.isinf, msg_bad='inf values')
    if b_bad is None:
        if b_bad2 is None:
            continue
        b_bad = b_bad2
    elif b_bad2 is not None:
        b_bad |= b_bad2
    df[col] = rep2mean(df[col], np.logical_not(b_bad), df.index.astype('i8').astype('f8'))

df = df.astype(cfg['out']['dtype'], copy=False)

# @+node:korzh.20180521171338.1: ** save
def change_db_path(cfg, str_old='Pres.h5', str_new=',P(cal0605).h5'):
    if not cfg['db_path'].endswith(str_new):
        cfg['db_path'] = cfg['db_path'][:-len(str_old)] + str_new

change_db_path(cfg['out'])
log = {}
try:
    # set chunksize to the mean data interval between holes
    cfg['out']['chunksize'] = int(mean_burst_size)  # np.median(np.diff(i_burst[:-1]))
except ValueError:  # some default value if no holes