def check_spikes(data, chunk_size='2min', detrend=True, detrend_kw=None,
                 visualize=False, vis_col=1, max_consec_spikes=3,
                 cut_func=lambda x: (abs(x - x.mean()) > 5.*x.std()),
                 replace_with='interpolation', max_percent=1.):
    """
    Applies spikes-check according to Vickers and Mahrt (1997)

    The data is (optionally) detrended, split into chunks, and in every chunk
    the points flagged by `cut_func` are turned into NaNs and then replaced.
    The chunks are finally concatenated and the trend is added back.

    Parameters
    ----------
    data: pandas.DataFrame
        data to de-spike
    chunk_size: str, int
        size of chunks to consider. If str should be pandas offset string.
        If int, number of lines.
    detrend: bool
        whether to detrend the data and work with the fluctuations or to work
        with the absolute series.
    detrend_kw: dict, optional
        dict of keywords to pass to pymicra.trend in order to detrend data
        (if detrend==True). Defaults to {'how': 'linear'}.
    visualize: bool
        whether or not to visualize the interpolation occurring
    vis_col: str, int or list
        the column(s) to visualize when seeing the interpolation
        (only effective if visualize==True)
    max_consec_spikes: int
        maximum number of consecutive spikes to actually be considered spikes
        and substituted
    cut_func: function
        function used to define spikes
    replace_with: str
        method to use when replacing spikes. Options are 'interpolation'
        or 'trend'.
    max_percent: float
        maximum percentage of spikes to allow.

    Returns
    -------
    fou: pandas.DataFrame
        de-spiked data (with the trend added back if detrend==True)
    valid: pandas.Series
        True for the columns whose number of faulty points is strictly below
        int(len(data)*max_percent/100.), False otherwise
    fault_count: pandas.Series
        number of faulty points found in each column

    Raises
    ------
    ValueError
        if `replace_with` is not one of 'interpolation' or 'trend'
    """
    import pandas as pd
    # Explicit relative imports: a bare `import signal` would pick up the
    # standard-library module (which has no `trend`) under absolute-import
    # semantics.
    from . import algs
    from . import signal as pmdata

    if detrend_kw is None:
        detrend_kw = {'how': 'linear'}

    #------------
    # Choose how the NaNs left by the spike detection are filled.
    # Fail early on an unknown option instead of hitting an unbound name
    # inside the loop.
    if replace_with == 'trend':
        def replace_nans(dframe):
            trend = pmdata.trend(dframe, how='linear')
            return dframe.fillna(trend)
    elif replace_with == 'interpolation':
        def replace_nans(dframe):
            return dframe.interpolate(method='index', limit_direction='both')
    else:
        raise ValueError("replace_with must be 'interpolation' or 'trend', "
                         "got {!r}".format(replace_with))
    #------------

    original = data.copy()

    #------------
    # If detrend == True we save the trend for later and work with the
    # detrended data
    if detrend:
        origtrend = pmdata.trend(data, **detrend_kw)
        detrended = original - origtrend
        dfs = algs.splitData(detrended, rule=chunk_size)
    else:
        dfs = algs.splitData(original, rule=chunk_size)
    #------------

    max_count = int(len(original)*max_percent/100.)

    # Start from the total number of rows and subtract the non-NaN points of
    # every chunk; what is left is the number of points turned into NaN.
    fault_count = pd.Series(len(original), index=original.columns)

    cleaned = []
    for piece in dfs:
        chunk = piece.copy()

        #-------------------------------
        # This substitutes the spikes to NaNs so it can be replaced later
        if len(chunk) > max_consec_spikes:
            chunk = algs.limitedSubs(chunk, max_interp=max_consec_spikes,
                                     func=cut_func)
        # Done for every chunk (also the short ones left untouched above),
        # otherwise their points would all be reported as faults.
        fault_count = fault_count - chunk.count()
        #-------------------------------

        #-------------------------------
        # Substitution of spikes happens here
        cleaned.append(replace_nans(chunk))
        #-------------------------------

    #---------------------
    # Now we put the chunks back together and maybe correct the trend
    despiked = pd.concat(cleaned)
    if detrend:
        fou = despiked + origtrend
    else:
        fou = despiked
    valid = fault_count < max_count
    #---------------------

    #---------------------
    # Visualize what you're doing to see if it's correct
    if visualize:
        import matplotlib.pyplot as plt
        print('Plotting de-spiking...')
        original[vis_col].plot(style='g-', label='original')
        fou[vis_col].plot(style='b-', label='final')
        plt.title('Column: {}'.format(vis_col))
        plt.legend()
        plt.show()
        plt.close()
    #---------------------

    return fou, valid, fault_count
def replace_nans(dframe):
    """
    Fill the missing values of `dframe` with its linear trend.

    The trend is obtained from pmdata.trend(dframe, how='linear') and handed
    to dframe.fillna, whose result is returned.
    """
    return dframe.fillna(pmdata.trend(dframe, how='linear'))
def check_limits(data, tables, max_percent=1., replace_with='interpolation'):
    """
    Checks dataframe for lower and upper limits. Points outside the limits
    are set to NaN and then substituted, either by interpolation or by the
    linear trend of the run. The number of faulty points is also checked for
    each column against the maximum percentage of accepted faults max_percent

    Parameters
    ----------
    data: pandas.DataFrame
        dataframe to be checked (it is copied, not modified)
    tables: pandas.DataFrame
        dataframe with the lower and upper limits for variables. The limits
        are read from the rows labelled 'lower_limits' and 'upper_limits';
        a missing row simply skips that check.
    max_percent: float
        number from 0 to 100 that represents the maximum percentage of faulty
        points accepted by this test.
    replace_with: str
        method to use when replacing faulty points. Options are
        'interpolation' or 'trend'. With any other value the faulty points
        are left as NaN.

    Returns
    -------
    df: pandas.DataFrame
        input data but with the faulty points substituted
    valid: pandas.Series
        True for the columns whose number of faulty points is strictly below
        int(len(data)*max_percent/100.), False for the columns that didn't
        pass.
    fault_count: pandas.Series
        number of faulty points (below lower plus above upper limit) found
        in each column
    """
    import numpy as np
    import pandas as pd

    df = data.copy()
    max_count = int(len(df)*max_percent/100.)
    low_count = pd.Series(0, index=tables.columns)
    upp_count = pd.Series(0, index=tables.columns)

    #-----------
    # First we check the lower values
    if 'lower_limits' in tables.index.values:
        faulty = df < tables.loc['lower_limits']
        low_count = df[faulty].count()
        df[faulty] = np.nan
    #-------------------------------

    #-------------------------------
    # Now we check the upper values. Points already removed by the lower
    # check are NaN here, so they cannot be counted twice.
    if 'upper_limits' in tables.index.values:
        faulty = df > tables.loc['upper_limits']
        upp_count = df[faulty].count()
        df[faulty] = np.nan
    #-------------------------------

    fault_count = low_count + upp_count
    valid = fault_count < max_count

    #------------
    # Replace data with either its trend or by interpolating
    if replace_with == 'trend':
        # Imported only on the branch that needs it, under the alias this
        # function has always used for the package-level `trend`.
        from . import trend as pmtrend
        df = df.fillna(pmtrend(df, how='linear'))
    elif replace_with == 'interpolation':
        df = df.interpolate(method='index', limit_direction='both')
    #------------

    return df, valid, fault_count