def mask_knans(df, x):
    """Build a boolean mask that is False inside long runs of NaN.

    We specify a maximum number of consecutive NaN that we allow to be
    interpolated; longer runs are masked out.

    Example usage::

        df["col"] = df["col"].interpolate(method='linear').where(
            mask_knans(df["col"], int(15*60/300)))

    :param df: column of a pandas dataframe (anything ``np.asarray`` accepts)
    :param x: maximum tolerated number of consecutive NaN values
    :returns: boolean array with one entry per element; False for every
        element that belongs to a run of more than ``x`` consecutive NaN,
        True everywhere else
    """
    values = np.asarray(df)
    size = values.size
    # A run is rejected as soon as it is one element longer than the tolerance.
    run = x + 1
    keep = np.ones(size, dtype=bool)
    if size < run:
        # Too short to contain a run of ``run`` NaN: nothing can be masked.
        return keep
    is_nan = np.isnan(values).ravel()
    step = is_nan.strides[0]
    # Row i of ``windows`` is is_nan[i:i + run]; the view is read-only because
    # its rows overlap in memory.
    windows = strided(is_nan, (size + 1 - run, run), (step, step),
                      writeable=False)
    starts = np.flatnonzero(windows.all(1))
    # Expand every all-NaN window start into the indices the window covers.
    covered = starts[:, None] + np.arange(run)
    keep[covered.ravel()] = False
    return keep
def rolling_apply(df, func, period, **kwargs):
    '''
    Apply a function over a moving window of rows.

    Parameter
    ---------
    df: DataFrame
        The DataFrame on which the rolling-window computation is performed.
    func: function(window, **kwargs) -> value
        Called once per window with a read-only np.array of shape
        (period, number of columns); it must return a single scalar value.
    period: int
        Window length in rows. If df has fewer rows than the window length,
        every returned value is np.nan; positions that do not yet have a
        full window of data are np.nan as well.
    kwargs: additional parameters
        Extra keyword arguments forwarded to func.

    Return
    ------
    out: Series
        The rolling results, padded with NaN at the first period - 1
        positions, carrying the same index as df.

    Raises
    ------
    ValueError
        If period is smaller than 1.
    '''
    if period < 1:
        raise ValueError('period must be a positive integer')
    values = df.values
    n_rows, n_cols = values.shape
    if n_rows < period:
        # Not a single full window fits: honour the documented all-NaN result
        # instead of requesting a negative dimension from as_strided.
        return pd.Series(np.full((n_rows, ), np.nan), index=df.index)
    row_stride, col_stride = values.strides
    # windows[i] is values[i:i + period]; stepping by one row stride along the
    # first two axes makes consecutive windows overlap without copying.
    windows = strided(values,
                      shape=(n_rows - period + 1, period, n_cols),
                      strides=(row_stride, row_stride, col_stride),
                      writeable=False)
    results = np.array([func(window, **kwargs) for window in windows])
    padding = np.full((period - 1, ), np.nan)
    return pd.Series(np.concatenate((padding, results)), index=df.index)
def cal_rolling_correlation(a, w):
    """Rolling Pearson correlation between the first two columns of ``a``.

    :param a: 2-D array-like with one observation per row and at least two
        columns; only columns 0 and 1 are used
    :param w: window length in rows
    :returns: 1-D array of length ``rows - w + 1``; entry ``k`` is the
        correlation of column 0 with column 1 over rows ``k .. k + w - 1``.
        A window in which either column is constant divides by zero.
    :raises ValueError: if ``a`` is not 2-D, has fewer than two columns, or
        has fewer than ``w`` rows
    """
    a = np.asarray(a)
    # The strided view below reads two columns unconditionally; without this
    # check a narrower array would be read past the end of its buffer.
    if a.ndim != 2 or a.shape[1] < 2:
        raise ValueError("a must be 2-D with at least two columns")
    n_rows = a.shape[0]
    if w > n_rows:
        raise ValueError("window length w exceeds the number of rows")
    row_stride, col_stride = a.strides
    # windows[c, j, k] == a[k + j, c]: axis 0 picks the column, axis 1 walks
    # inside a window, axis 2 enumerates the window start.
    windows = strided(a, (2, w, n_rows - w + 1),
                      (col_stride, row_stride, row_stride),
                      writeable=False)
    centered = windows - windows.mean(1, keepdims=True)
    norms = (centered ** 2).sum(1) ** .5
    return (centered[0] * centered[1]).sum(0) / (norms[0] * norms[1])
def rolling_correlation(a, w):
    """Rolling Pearson correlation between the first two columns of ``a``.

    NOTE: ``strided`` is expected to be
    ``numpy.lib.stride_tricks.as_strided`` imported at module level.

    :param a: 2-D array-like with one observation per row and at least two
        columns; only columns 0 and 1 are used
    :param w: window length in rows
    :returns: 1-D array of length ``rows - w + 1``; entry ``k`` is the
        correlation of column 0 with column 1 over rows ``k .. k + w - 1``.
        A window in which either column is constant divides by zero.
    :raises ValueError: if ``a`` is not 2-D, has fewer than two columns, or
        has fewer than ``w`` rows
    """
    a = np.asarray(a)
    # Two columns are read unconditionally through the strided view, so a
    # narrower array must be rejected before it is read out of bounds.
    if a.ndim != 2 or a.shape[1] < 2:
        raise ValueError("a must be 2-D with at least two columns")
    n_rows = a.shape[0]
    if w > n_rows:
        raise ValueError("window length w exceeds the number of rows")
    row_stride, col_stride = a.strides
    # windows[c, j, k] == a[k + j, c]: column, offset inside the window,
    # window start.
    windows = strided(a, (2, w, n_rows - w + 1),
                      (col_stride, row_stride, row_stride),
                      writeable=False)
    centered = windows - windows.mean(1, keepdims=True)
    norms = (centered ** 2).sum(1) ** .5
    return (centered[0] * centered[1]).sum(0) / (norms[0] * norms[1])
def get_sliding_window(df, W, return2D=0):
    """Return every run of ``W`` consecutive rows of ``df`` as a window.

    :param df: pandas DataFrame to take the windows from
    :param W: window length in rows
    :param return2D: pass 1 to flatten each window into a single row
    :returns: array of shape ``(rows - W + 1, W, columns)`` where entry ``i``
        holds rows ``i .. i + W - 1`` of ``df.values``; with ``return2D == 1``
        the shape is ``(rows - W + 1, W * columns)``. The 3-D result is a
        read-only view of ``df.values``, not a copy.
    :raises ValueError: if ``W`` is larger than the number of rows
    """
    values = df.values
    n_rows, n_cols = values.shape
    if W > n_rows:
        raise ValueError("window length W exceeds the number of rows")
    row_stride, col_stride = values.strides
    n_windows = n_rows - W + 1
    # Consecutive windows overlap in memory, so the view is made read-only:
    # writing through it would modify several windows and the frame's data.
    windows = strided(values,
                      shape=(n_windows, W, n_cols),
                      strides=(row_stride, row_stride, col_stride),
                      writeable=False)
    if return2D == 1:
        return windows.reshape(n_windows, -1)
    return windows
def mask_knans(a, x):
    """Build a boolean mask that is False inside runs of ``x`` or more NaN.

    Meant to be applied after interpolation, so that values interpolated
    across gaps at or over the threshold are masked out again.

    :param a: 1-D array-like of numbers (anything ``np.asarray`` accepts)
    :param x: run length at which consecutive NaN values start being masked
    :returns: boolean array with one entry per element; False for every
        element that belongs to a run of at least ``x`` consecutive NaN,
        True everywhere else
    """
    values = np.asarray(a)
    size = values.size
    keep = np.ones(size, dtype=bool)
    if size < x:
        # Too short to contain a run of ``x`` NaN: nothing can be masked.
        return keep
    is_nan = np.isnan(values).ravel()
    step = is_nan.strides[0]
    # Row i of ``windows`` is is_nan[i:i + x]; the view is read-only because
    # its rows overlap in memory.
    windows = strided(is_nan, (size + 1 - x, x), (step, step),
                      writeable=False)
    starts = np.flatnonzero(windows.all(1))
    # Expand every all-NaN window start into the indices the window covers.
    covered = starts[:, None] + np.arange(x)
    keep[covered.ravel()] = False
    return keep
def get_sliding_windows(dataFrame, windowSize, return_flatten=False):
    '''
    Return every run of windowSize consecutive rows of a DataFrame.

    Arguments:
        dataFrame {Pandas DataFrame} -- dataframe to be strided
        windowSize {Int} -- sliding window size, in rows
        return_flatten {bool} -- whether to flatten each window into one row

    Returns:
        output {Numpy ndarray} -- strided data of shape
            (rows - windowSize + 1, windowSize, columns), or
            (rows - windowSize + 1, windowSize * columns) when flattened.
            The 3-D result is a read-only view of dataFrame.values.

    Raises:
        ValueError -- if windowSize is larger than the number of rows
    '''
    # Take the array once, so that the strides and the memory they index are
    # guaranteed to belong to the same object.
    values = dataFrame.values
    rows, columns = values.shape
    if windowSize > rows:
        raise ValueError("windowSize exceeds the number of rows")
    stride_row, stride_col = values.strides
    n_windows = rows - windowSize + 1
    # Consecutive windows overlap in memory, so the view is made read-only.
    output = strided(values,
                     shape=(n_windows, windowSize, columns),
                     strides=(stride_row, stride_row, stride_col),
                     writeable=False)
    if return_flatten:
        return output.reshape(n_windows, -1)
    return output