示例#1
0
def mask_knans(df, x):
    """
    Build a boolean mask that rejects runs of more than ``x`` consecutive NaNs.

    Interpolation fills every gap regardless of its length; applying this mask
    to the interpolated column (via ``.where``) discards the values that were
    filled inside gaps longer than the tolerance.

    Example usage::

         df["col"] = df["col"].interpolate(method='linear').where(mask_knans(df["col"], int(15*60/300)))

    :param df: column of pandas dataframe (any array-like accepted by
        ``np.isnan``; it is processed as a flat sequence)
    :param x: maximum tolerance of NaN values, i.e. the longest run of
        consecutive NaNs that may still be interpolated

    :returns: boolean numpy array with one entry per element; ``False`` marks
        elements that belong to a run of more than ``x`` consecutive NaNs,
        ``True`` marks everything else
    """
    values = np.asarray(df)
    size = values.size
    # A run is rejected as soon as it contains x + 1 consecutive NaNs.
    window = x + 1

    is_nan = np.ravel(np.isnan(values))
    # Plain ``bool`` instead of ``np.bool8``: that alias was removed in NumPy 2.0.
    keep = np.ones(size, dtype=bool)

    n_windows = size + 1 - window
    if n_windows <= 0:
        # The input is shorter than the smallest rejectable run, so nothing
        # can be masked (and as_strided would reject a negative dimension).
        return keep

    # Overlapping windows of length ``window`` over the NaN flags. Every window
    # lies fully inside ``is_nan``, so no padding of the buffer is required.
    step = is_nan.strides[0]
    all_nan = strided(is_nan, (n_windows, window), (step, step)).all(1)

    # Expand each all-NaN window start into the indices it covers. Duplicates
    # are harmless for a boolean fancy-index assignment.
    starts = np.flatnonzero(all_nan)
    covered = (starts[:, None] + np.arange(window)).ravel()
    keep[covered] = False

    return keep
示例#2
0
def rolling_apply(df, func, period, **kwargs):
    '''
    Apply a function over a moving window of rows.

    Parameter
    ---------
    df: DataFrame
        DataFrame to run the rolling-window computation on.
    func: function(window, **kwargs) -> value
        Called once per window with a 2-D np.array of shape
        (period, number of columns); must return a single scalar.
    period: int
        Window length in rows. Positions that do not yet have a full window
        yield np.nan; if df has fewer rows than period, every value of the
        result is np.nan.
    kwargs: additional parameters
        Extra keyword arguments forwarded to func.

    Return
    ------
    out: Series
        Rolling result, padded with NaN where data is insufficient, carrying
        the same index as df.

    Raises
    ------
    ValueError
        If period is smaller than 1.
    '''
    if period < 1:
        raise ValueError("period must be a positive integer, got %r" % (period,))

    values = df.values
    n_rows, n_cols = values.shape
    if period > n_rows:
        # Not a single complete window exists: honour the documented contract
        # instead of handing a negative dimension to as_strided.
        return pd.Series(np.full((n_rows, ), np.nan), index=df.index)

    row_stride, col_stride = values.strides
    # Overlapping read-only views: window i covers rows i .. i + period - 1.
    windows = strided(values,
                      shape=(n_rows - period + 1, period, n_cols),
                      strides=(row_stride, row_stride, col_stride),
                      writeable=False)
    out = np.array([func(window, **kwargs) for window in windows])
    # The first period - 1 rows have no complete window behind them.
    out = pd.Series(np.concatenate((np.full((period - 1, ), np.nan), out)),
                    index=df.index)
    return out
示例#3
0
def cal_rolling_correlation(a, w):
    """Rolling Pearson correlation between the first two columns of ``a``.

    :param a: 2-D numpy array; rows are observations, columns 0 and 1 are the
        two series being correlated
    :param w: window length in rows

    :returns: 1-D array of length ``a.shape[0] - w + 1``; entry ``i`` is the
        correlation computed over rows ``i .. i + w - 1``
    """
    n_rows = a.shape[0]
    row_stride, col_stride = a.strides
    n_windows = n_rows - w + 1

    # windows[c, j, i] aliases a[i + j, c]: axis 0 selects the column,
    # axis 1 walks inside a window, axis 2 enumerates the windows.
    windows = strided(a, (2, w, n_windows), (col_stride, row_stride, row_stride))

    # Deviation of every sample from the mean of its own window.
    centered = windows - windows.mean(1, keepdims=True)
    # Root of the summed squared deviations, per column and per window.
    norms = (centered**2).sum(1)**.5

    cross = (centered[0] * centered[1]).sum(0)
    return cross / (norms[0] * norms[1])
示例#4
0
def rolling_correlation(a, w):
    """Correlate column 0 with column 1 of ``a`` over a sliding row window.

    ``strided`` is expected to be ``numpy.lib.stride_tricks.as_strided``.

    :param a: 2-D numpy array whose first two columns hold the two series
    :param w: number of consecutive rows in each window

    :returns: 1-D array with one Pearson correlation per window position
        (``a.shape[0] - w + 1`` values)
    """
    step_row, step_col = a.strides
    positions = a.shape[0] - w + 1

    # Shape (2, w, positions): stacked[c, j, i] refers to a[i + j, c].
    stacked = strided(a, (2, w, positions), (step_col, step_row, step_row))

    deviations = stacked - stacked.mean(axis=1, keepdims=True)
    magnitudes = (deviations**2).sum(axis=1)**.5

    first, second = deviations[0], deviations[1]
    numerator = (first * second).sum(axis=0)
    denominator = magnitudes[0] * magnitudes[1]
    return numerator / denominator
示例#5
0
def get_sliding_window(df, W, return2D=0):
    """Return every run of ``W`` consecutive rows of ``df`` as a numpy array.

    :param df: pandas DataFrame whose ``.values`` is a 2-D array
    :param W: window length in rows
    :param return2D: when equal to 1, each window is flattened into a single
        row; any other value keeps the windows 3-D

    :returns: array of shape ``(rows - W + 1, W, columns)``, built with
        ``strided`` over ``df.values``, or of shape
        ``(rows - W + 1, W * columns)`` when ``return2D == 1``
    """
    values = df.values
    row_step, col_step = values.strides
    n_rows, n_cols = values.shape
    n_windows = n_rows - W + 1

    # Stepping axis 0 and axis 1 by one row each makes window i start at row i.
    windows = strided(values,
                      shape=(n_windows, W, n_cols),
                      strides=(row_step, row_step, col_step))

    if return2D != 1:
        return windows
    return windows.reshape(n_windows, -1)
def mask_knans(a, x):
    """Build a mask that rejects runs of ``x`` or more consecutive NaNs.

    The function itself does not interpolate. It is meant to be combined with
    an interpolated copy of the data (e.g. ``s.interpolate().where(mask)``) so
    that values filled inside over-long gaps are discarded again.

    :param a: array-like accepted by ``np.isnan``; processed as a flat sequence
    :param x: length of a consecutive-NaN run from which the run is rejected

    :returns: boolean numpy array with one entry per element; ``False`` marks
        elements inside a run of at least ``x`` consecutive NaNs, ``True``
        marks everything else
    """
    values = np.asarray(a)
    size = values.size

    is_nan = np.ravel(np.isnan(values))
    # Plain ``bool`` instead of ``np.bool8``: that alias was removed in NumPy 2.0.
    keep = np.ones(size, dtype=bool)

    n_windows = size + 1 - x
    if n_windows <= 0:
        # The input is shorter than the rejected run length, so nothing can be
        # masked (and as_strided would reject a negative dimension).
        return keep

    # Overlapping windows of length ``x`` over the NaN flags. Every window lies
    # fully inside ``is_nan``, so no padding of the buffer is required.
    step = is_nan.strides[0]
    all_nan = strided(is_nan, (n_windows, x), (step, step)).all(1)

    # Expand each all-NaN window start into the indices it covers. Duplicates
    # are harmless for a boolean fancy-index assignment.
    starts = np.flatnonzero(all_nan)
    covered = (starts[:, None] + np.arange(x)).ravel()
    keep[covered] = False
    return keep
示例#7
0
def get_sliding_windows(dataFrame, windowSize, return_flatten=False):
    '''
        Build every run of ``windowSize`` consecutive rows of a DataFrame.

        Arguments:
            dataFrame {Pandas DataFrame} -- dataframe to be strided
            windowSize {Int} -- sliding window size (number of rows per window)
            return_flatten {bool} -- whether to flatten each window into one row

        Returns:
            output {Numpy ndarray} -- strided data of shape
                (rows - windowSize + 1, windowSize, columns), or
                (rows - windowSize + 1, windowSize * columns) when
                return_flatten is true
    '''
    # Take the ndarray once so that the strides, the shape and the buffer
    # handed to as_strided all describe the same object; strides read from
    # one array must not be applied to a separately converted one.
    values = dataFrame.values
    stride_row, stride_col = values.strides
    rows, columns = values.shape
    n_windows = rows - windowSize + 1

    # Advancing axis 0 and axis 1 by one row each makes window i start at row i.
    output = strided(values, shape=(n_windows, windowSize, columns),
                     strides=(stride_row, stride_row, stride_col))
    if return_flatten:
        return output.reshape(n_windows, -1)
    return output