Пример #1
0
def _apply_raw(df):
    """Expand device events into a table of per-device states over time.

    Input layout:
        | Start time    | End time  | device_name
        ------------------------------------------
        | ts1           | ts2       | name1

    Returned layout:
        | time  | dev_1 | ....  | dev_n |
        --------------------------------
        | ts1   |   1   | ....  |  0    |

    The input is first converted with ``device_rep1_2_rep3``. Every row of
    the converted frame produces one output row: the previous row's states
    are carried forward and only the column of the device named in the
    current event is overwritten with that event's ``val``.
    """
    events = device_rep1_2_rep3(df)
    devices = events[DEVICE].unique()

    # result frame built by _create_devices from the device names, using the
    # TIME column of the converted events as index
    states = _create_devices(devices, index=events[TIME])

    # seed row: everything zero except the device of the very first event,
    # which is set to 1 regardless of that event's ``val``
    states.iloc[0] = np.zeros(len(devices))
    first_col = np.where(devices == events.iloc[0].device)[0][0]
    states.iloc[0, first_col] = 1

    # remaining rows: copy the predecessor, then apply the current event
    for pos in range(1, len(events)):
        event = events.iloc[pos]
        states.iloc[pos] = states.iloc[pos - 1].values
        col = np.where(devices == event.device)[0][0]
        states.iloc[pos, col] = int(event.val)

    return states
Пример #2
0
def duration_correlation(df):
    """Compute a duration-weighted cross-correlation between device states.

    The events are swept in time order. Between two consecutive events each
    device is either on (+1) or off (-1); for every pair of devices the
    length of that interval is added when both share the same state and
    subtracted when they differ. The accumulated durations are finally
    divided by the entry at position [0, 0].

    Parameters
    ----------
    df : pd.DataFrame
        device representation 1

    Returns
    -------
    pd.DataFrame (k x k)
        cross-correlation between each pair of devices, indexed and
        labelled by device name
    """
    events = device_rep1_2_rep3(df)

    devices = events['device'].unique()
    events = events.sort_values(by='time')

    n_dev = len(devices)
    durations = np.full((n_dev, n_dev), 0, dtype='timedelta64[ns]')

    # every device starts as off (-1); the device of the first event is
    # switched on (+1)
    signs = np.full(n_dev, -1)
    first = events.iloc[0]
    last_time = first.time
    signs[np.where(devices == first.device)[0][0]] = 1

    # sweep through all remaining events
    for _, event in events.iloc[1:].iterrows():
        now = event.time
        elapsed = (now - last_time).to_timedelta64()

        # entry (j, k) of the sign product is +1 when devices j and k were in
        # the same state during the elapsed interval and -1 otherwise, so the
        # interval length is added or subtracted accordingly
        durations = durations + signs[:, None] * signs * elapsed

        # apply the state change carried by the current event
        idx = np.where(devices == event.device)[0][0]
        signs[idx] = 1 if event.val else -1
        last_time = now

    # a device always agrees with itself, so the diagonal holds the full
    # time frame; normalise by it
    normalised = durations / durations[0, 0]
    return pd.DataFrame(data=normalised, index=devices, columns=devices)
Пример #3
0
def create_raw(df_devices, t_res=None, sample_strat='ffill', idle=False):
    """Build the raw representation of the devices, optionally resampled.

    Parameters
    ----------
    df_devices : pd.DataFrame
        device dataframe; a copy is taken before it is processed
    t_res : optional
        resolution handed to ``_resample_df``; when None the result of
        ``_apply_raw`` is returned without resampling
    sample_strat : str
        forwarded unchanged to ``_resample_df``
    idle : bool
        accepted but not used by this function

    Returns
    -------
    the output of ``_apply_raw``, passed through ``_resample_df`` when
    ``t_res`` is given
    """
    devices = df_devices.copy()
    raw = _apply_raw(devices)
    devices_rep3 = device_rep1_2_rep3(devices)

    if t_res is None:
        return raw

    return _resample_df(raw, t_res, dev=devices_rep3,
                        sample_strat=sample_strat)
Пример #4
0
def create_changepoint(df_devices, t_res=None, idle=False):
    """Build the changepoint representation of the devices.

    Parameters
    ----------
    df_devices : pd.DataFrame
        device dataframe; a copy is converted with ``device_rep1_2_rep3``
    t_res : optional
        resampling rule; when None the output of ``_apply_changepoint`` is
        returned as is
    idle : bool
        accepted but not used by this function

    Returns
    -------
    the output of ``_apply_changepoint``; when ``t_res`` is given it is
    resampled and every bin is reduced with ``_cp_evaluator``
    """
    dev = device_rep1_2_rep3(df_devices.copy())
    changepoints = _apply_changepoint(dev)

    if t_res is None:
        return changepoints

    # each resampled bin is evaluated against the converted device frame
    return changepoints.resample(t_res, kind='timestamp').apply(
        _cp_evaluator, dev=dev)
Пример #5
0
def create_lastfired(df_devices, t_res=None):
    """Build the last-fired representation of the devices.

    Parameters
    ----------
    df_devices : pd.DataFrame
        device dataframe; a copy is converted with ``device_rep1_2_rep3``
    t_res : optional
        resampling rule; when None the output of ``_apply_changepoint`` is
        returned as is

    Returns
    -------
    the output of ``_apply_changepoint``; when ``t_res`` is given it is
    resampled, every bin is reduced with ``_lf_evaluator`` and remaining
    gaps are forward-filled
    """
    dev = device_rep1_2_rep3(df_devices.copy())
    lf = _apply_changepoint(dev)

    if t_res is not None:
        resampler = lf.resample(t_res, kind='timestamp')
        # the evaluator receives a copy of the un-resampled frame
        lf = resampler.apply(_lf_evaluator, df=lf.copy())
        # ``ffill()`` is the direct equivalent of ``fillna(method='ffill')``
        # and does not depend on the deprecated ``method`` argument
        lf = lf.ffill()

    return lf
Пример #6
0
def device_tcorr(df, t_windows=None):
    """ computes for every time window the prevalence of device triggers
        for each device

    For every event the events whose time offset lies strictly within
    +/- the window around it are counted per device (the event itself
    included), and the counts are added to the row of the event's device.

    Parameters
    ----------
    df : pd.DataFrame
        device representation 1
    t_windows : list, optional
        time frames or a single window (string), converted with
        ``timestr_2_timedeltas``; defaults to ``['20s']``

    Returns
    -------
    lst : list of panda dataframes
        one device x device table per time window
    """
    # None sentinel instead of a mutable list as default argument
    if t_windows is None:
        t_windows = ['20s']
    t_windows = timestr_2_timedeltas(t_windows)

    df = device_rep1_2_rep3(df)

    # create timediff to the previous trigger
    df['time_diff'] = df['time'].diff()

    dev_list = df.device.unique()

    # the first event has no predecessor; locate the column by name rather
    # than relying on it sitting at a fixed position
    df.iloc[0, df.columns.get_loc('time_diff')] = pd.Timedelta(0, 's')
    df['cum_sum'] = df['time_diff'].cumsum()

    offsets = df['cum_sum']
    devices = df['device']

    lst = []
    for t_window in t_windows:
        # cross table filled with zeros; object dtype is kept so the tables
        # have the same dtype as an empty frame filled in afterwards
        res_df = pd.DataFrame(0, index=dev_list, columns=dev_list,
                              dtype=object)

        # these iterations are independent of each other
        for td, dev_name in zip(offsets, devices):
            in_window = (td - t_window < offsets) & (offsets < td + t_window)
            res_df.loc[dev_name] += in_window.groupby(devices).sum()
        lst.append(res_df)
    return lst
Пример #7
0
    def _create_index(self, df_devices, t_res):
        """
        Build a dummy dataframe that only provides the resampled time index.

        A copy of the devices is converted with ``device_rep1_2_rep3`` and
        pivoted to one column per device; only the first column is kept, it
        is resampled with ``t_res`` and its values are replaced by 1.

        Returned layout:
        index | val
        """
        events = device_rep1_2_rep3(df_devices.copy())
        first_device = events.pivot(
            index='time', columns='device', values='val').iloc[:, :1]

        # bool just to have a lower memory footprint while resampling
        first_device = first_device.astype(bool)

        # resample with frequency; the sums are discarded below, only the
        # resulting index matters
        df_index = first_device.resample(t_res, kind='timestamp').sum()
        df_index.columns = ['val']
        df_index['val'] = 1
        return df_index
Пример #8
0
def device_triggers_one_day(df, t_res='1h'):
    """
    Aggregate the device events per time-of-day bin and device.

    params: df: pd.DataFrame
                device dataframe, converted with ``device_rep1_2_rep3``
            t_res: resolution handed to ``time2int``; documented by the
                original author as [0,24]h or [0,60]m for a resolution in
                hours or minutes
    returns: df
            index: the values ``time2int`` produces from the event times
            columns: devices
            values: per bin and device the sum over the remaining columns
                    of the converted events, 0 where a device has no events
    """
    events = device_rep1_2_rep3(df)

    # replace every timestamp by its bin
    events['time'] = events['time'].apply(time2int, args=[t_res])

    # sum per (bin, device) and spread the devices over the columns
    table = events.groupby(['time', 'device']).sum().unstack().fillna(0)

    # drop the outer column level introduced by unstack
    table.columns = table.columns.droplevel(0)
    return table