Пример #1
0
def label_data(df: pd.DataFrame, df_acts: pd.DataFrame, idle=False, n_jobs=1, inplace=True):
    """
    Attach an activity label to every row of a time-stamped data representation.

    Every timestamp in column ``TIME`` is handed to ``_map_timestamp2activity``
    and the result is written to column ``ACTIVITY`` of a copy of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Data representation that possesses a column ``TIME`` holding timestamps.
    df_acts : pd.DataFrame
        The activities of a dataset. Forwarded unchanged to
        ``_map_timestamp2activity`` as ``df_act``.
    idle : bool, optional, default=False
        Forwarded to ``_map_timestamp2activity``. If true, datapoints that do
        not fall into a logged activity are labeled as idle.
    n_jobs : int, optional, default=1
        ``1`` maps the timestamps sequentially with pandas. Any other value
        maps them with dask on the ``'processes'`` scheduler, using ``n_jobs``
        partitions; ``-1`` and values above ``get_npartitions()`` are replaced
        by ``get_npartitions()``.
    inplace : bool, optional, default=True
        Not read by this function: the input is always copied and the labeled
        copy is returned.

    Examples
    --------
    >>> raw
      time                    0   ...      13
      2008-03-20 00:34:38  False  ...    True
      2008-03-20 00:34:39  False  ...   False

    >>> label_data(raw, data.df_activities, idle=True, n_jobs=10)
      time                    0   ...      13 activity
      2008-03-20 00:34:38  False  ...    True idle
      2008-03-20 00:34:39  False  ...   False act1

    Returns
    -------
    df : pd.DataFrame
        A copy of ``df`` with the additional (or overwritten) column ``ACTIVITY``.
    """
    labeled = df.copy()
    # Initialise the label column; every value is overwritten below.
    labeled[ACTIVITY] = -1

    def _label_timestamps(timestamps):
        # Map one series (or one dask partition) of timestamps to activities.
        return timestamps.apply(
            _map_timestamp2activity,
            df_act=df_acts,
            idle=idle)

    if n_jobs == 1:
        labeled[ACTIVITY] = _label_timestamps(labeled[TIME])
        return labeled

    # Clamp the partition count to what the configuration allows.
    max_partitions = get_npartitions()
    if n_jobs == -1 or n_jobs > max_partitions:
        n_jobs = max_partitions

    partitioned = dd.from_pandas(labeled[TIME], npartitions=n_jobs)
    labeled[ACTIVITY] = partitioned\
        .map_partitions(_label_timestamps)\
        .compute(scheduler='processes')
    return labeled
Пример #2
0
def label_data(df_devices: pd.DataFrame, df_activities: pd.DataFrame, idle=False):
    """
    Append to a device representation the activity belonging to each timestamp.

    Parameters
    ----------
    df_devices : pd.DataFrame
        Any frame that carries its timestamps either in a column named ``TIME``
        or in an index named ``TIME``, for example the raw format:

                                    0   ...      13
            Time                        ...
            2008-03-20 00:34:38  False  ...    True
            2008-03-20 00:34:39  False  ...   False

    df_activities : pd.DataFrame
        The activities of a dataset. Forwarded unchanged to
        ``_map_timestamp2activity`` as ``df_act``.
    idle : bool
        Forwarded to ``_map_timestamp2activity``. If true, datapoints that do
        not fall into a logged activity are labeled as idle.

    Returns
    -------
    pd.DataFrame
        A copy of ``df_devices`` with a fresh integer index, the timestamps
        available as a column and an appended label column ``ACTIVITY``:

            Time                    0   ...      13 activity
            2008-03-20 00:34:38  False  ...    True idle
            2008-03-20 00:34:39  False  ...   False act1
    """
    labeled = df_devices.copy()

    # Seed the label column with the timestamps and make sure the time ends up
    # as a regular column instead of the index.
    time_is_index = labeled.index.name == TIME
    if time_is_index:
        labeled[ACTIVITY] = labeled.index
        labeled = labeled.reset_index()
    else:
        labeled[ACTIVITY] = labeled[TIME].copy()
        labeled = labeled.reset_index(drop=True)

    def _label_timestamps(timestamps):
        # Map one series (or one dask partition) of timestamps to activities.
        return timestamps.apply(
            _map_timestamp2activity,
            df_act=df_activities,
            idle=idle)

    if not get_parallel():
        labeled[ACTIVITY] = _label_timestamps(labeled[ACTIVITY])
        return labeled

    # Parallel path: split the timestamps into partitions and label them with
    # dask on the process scheduler.
    partitioned = dd.from_pandas(labeled[ACTIVITY], npartitions=get_npartitions())
    labeled[ACTIVITY] = partitioned\
        .map_partitions(_label_timestamps)\
        .compute(scheduler='processes')
    return labeled
Пример #3
0
def contingency_intervals(df_devs, df_acts, idle=False):
    """
    Compute the time a device is "on" or "off" respectively
    during the different activities.

    Parameters
    ----------
    df_devs : pd.DataFrame
        All recorded devices from a dataset. For more information refer to
        :ref:`user guide<device_dataframe>`.
    df_acts : pd.DataFrame
        All recorded activities from a dataset. For more information refer to the
        :ref:`user guide<activity_dataframe>`.
    idle : bool
        Intended to determine whether gaps between activities are assigned
        the activity *idle* or are ignored. NOTE: the current implementation
        does not read this parameter.

    Examples
    --------
    >>> contingency_intervals(data.df_devices, data.df_activities)
    activity                     get drink ...             use toilet
    Hall-Bedroom door Off  0 days 00:01:54 ... 0 days 00:12:24.990000
    Hall-Bedroom door On   0 days 00:14:48 ... 0 days 03:02:49.984000
    ...                                ...
    Washingmachine On      0 days 00:00:00 ...        0 days 00:00:00
    [14 rows x 7 columns]

    Returns
    -------
    df : pd.DataFrame
        One row per "<device> Off" / "<device> On" label and one column per
        activity, holding the durations summed over all occurrences of the
        activity.
    """
    TD = 'time_difference_to_succ'
    zero = pd.Timedelta('0ns')

    def append_row(frame, record):
        """ returns `frame` with the named series `record` added as one more row

        Replacement for ``DataFrame.append(Series)``, which was removed in
        pandas 2.0; mirrors what that method did internally.
        """
        return pd.concat([frame, record.to_frame().T.infer_objects()])

    def func(row, raw, dev_lst):
        """ determines for one activity row the total time that each device
        spent in the on and in the off state while the activity was ongoing

        Parameters
        ----------
        row : pd.Series
            a row of the activity dataframe containing the start and end time
            of one activity
        raw : pd.DataFrame
            the 'ON'/'OFF' state vectors with a time column and the column TD
        dev_lst : iterable of str
            the device names, i.e. the state columns of `raw`
        """
        # select the state vectors that were recorded during the activity
        act_start_time = row.start_time
        act_end_time = row.end_time
        raw_sel = raw[(act_start_time <= raw[TIME])
                      & (raw[TIME] <= act_end_time)].copy()

        if raw_sel.empty:
            # the case when no device activation fell into the recorded activity timeframe
            return pd.Series(index=row.index, name=row.name, dtype=row.dtype)

        td_col = raw_sel.columns.get_loc(TD)

        # The durations of the first and the last state vector have to be
        # clipped to the activity boundaries:
        #     s0 ---------I --activity --sn--------I
        #     | ~~~tds~~~ |              | ~~tde~~ |
        #    rs          as             re        ae

        # try to get the preceding state vector of devices before the activity starts
        idx_first = raw_sel.index[0] - 1
        if idx_first == -1:
            # Edge case: the activity starts before the first recording. This
            # isn't solvable, so as a heuristic the first recorded state is
            # assumed to have been valid since the start of the activity.
            raw_sel = append_row(
                raw_sel, raw_sel.iloc[0].copy()).sort_values(by=[TIME])
            raw_sel.iat[0, td_col] = raw_sel.iloc[0][TIME] - act_start_time
        else:
            raw_sel = append_row(
                raw_sel, raw.iloc[idx_first]).sort_values(by=[TIME])
            # only count the part of the preceding interval that lies inside
            # the activity
            t_diff_start = act_start_time - raw_sel.iloc[0][TIME]
            raw_sel.iat[0, td_col] -= t_diff_start

        # The last state vector lasts until the activity ends. The write is
        # positional because the duplicated row of the edge case above shares
        # its index label with the original row; a label-based write would
        # overwrite both when only one state vector was selected.
        last_pos = len(raw_sel) - 1
        raw_sel.iat[last_pos, td_col] = act_end_time - raw_sel.iloc[last_pos][TIME]

        for dev in dev_lst:
            ser = raw_sel.groupby(by=[dev])[TD].sum()
            # a state is missing from `ser` when the device was on/off the whole time
            row.at[dev + " On"] = ser.get('ON', zero)
            row.at[dev + " Off"] = ser.get('OFF', zero)
        return row

    dev_lst = df_devs[DEVICE].unique()
    df_devs = df_devs.sort_values(by=TIME)
    raw = create_raw(df_devs).applymap(lambda x: 'ON'
                                       if x else 'OFF').reset_index(drop=False)
    # duration for which every state vector is valid
    raw[TD] = raw[TIME].shift(-1) - raw[TIME]

    # one empty "<device> Off" and "<device> On" column per device
    new_cols = [dev + suffix for dev in dev_lst for suffix in (' Off', ' On')]
    df_acts = df_acts.copy().join(
        pd.DataFrame(index=df_acts.index, columns=new_cols))

    # TODO a dask-parallel variant of this apply was present but never
    # reachable because it did not work
    df = df_acts.apply(func, args=[raw, dev_lst], axis=1)
    df = df.drop(columns=[START_TIME, END_TIME])
    df = df.groupby(ACTIVITY).sum()
    return df.T
Пример #4
0
def duration_correlation(df_devs, lst_devs=None):
    """
    Compute the similarity between devices by comparing the binary values
    for every interval.

    Parameters
    ----------
    df_devs : pd.DataFrame
        All recorded devices from a dataset. For more information refer to
        :ref:`user guide<device_dataframe>`. If ``contains_non_binary``
        reports non-binary devices, only the first frame returned by
        ``split_devices_binary`` is used.
    lst_devs: list of str, optional
        A list of devices that are included in the statistic. Devices of the
        list that are missing from the result are added as a row and a column
        filled with ``pd.NA``.

    Examples
    --------
    >>> duration_correlation(data.df_devs)
    device              Cups cupboard  Dishwasher  ...  Washingmachine
    device                                         ...
    Cups cupboard            1.000000    0.997571  ...        0.999083
    Dishwasher               0.997571    1.000000  ...        0.996842
    ...
    Washingmachine           0.999083    0.996842  ...        1.000000
    [14 rows x 14 columns]

    Returns
    -------
    df : pd.DataFrame
        A dataframe of every device against another device. The values range from -1 to 1
        where higher values represent more similarity.
    """
    TD = 'td'

    if contains_non_binary(df_devs):
        df_devs, _ = split_devices_binary(df_devs)

    def func(row):
        """ replaces every device state of a state vector by the vector of its
        products with all device states, weighted by the interval duration
        """
        try:
            td = row.td.to_timedelta64()
        except AttributeError:
            # presumably the last state vector, whose duration is NaT because
            # it has no successor; it contributes nothing
            return None
        # the row layout is [time, device states..., td]
        states = row.iloc[1:len(row) - 1].values.astype(int)

        for j, state in enumerate(states):
            # +td where both devices agree, -td where they differ
            row.iloc[1 + j] = state * states * td
        return row

    df_devs = df_devs.sort_values(by=TIME)

    # make off to -1 and on to 1 and then calculate cross correlation between signals
    raw = create_raw(df_devs).applymap(lambda x: 1 if x else -1).reset_index()
    # duration for which every state vector is valid
    raw[TD] = raw[TIME].shift(-1) - raw[TIME]

    df = dd.from_pandas(raw.copy(), npartitions=get_npartitions())\
                .apply(func, axis=1).drop(columns=[TIME, TD]).sum(axis=0)\
                .compute(scheduler='processes')

    # every entry of `df` is one row of the device x device matrix
    res = pd.DataFrame(data=np.vstack(df.values),
                       columns=df.index,
                       index=df.index)
    # normalize by the first device's product with itself; with states of
    # +-1 this is the summed duration of all intervals
    res = res / res.iloc[0, 0]

    if lst_devs is not None:
        for dev in set(lst_devs).difference(res.index):
            res[dev] = pd.NA
            # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
            res = pd.concat(
                [res, pd.DataFrame(data=pd.NA, columns=res.columns, index=[dev])])
    return res