Пример #1
0
def devices_td_on(df_devs):
    """
    Compute, for each datapoint, how long the device stayed in the "on" state.

    Parameters
    ----------
    df_devs : pd.DataFrame
        All recorded devices from a dataset. For more information refer to
        :ref:`user guide<device_dataframe>`.

    Returns
    -------
    df : pd.DataFrame
        A dataframe with two columns: the device and ``'td'``, the end time
        of the datapoint minus its start time.

    Examples
    --------
    >>> from pyadlml.stats import device_on_time
    >>> device_on_time(data.df_devs)
                      device              td
    0      Hall-Bedroom door 0 days 00:02:43
    1      Hall-Bedroom door 0 days 00:00:01
    ...                  ...             ...
    1309           Frontdoor 0 days 00:00:01
    [1310 rows x 2 columns]
    """
    td_col = 'td'

    # Operate on a private copy: the duration column added below must never
    # show up in the dataframe owned by the caller.
    intervals = df_devs.copy()

    if contains_non_binary(intervals):
        # Only the first element of the returned pair is used.
        intervals, _ = split_devices_binary(intervals)

    if not _is_dev_rep2(intervals):
        # Bring the frame into the representation that carries a start and
        # an end time per row; again only the first returned element is kept.
        intervals, _ = device_rep1_2_rep2(intervals.copy(), drop=False)

    intervals[td_col] = intervals[END_TIME] - intervals[START_TIME]
    return intervals[[DEVICE, td_col]]
Пример #2
0
def devices_on_off_stats(df_devs, lst_devs=None):
    """
    Calculate the time and proportion a device was in the "on"
    versus the "off" state.

    Parameters
    ----------
    df_devs : pd.DataFrame
        A datasets device dataframe. The columns are ['time', 'device', 'val'].
    lst_devs : list, optional
        An optional list of all device names. Use this if there exist devices
        that are not present in the recorded dataset but should be included in
        the statistic. Such devices get a row whose durations are ``NaT`` and
        whose fractions are ``NaN``.

    Examples
    --------
    >>> from pyadlml.stats import device_on_off
    >>> device_on_off(data.df_devs)
                    device                  td_on                  td_off   frac_on  frac_off
    0        Cups cupboard 0 days 00:10:13.010000 27 days 18:34:19.990000  0.000255  0.999745
    1           Dishwasher        0 days 00:55:02        27 days 17:49:31  0.001376  0.998624
    ...                ...                    ...                     ...        ...      ...
    13      Washingmachine        0 days 00:08:08        27 days 18:36:25  0.000203  0.999797

    Returns
    -------
    df : pd.DataFrame
        One row per device, sorted by device, with the columns device,
        ``td_on``, ``td_off``, ``frac_on`` and ``frac_off``.
    """
    td_on = 'td_on'
    td_off = 'td_off'
    frac_on = 'frac_on'
    frac_off = 'frac_off'

    if contains_non_binary(df_devs):
        df_devs, _ = split_devices_binary(df_devs)

    if not _is_dev_rep2(df_devs):
        df_devs, _ = device_rep1_2_rep2(df_devs.copy(), drop=False)

    # Total observed interval used for normalization. The earliest start and
    # the latest end are taken over *all* rows and addressed by column name:
    # the row that starts last is not necessarily the one that ends last, so
    # reading the end time of the last-started row could yield an interval
    # shorter than a single device's "on" time.
    norm = df_devs[END_TIME].max() - df_devs[START_TIME].min()
    norm_seconds = norm.total_seconds()

    # Summed "on" duration per device; the group keys become the index.
    durations = df_devs[END_TIME] - df_devs[START_TIME]
    stats = durations.groupby(df_devs[DEVICE]).sum().to_frame(td_on)

    stats[td_off] = norm - stats[td_on]

    # Fractions of the total interval spent on / off.
    stats[frac_on] = stats[td_on].dt.total_seconds() / norm_seconds
    stats[frac_off] = stats[td_off].dt.total_seconds() / norm_seconds

    if lst_devs is not None:
        missing = sorted(set(lst_devs).difference(stats.index))
        if missing:
            # A single reindex adds all placeholder rows at once and keeps
            # the column dtypes (DataFrame.append no longer exists in
            # pandas >= 2.0).
            stats = stats.reindex(
                pd.Index(list(stats.index) + missing, name=DEVICE))

    return stats.rename_axis(DEVICE)\
        .reset_index()\
        .sort_values(by=[DEVICE])
Пример #3
0
def duration_correlation(df_devs, lst_devs=None):
    """
    Compute the similarity between devices by comparing the binary values
    for every interval.

    Parameters
    ----------
    df_devs : pd.DataFrame
        All recorded devices from a dataset. For more information refer to
        :ref:`user guide<device_dataframe>`.
    lst_devs: list of str, optional
        Devices that should additionally appear in the statistic. Every name
        that is not among the recorded devices is added as a row and a column
        filled with ``NaN``. Recorded devices are always kept.

    Examples
    --------
    >>> from pyadlml.stats import device_duration_corr
    >>> device_duration_corr(data.df_devs)
    device              Cups cupboard  Dishwasher  ...  Washingmachine
    device                                         ...
    Cups cupboard            1.000000    0.997571  ...        0.999083
    Dishwasher               0.997571    1.000000  ...        0.996842
    ...
    Washingmachine           0.999083    0.996842  ...        1.000000
    [14 rows x 14 columns]

    Returns
    -------
    df : pd.DataFrame
        A dataframe of every device against another device. The values range from -1 to 1
        where higher values represent more similarity.
    """
    TD = 'td'

    if contains_non_binary(df_devs):
        df_devs, _ = split_devices_binary(df_devs)

    def func(row):
        """ Replace each device entry of a row by the vector of its state
        multiplied with every device state of that row and the row duration.
        """
        try:
            td = row.td.to_timedelta64()
        except Exception:
            # Rows whose duration cannot be converted contribute nothing.
            # NOTE(review): this is expected at least for the last row, whose
            # duration is NaT because it has no successor to diff against.
            return None
        # Everything between the first and the last entry of the row is
        # treated as a device state (+1 / -1).
        states = row.iloc[1:len(row) - 1].values.astype(int)

        for j, state in enumerate(states):
            row.iloc[1 + j] = state * states * td
        return row

    df_devs = df_devs.sort_values(by=TIME)

    # make off to -1 and on to 1 and then calculate cross correlation between signals
    raw = create_raw(df_devs).applymap(lambda x: 1 if x else -1).reset_index()
    # duration for which a row's states hold: time until the next row
    raw[TD] = raw[TIME].shift(-1) - raw[TIME]

    df = dd.from_pandas(raw.copy(), npartitions=get_npartitions())\
                .apply(func, axis=1).drop(columns=[TIME, TD]).sum(axis=0)\
                .compute(scheduler='processes')

    # every entry of df is one row of the device x device matrix
    res = pd.DataFrame(data=np.vstack(df.values),
                       columns=df.index,
                       index=df.index)
    # Normalize by a device's value against itself: a state of +1 / -1 times
    # itself is always 1, so this entry is the summed duration of all rows.
    res = res / res.iloc[0, 0]

    if lst_devs is not None:
        missing = sorted(set(lst_devs).difference(res.index))
        if missing:
            # One reindex over both axes adds the placeholder rows and columns
            # (DataFrame.append no longer exists in pandas >= 2.0).
            res = res.reindex(
                index=pd.Index(list(res.index) + missing,
                               name=res.index.name),
                columns=pd.Index(list(res.columns) + missing,
                                 name=res.columns.name))
    return res