Exemplo n.º 1
0
 def fit(self, df_devices, y=None):
     """
     Fit the estimator by computing the encoded device representation.

     Parameters
     ----------
     df_devices : pd.DataFrame
         Device event data to be discretized.
     y : None
         Ignored. This parameter exists only for compatibility with
         :class:`~sklearn.pipeline.Pipeline`.

     Returns
     -------
     self

     Raises
     ------
     ValueError
         If ``self.encode`` is not one of the supported encodings.
     """
     if self.encode == ENC_RAW:
         self.data = create_raw(
             df_devices,
             t_res=self.t_res,
             sample_strat=self.sample_strat
         )
     elif self.encode == ENC_CP:
         self.data = create_changepoint(
             df_devices,
             t_res=self.t_res
         )
     elif self.encode == ENC_LF:
         self.data = create_lastfired(
             df_devices,
             t_res=self.t_res
         )
     else:
         # Fail loudly with the offending value instead of a bare ValueError.
         raise ValueError(f"Unknown encoding '{self.encode}'.")
     return self
Exemplo n.º 2
0
 def transform(self, df_devices=None):
     """
     Discretize the device event data.

     Parameters
     ----------
     df_devices : pd.DataFrame, optional
         Device event data to be discretized.

     Returns
     -------
     pd.DataFrame
         Data in the encoded space.

     Raises
     ------
     ValueError
         If ``self.encode`` is not one of 'raw', 'changepoint' or
         'lastfired'.
     """
     if self.encode == 'raw':
         return create_raw(
             df_devices,
             t_res=self.t_res,
             sample_strat=self.sample_strat
         )
     elif self.encode == 'changepoint':
         return create_changepoint(
             df_devices,
             t_res=self.t_res
         )
     elif self.encode == 'lastfired':
         return create_lastfired(
             df_devices,
             t_res=self.t_res
         )
     else:
         # Previously an unrecognized encoding fell through and silently
         # returned None; raise explicitly instead.
         raise ValueError(f"Unknown encoding '{self.encode}'.")
Exemplo n.º 3
0
    def transform(self, df_devs=None, y=None):
        """
        Discretize the device events into one or several binary encodings.

        ``self.encode`` may name a single encoding or several joined with
        '+', e.g. ``'raw+changepoint'``; each encoding is computed
        separately and the frames are concatenated column-wise on the
        time index.

        Parameters
        ----------
        df_devs : pd.DataFrame, optional
            Device event data to be discretized.
        y : None
            Ignored. Exists only for sklearn pipeline compatibility.

        Returns
        -------
        pd.DataFrame
            Encoded data with a time column followed by one column per
            device; columns get a 'cp_'/'lf_' prefix when multiple
            encodings are combined, to keep names unique.

        Raises
        ------
        ValueError
            If a component of ``self.encode`` is not a known encoding.
        """
        # Prefixes used to disambiguate column names when several
        # encodings are concatenated side by side.
        PRAEFIX_LF = 'lf_'
        PRAEFIX_CP = 'cp_'

        df_lst = []
        iters = self.encode.split('+')
        for enc in iters:
            if enc == ENC_RAW:
                data = create_raw(df_devs)
                if self.t_res is not None:
                    # NOTE(review): resampling relies on
                    # self.dev_most_likely_values_ — presumably computed
                    # during fit; confirm it is set before transform.
                    data = resample_raw(
                        data,
                        df_dev=df_devs,
                        t_res=self.t_res,
                        most_likely_values=self.dev_most_likely_values_)

            elif enc == ENC_CP:
                data = create_changepoint(df_devs)
                if self.t_res is not None:
                    data = resample_changepoint(data, self.t_res)

                # add prefix to make column names unique
                if len(iters) > 1:
                    data.columns = [TIME] + list(
                        map(PRAEFIX_CP.__add__, data.columns[1:]))

            elif enc == ENC_LF:
                data = create_lastfired(df_devs)
                if self.t_res is not None:
                    data = resample_last_fired(data, self.t_res)

                # add prefix to make column names unique
                if len(iters) > 1:
                    data.columns = [TIME] + list(
                        map(PRAEFIX_LF.__add__, data.columns[1:]))

            else:
                raise ValueError
            # Align every encoding on the time index so they can be
            # joined column-wise below.
            data = data.set_index(TIME)
            df_lst.append(data)

        # Column-wise concatenation of all requested encodings.
        data = pd.concat(df_lst, axis=1).reset_index()
        return data
Exemplo n.º 4
0
def create_lagged_raw(df_dev,
                      window_size=10,
                      t_res=None,
                      sample_strat='ffill'):
    """Create a 3D tensor of sliding windows over the raw representation.

    Parameters
    ----------
    df_dev : pd.DataFrame
        Device event dataframe.
    window_size : int, default=10
        Number of consecutive raw state vectors per window.
    t_res : str, optional
        Temporal resolution passed through to :func:`create_raw`.
    sample_strat : str, default='ffill'
        Resampling strategy passed through to :func:`create_raw`.

    Returns
    -------
    np.ndarray
        3D array of windowed raw vectors, shape
        (K - window_size, window_size, n_devices) — as produced by
        ``_image_from_reps``.
    """
    raw_matrix = create_raw(
        df_dev, t_res=t_res, sample_strat=sample_strat
    ).values
    return _image_from_reps(raw_matrix, window_size)
Exemplo n.º 5
0
def contingency_intervals(df_devs, df_acts, idle=False):
    """
    Compute the time a device is "on" or "off" respectively
    during the different activities.

    Parameters
    ----------
    df_devs : pd.DataFrame
        All recorded devices from a dataset. For more information refer to
        :ref:`user guide<device_dataframe>`.
    df_acts : pd.DataFrame
        All recorded activities from a dataset. For more information refer to the
        :ref:`user guide<activity_dataframe>`.
    idle : bool
        Determines whether gaps between activities should be assigned
        the activity *idle* or be ignored.

    Examples
    --------
    >>> from pyadlml.stats import contingency_duration
    >>> contingency_duration(data.df_devices, data.df_activities)
    activity                     get drink ...             use toilet
    Hall-Bedroom door Off  0 days 00:01:54 ... 0 days 00:12:24.990000
    Hall-Bedroom door On   0 days 00:14:48 ... 0 days 03:02:49.984000
    ...                                ...
    Washingmachine On      0 days 00:00:00 ...        0 days 00:00:00
    [14 rows x 7 columns]

    Returns
    -------
    df : pd.DataFrame
        One "<device> On"/"<device> Off" row per device, one column per
        activity, holding the accumulated timedeltas.
    """
    TD = 'time_difference_to_succ'

    def func(row, raw, dev_lst):
        """ determines for each activity row the total time that was spent in either on or off state for each device
        Parameters
        ----------
        row : pd.Series
            a row of the activity dataframe containing the start and end time for one activity
        """
        # get selection of relevant devices
        act_start_time = row.start_time
        act_end_time = row.end_time
        raw_sel = raw[(act_start_time <= raw[TIME])
                      & (raw[TIME] <= act_end_time)].copy()

        if raw_sel.empty:
            # the case when no device activation fell into the recorded activity timeframe
            return pd.Series(index=row.index, name=row.name, dtype=row.dtype)

        # determine end and start time and correct for the intervals before/after
        # the first/last state vector s0,sn
        #     s0 ---------I --activity --sn--------I
        #     | ~~~tds~~~ |              | ~~tde~~ |
        #    rs          as             re        ae

        # try to get the preceding state vector of devices before the activity starts
        idx_first = raw_sel.index[0] - 1
        if idx_first == -1:
            # edge case when the first activity starts before the first recording
            # this case isn't solvable. So a heuristic that doesn't skew the statistic
            # too much is to assume the same state at the start of the activity.
            # DataFrame.append was removed in pandas 2.0 -> use pd.concat.
            raw_sel = pd.concat(
                [raw_sel, raw_sel.iloc[[0]]]).sort_values(by=[TIME])
            raw_sel.iat[0, raw_sel.columns.
                        get_loc(TD)] = raw_sel.iloc[0].time - act_start_time
        else:
            # prepend the state vector that was active when the activity began
            raw_sel = pd.concat(
                [raw_sel, raw.iloc[[idx_first]]]).sort_values(by=[TIME])
            raw_start = raw_sel.iloc[0]
            t_diff_start = act_start_time - raw_start.time
            raw_sel.at[raw_sel.iloc[0].name, TD] -= t_diff_start

        # set time difference for last state vector until activity ends
        raw_sel.at[raw_sel.iloc[-1].name,
                   TD] = act_end_time - raw_sel.iloc[-1].time

        for dev in dev_lst:
            ser = raw_sel.groupby(by=[dev])[TD].sum()
            # the tries are for the cases when a device is on/off the whole time
            try:
                dev_on_time = ser.ON
            except AttributeError:
                dev_on_time = pd.Timedelta('0ns')
            try:
                dev_off_time = ser.OFF
            except AttributeError:
                dev_off_time = pd.Timedelta('0ns')

            row.at[ser.index.name + " On"] = dev_on_time
            row.at[ser.index.name + " Off"] = dev_off_time
        return row

    def create_meta(raw):
        # dtype meta for the dask code path below
        devices = {name: 'object' for name in raw.columns[1:-1]}
        return {**{TIME: 'datetime64[ns]', 'td': 'timedelta64[ns]'}, **devices}

    dev_lst = df_devs[DEVICE].unique()
    df_devs = df_devs.sort_values(by=TIME)
    raw = create_raw(df_devs).applymap(lambda x: 'ON'
                                       if x else 'OFF').reset_index(drop=False)
    # duration each state vector persists until the next device event
    raw[TD] = raw[TIME].shift(-1) - raw[TIME]

    # one " Off" and one " On" accumulator column per device
    new_cols = [col for dev in dev_lst for col in (dev + ' Off', dev + ' On')]

    df_acts = df_acts.copy().join(
        pd.DataFrame(index=df_acts.index, columns=new_cols))
    if True:  # TODO parallel is not working
        #if not get_parallel():
        df = df_acts.apply(func, args=[raw, dev_lst], axis=1)
        df = df.drop(columns=[START_TIME, END_TIME])
        df = df.groupby(ACTIVITY).sum()
        return df.T
    else:
        df = dd.from_pandas(df_acts.copy(), npartitions=get_npartitions())\
                .apply(func, args=[raw, dev_lst], axis=1)\
                .drop(columns=[START_TIME, END_TIME])\
                .groupby(ACTIVITY).sum()\
                .compute(scheduler='processes')
        return df.T
Exemplo n.º 6
0
 def create_raw(self, t_res=None, idle=False):
     # Build and cache the raw device representation on the instance.
     # NOTE(review): `idle` is accepted but never forwarded to create_raw —
     # confirm whether it should be passed through or removed.
     self.df_raw = create_raw(self.df_devices, self.df_activities, t_res)
Exemplo n.º 7
0
def duration_correlation(df_devs, lst_devs=None):
    """
    Compute the similarity between devices by comparing the binary values
    for every interval.

    Parameters
    ----------
    df_devs : pd.DataFrame
        All recorded devices from a dataset. For more information refer to
        :ref:`user guide<device_dataframe>`.
    lst_devs: list of str, optional
        A list of devices that are included in the statistic. The list can be a
        subset of the recorded devices or contain devices that are not recorded.

    Examples
    --------
    >>> from pyadlml.stats import device_duration_corr
    >>> device_duration_corr(data.df_devs)
    device              Cups cupboard  Dishwasher  ...  Washingmachine
    device                                         ...
    Cups cupboard            1.000000    0.997571  ...        0.999083
    Dishwasher               0.997571    1.000000  ...        0.996842
    ...
    Washingmachine           0.999083    0.996842  ...        1.000000
    [14 rows x 14 columns]

    Returns
    -------
    df : pd.DataFrame
        A dataframe of every device against another device. The values range from -1 to 1
        where higher values represent more similarity.
    """
    TD = 'td'

    # only binary devices can be correlated this way
    if contains_non_binary(df_devs):
        df_devs, _ = split_devices_binary(df_devs)

    def func(row):
        """Turn one +/-1 state-vector row into per-device duration-weighted
        correlation contributions."""
        try:
            td = row.td.to_timedelta64()
        except Exception:
            # row without a valid duration (e.g. last state vector has no
            # successor) contributes nothing
            return None
        states = row.iloc[1:len(row) - 1].values.astype(int)
        K = len(states)

        for j in range(K):
            tdiffs = states[j] * states * td
            row.iloc[1 + j] = tdiffs
        return row

    def create_meta(raw):
        # dtype meta for the dask code path
        devices = {name: 'object' for name in raw.columns[1:-1]}
        return {**{TIME: 'datetime64[ns]', TD: 'timedelta64[ns]'}, **devices}

    dev_lst = df_devs[DEVICE].unique()
    df_devs = df_devs.sort_values(by=TIME)

    # make off to -1 and on to 1 and then calculate cross correlation between signals
    raw = create_raw(df_devs).applymap(lambda x: 1 if x else -1).reset_index()
    raw[TD] = raw[TIME].shift(-1) - raw[TIME]

    df = dd.from_pandas(raw.copy(), npartitions=get_npartitions())\
                .apply(func, axis=1).drop(columns=[TIME, TD]).sum(axis=0)\
                .compute(scheduler='processes')

    res = pd.DataFrame(data=np.vstack(df.values),
                       columns=df.index,
                       index=df.index)
    # normalize by the total observed duration so the diagonal becomes 1
    res = res / res.iloc[0, 0]

    if lst_devs is not None:
        # add NA rows/columns for requested devices that were never recorded
        for dev in set(lst_devs).difference(set(list(res.index))):
            res[dev] = pd.NA
            # DataFrame.append was removed in pandas 2.0 -> use pd.concat
            res = pd.concat([
                res,
                pd.DataFrame(data=pd.NA, columns=res.columns, index=[dev])
            ])
    return res