def fit(self, df_devices, y=None):
    """
    Fit the estimator by building the encoded device representation.

    Parameters
    ----------
    df_devices : pd.DataFrame
        Device data to be discretized.
    y : None
        Ignored. This parameter exists only for compatibility with
        :class:`~sklearn.pipeline.Pipeline`.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``self.encode`` is not one of the supported encodings.
    """
    if self.encode == ENC_RAW:
        self.data = create_raw(df_devices, t_res=self.t_res,
                               sample_strat=self.sample_strat)
    elif self.encode == ENC_CP:
        self.data = create_changepoint(df_devices, t_res=self.t_res)
    elif self.encode == ENC_LF:
        self.data = create_lastfired(df_devices, t_res=self.t_res)
    else:
        # Fail loudly with the offending value instead of a bare ValueError.
        raise ValueError(
            f"Unknown encoding '{self.encode}'. Expected one of: "
            f"'{ENC_RAW}', '{ENC_CP}', '{ENC_LF}'."
        )
    return self
def transform(self, df_devices=None):
    """
    Discretize the data using the encoding selected at construction.

    Parameters
    ----------
    df_devices : pd.DataFrame
        Device data to be discretized.

    Returns
    -------
    Xt : pd.DataFrame
        Data in the binned space for the configured encoding.

    Raises
    ------
    ValueError
        If ``self.encode`` is not one of the supported encodings.
    """
    # Use the shared ENC_* constants instead of string literals so this
    # method cannot drift out of sync with fit().
    if self.encode == ENC_RAW:
        return create_raw(df_devices, t_res=self.t_res,
                          sample_strat=self.sample_strat)
    elif self.encode == ENC_CP:
        return create_changepoint(df_devices, t_res=self.t_res)
    elif self.encode == ENC_LF:
        return create_lastfired(df_devices, t_res=self.t_res)
    else:
        # Previously an unknown encoding silently returned None;
        # raise instead, consistent with fit().
        raise ValueError(
            f"Unknown encoding '{self.encode}'. Expected one of: "
            f"'{ENC_RAW}', '{ENC_CP}', '{ENC_LF}'."
        )
def transform(self, df_devs=None, y=None):
    """
    Discretize the data with one encoding or a '+'-separated combination
    of encodings (e.g. ``'raw+changepoint'``).

    Parameters
    ----------
    df_devs : pd.DataFrame
        Device data to be discretized.
    y : None
        Ignored; present for pipeline compatibility.

    Returns
    -------
    data : pd.DataFrame
        One dataframe with the time column first, followed by the columns
        of every requested encoding concatenated side by side.
    """
    # Column-name prefixes used to keep columns unique when several
    # encodings are concatenated into one dataframe.
    PRAEFIX_LF = 'lf_'
    PRAEFIX_CP = 'cp_'
    df_lst = []
    # Each '+'-separated token selects one encoding to compute.
    iters = self.encode.split('+')
    for enc in iters:
        if enc == ENC_RAW:
            data = create_raw(df_devs)
            if self.t_res is not None:
                data = resample_raw(
                    data,
                    df_dev=df_devs,
                    t_res=self.t_res,
                    most_likely_values=self.dev_most_likely_values_)
            # NOTE(review): raw columns get no prefix; presumably the raw
            # device names serve as the base columns — confirm no collision
            # with other encodings' unprefixed names.
        elif enc == ENC_CP:
            data = create_changepoint(df_devs)
            if self.t_res is not None:
                data = resample_changepoint(data, self.t_res)
            # add prefix to make column names unique
            if len(iters) > 1:
                data.columns = [TIME] + list(
                    map(PRAEFIX_CP.__add__, data.columns[1:]))
        elif enc == ENC_LF:
            data = create_lastfired(df_devs)
            if self.t_res is not None:
                data = resample_last_fired(data, self.t_res)
            # add prefix to make column names unique
            if len(iters) > 1:
                data.columns = [TIME] + list(
                    map(PRAEFIX_LF.__add__, data.columns[1:]))
        else:
            # NOTE(review): bare ValueError carries no message; consider
            # including the offending encoding name.
            raise ValueError
        # Index on time so the encodings align row-wise in the concat below.
        data = data.set_index(TIME)
        df_lst.append(data)
    # Concatenate all encodings column-wise and restore TIME as a column.
    data = pd.concat(df_lst, axis=1).reset_index()
    return data
def create_lagged_raw(df_dev, window_size=10, t_res=None, sample_strat='ffill'):
    """
    Create a 3D tensor of sliding windows over the raw representation.

    Parameters
    ----------
    df_dev : pd.DataFrame
        Device dataframe the raw representation is built from.
    window_size : int, default=10
        Number of consecutive raw vectors stacked into one window
        (the "image" height).
    t_res : str, optional
        Temporal resolution the raw representation is binned at
        (passed through to ``create_raw``); ``None`` keeps the
        original event timestamps.
    sample_strat : str, default='ffill'
        Sampling strategy used by ``create_raw`` when binning.

    Returns
    -------
    res : np.ndarray
        3D array of shape (K - window_size, window_size, n_devices),
        where K is the number of raw vectors.
    """
    raw = create_raw(df_dev, t_res=t_res, sample_strat=sample_strat)
    return _image_from_reps(raw.values, window_size)
def contingency_intervals(df_devs, df_acts, idle=False):
    """
    Compute the time a device is "on" or "off" respectively during the
    different activities.

    Parameters
    ----------
    df_devs : pd.DataFrame
        All recorded devices from a dataset. For more information refer to
        :ref:`user guide<device_dataframe>`.
    df_acts : pd.DataFrame
        All recorded activities from a dataset. Fore more information refer to the
        :ref:`user guide<activity_dataframe>`.
    idle : bool
        Determines whether gaps between activities should be assigned
        the activity *idle* or be ignored.

    Examples
    --------
    >>> from pyadlml.stats import contingency_duration
    >>> contingency_duration(data.df_devices, data.df_activities)
    activity                      get drink  ...             use toilet
    Hall-Bedroom door Off  0 days 00:01:54  ... 0 days 00:12:24.990000
    Hall-Bedroom door On   0 days 00:14:48  ... 0 days 03:02:49.984000
    ...                                 ...
    Washingmachine On      0 days 00:00:00  ... 0 days 00:00:00
    [14 rows x 7 columns]

    Returns
    -------
    df : pd.DataFrame
    """
    # NOTE(review): ``idle`` is accepted but never read anywhere in this
    # function body — confirm whether idle-gap handling was meant to be
    # implemented here or happens upstream.
    # Column holding the time span until the next device state vector.
    TD = 'time_difference_to_succ'

    def func(row, raw, dev_lst):
        """ determines for each activity row the total time that was spent
        in either on or off state for each device

        Parameters
        ----------
        row : pd.Series
            a row of the activity dataframe containing the start and end
            time for one activity
        """
        # get selection of relevant devices: state vectors falling inside
        # the activity's [start_time, end_time] window
        act_start_time = row.start_time
        act_end_time = row.end_time
        raw_sel = raw[(act_start_time <= raw[TIME])
                      & (raw[TIME] <= act_end_time)].copy()

        if raw_sel.empty:
            # the case when no device activation fell into the recorded
            # activity timeframe
            return pd.Series(index=row.index, name=row.name, dtype=row.dtype)

        # determine end and start time and correct for the intervals
        # before/after the first/last state vector s0,sn
        #    s0 ---------I --activity --sn--------I
        #    | ~~~tds~~~ |            | ~~tde~~   |
        #    rs          as           re          ae
        # try to get the preceding state vector of devices before the
        # activity starts
        idx_first = raw_sel.index[0] - 1
        if idx_first == -1:
            # edge case when the first activity starts before the first
            # recording; this case isn't solvable. So a heuristic that
            # doesn't skew the statistic too much is to assume the same
            # state at the start of the activity.
            # NOTE(review): DataFrame.append was removed in pandas 2.0 —
            # these calls need pd.concat on newer pandas; confirm the
            # pinned pandas version.
            raw_sel = raw_sel.append(
                raw_sel.iloc[0].copy()).sort_values(by=[TIME])
            raw_sel.iat[0, raw_sel.columns.
                        get_loc(TD)] = raw_sel.iloc[0].time - act_start_time
        else:
            # prepend the preceding state vector and clip its duration to
            # the part that overlaps the activity
            raw_sel = raw_sel.append(
                raw.iloc[idx_first]).sort_values(by=[TIME])
            raw_start = raw_sel.iloc[0]
            t_diff_start = act_start_time - raw_start.time
            raw_sel.at[raw_sel.iloc[0].name, TD] -= t_diff_start

        # set time difference for last state vector until activity ends
        raw_sel.at[raw_sel.iloc[-1].name,
                   TD] = act_end_time - raw_sel.iloc[-1].time

        for dev in dev_lst:
            # total duration per state ('ON'/'OFF') for this device
            ser = raw_sel.groupby(by=[dev])[TD].sum()
            # the tries are for the cases when a device is on/off the
            # whole time (then only one group exists)
            try:
                dev_on_time = ser.ON
            except AttributeError:
                dev_on_time = pd.Timedelta('0ns')
            try:
                dev_off_time = ser.OFF
            except AttributeError:
                dev_off_time = pd.Timedelta('0ns')

            row.at[ser.index.name + " On"] = dev_on_time
            row.at[ser.index.name + " Off"] = dev_off_time

        return row

    def create_meta(raw):
        # dask meta describing the dtypes of the raw dataframe
        devices = {name: 'object' for name in raw.columns[1:-1]}
        return {**{TIME: 'datetime64[ns]', 'td': 'timedelta64[ns]'}, **devices}

    dev_lst = df_devs[DEVICE].unique()
    df_devs = df_devs.sort_values(by=TIME)

    # Build the raw state-vector table with human-readable 'ON'/'OFF'
    # states and the duration of each state vector until its successor.
    raw = create_raw(df_devs).applymap(
        lambda x: 'ON' if x else 'OFF').reset_index(drop=False)
    raw[TD] = raw[TIME].shift(-1) - raw[TIME]

    # Pre-create one " Off"/" On" result column per device on the
    # activity dataframe.
    y = [(d1 + ' Off', d2 + ' On') for d1, d2 in zip(dev_lst, dev_lst)]
    new_cols = [d for tup in y for d in tup]
    df_acts = df_acts.copy().join(
        pd.DataFrame(index=df_acts.index, columns=new_cols))

    if True:  # TODO parallel is not working
    #if not get_parallel():
        df = df_acts.apply(func, args=[raw, dev_lst], axis=1)
        df = df.drop(columns=[START_TIME, END_TIME])
        df = df.groupby(ACTIVITY).sum()
        return df.T
    else:
        # parallel path via dask (currently unreachable, see TODO above)
        df = dd.from_pandas(df_acts.copy(), npartitions=get_npartitions())\
            .apply(func, args=[raw, dev_lst], axis=1)\
            .drop(columns=[START_TIME, END_TIME])\
            .groupby(ACTIVITY).sum()\
            .compute(scheduler='processes')
        return df.T
def create_raw(self, t_res=None, idle=False):
    # Build and cache the raw device representation for this dataset.
    # NOTE(review): the module-level ``create_raw`` used elsewhere in this
    # file is called as ``create_raw(df_dev, t_res=..., sample_strat=...)``;
    # here ``self.df_activities`` is passed as the second positional
    # argument (i.e. where ``t_res`` would go) and ``t_res`` as the third —
    # confirm this resolves to a different ``create_raw`` overload or fix
    # the call.
    # NOTE(review): ``idle`` is accepted but never used — confirm intent.
    self.df_raw = create_raw(self.df_devices, self.df_activities, t_res)
def duration_correlation(df_devs, lst_devs=None):
    """
    Compute the similarity between devices by comparing the binary values
    for every interval.

    Parameters
    ----------
    df_devs : pd.DataFrame
        All recorded devices from a dataset. For more information refer to
        :ref:`user guide<device_dataframe>`.
    lst_devs : list of str, optional
        A list of devices that are included in the statistic. The list can
        be a subset of the recorded devices or contain devices that are not
        recorded.

    Examples
    --------
    >>> from pyadlml.stats import device_duration_corr
    >>> device_duration_corr(data.df_devs)
    device               Cups cupboard  Dishwasher ...  Washingmachine
    device                                         ...
    Cups cupboard             1.000000    0.997571 ...        0.999083
    Dishwasher                0.997571    1.000000 ...        0.996842
    ...
    Washingmachine            0.999083    0.996842 ...        1.000000
    [14 rows x 14 columns]

    Returns
    -------
    df : pd.DataFrame
        A dataframe of every device against another device. The values
        range from -1 to 1 where higher values represent more similarity.
    """
    TD = 'td'

    # Non-binary devices cannot be correlated this way; keep only binary ones.
    if contains_non_binary(df_devs):
        df_devs, _ = split_devices_binary(df_devs)

    def func(row):
        """Replace each device cell of `row` with the vector of signed
        durations obtained by weighting the device's state (+1/-1) against
        every device's state and the interval length."""
        try:
            td = row.td.to_timedelta64()
        except Exception:
            # The last state vector has no successor (td is NaT) — skip it.
            return None

        states = row.iloc[1:len(row) - 1].values.astype(int)
        K = len(states)
        # Hoist the loop-invariant product: matching signs (+1*+1, -1*-1)
        # add duration, opposite signs subtract it.
        weighted = states * td
        for j in range(K):
            row.iloc[1 + j] = states[j] * weighted
        return row

    def create_meta(raw):
        # dask meta for the commented-out meta-aware apply below
        devices = {name: 'object' for name in raw.columns[1:-1]}
        return {**{TIME: 'datetime64[ns]', TD: 'timedelta64[ns]'}, **devices}

    dev_lst = df_devs[DEVICE].unique()
    df_devs = df_devs.sort_values(by=TIME)

    # make off to -1 and on to 1 and then calculate cross correlation
    # between signals
    raw = create_raw(df_devs).applymap(lambda x: 1 if x else -1).reset_index()
    raw[TD] = raw[TIME].shift(-1) - raw[TIME]

    df = dd.from_pandas(raw.copy(), npartitions=get_npartitions())\
        .apply(func, axis=1).drop(columns=[TIME, TD]).sum(axis=0)\
        .compute(scheduler='processes')
    #.apply(func, axis=1, meta=create_meta(raw)).drop(columns=['time', 'td']).sum(axis=0)\

    res = pd.DataFrame(data=np.vstack(df.values),
                       columns=df.index, index=df.index)

    # normalize by the total observed duration (a device against itself)
    res = res / res.iloc[0, 0]

    if lst_devs is not None:
        # Requested-but-unrecorded devices get an all-NA column and row.
        for dev in set(lst_devs).difference(set(list(res.index))):
            res[dev] = pd.NA
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # portable equivalent.
            res = pd.concat(
                [res, pd.DataFrame(data=pd.NA, columns=res.columns,
                                   index=[dev])])
    return res