def _find_sections_with_no_target(self):
    """Finds the intersections of the mains good sections with the gaps
    between target appliance activations.
    """
    self.sections_with_no_target = {}
    seq_length_secs = self.seq_length * self.sample_period
    for fold, sects_per_building in self.mains_good_sections.items():
        for building, good_sections in sects_per_building.items():
            activations = (
                self.activations[fold][self.target_appliance][building])
            mains = self.mains[fold][building]
            mains_good_sections = self.mains_good_sections[fold][building]
            gaps_between_activations = TimeFrameGroup()
            prev_end = mains.index[0]
            for activation in activations:
                gap = TimeFrame(prev_end, activation.index[0])
                gaps_between_activations.append(gap)
                prev_end = activation.index[-1]
            gap = TimeFrame(prev_end, mains.index[-1])
            gaps_between_activations.append(gap)
            intersection = (
                gaps_between_activations.intersection(mains_good_sections))
            intersection = intersection.remove_shorter_than(seq_length_secs)
            self.sections_with_no_target.setdefault(fold, {})[building] = (
                intersection)
            logger.info(
                "Found {} sections without target for {} {}.".format(
                    len(intersection), fold, building))
def _delete_phony_sections(self):
    filtered_data = {}
    for fold, data_per_building in self.data.items():
        for building, data in data_per_building.items():
            if building not in self.phony_active_timeframes[fold][
                    self.target_appliance]:
                # No phony activations for this building, so there is
                # nothing to delete: keep its data unfiltered instead of
                # silently dropping the building.
                filtered_data.setdefault(fold, {})[building] = data
                continue
            activations = (
                self.phony_active_timeframes[fold][
                    self.target_appliance][building])
            data_between_phony_activations = TimeFrameGroup()
            prev_end = data.index[0]
            for activation in activations:
                activation_start = activation.start
                if prev_end < activation_start:
                    gap = TimeFrame(prev_end, activation_start)
                    data_between_phony_activations.append(gap)
                prev_end = activation.end
            data_end = data.index[-1] + pd.Timedelta(seconds=self.sample_period)
            if prev_end < data_end:
                gap = TimeFrame(prev_end, data_end)
                data_between_phony_activations.append(gap)
            dfs = []
            for section in data_between_phony_activations:
                dfs.append(section.slice(data))
            # Guard against an empty list: pd.concat([]) raises ValueError.
            if dfs:
                data = pd.concat(dfs)
                filtered_data.setdefault(fold, {})[building] = data
            logger.info("Found {} good sections for {} {}."
                        .format(len(data_between_phony_activations),
                                fold, building))
    self.data = filtered_data
def _find_sections_with_no_target(self):
    """Finds the intersections of the mains good sections with the gaps
    between target appliance activations.
    """
    self.sections_with_no_target = {}
    seq_length_secs = self.seq_length * self.sample_period
    for fold, sects_per_building in self.data_good_sections.items():
        for building, good_sections in sects_per_building.items():
            if building not in self.all_activations[fold][
                    self.target_appliance]:
                continue
            activations = (
                self.all_activations[fold][self.target_appliance][building])
            data = self.data[fold][building]
            data_good_sections = good_sections
            gaps_between_activations = TimeFrameGroup()
            prev_end = data.index[0]
            for activation in activations:
                activation_start = activation.start
                if prev_end < activation_start:
                    gap = TimeFrame(prev_end, activation_start)
                    gaps_between_activations.append(gap)
                prev_end = activation.end
            data_end = data.index[-1]
            if prev_end < data_end:
                gap = TimeFrame(prev_end, data_end)
                gaps_between_activations.append(gap)
            intersection = (
                gaps_between_activations.intersection(data_good_sections))
            intersection = intersection.remove_shorter_than(seq_length_secs)
            self.sections_with_no_target.setdefault(fold, {})[building] = (
                intersection)
            logger.info("Found {} sections without target for {} {}."
                        .format(len(intersection), fold, building))
def import_from_cache(self, cached_stat, sections):
    # we (deliberately) use duplicate indices to cache GoodSectionResults
    grouped_by_index = cached_stat.groupby(level=0)
    tz = get_tz(cached_stat)
    for tf_start, df_grouped_by_index in grouped_by_index:
        grouped_by_end = df_grouped_by_index.groupby('end')
        for tf_end, sections_df in grouped_by_end:
            end = tz_localize_naive(tf_end, tz)
            timeframe = TimeFrame(tf_start, end)
            if timeframe in sections:
                timeframes = []
                for _, row in sections_df.iterrows():
                    section_start = tz_localize_naive(
                        row.iloc[2], tz)  # row['section_start']
                    section_end = tz_localize_naive(
                        row.iloc[1], tz)  # row['section_end']
                    timeframes.append(TimeFrame(section_start, section_end))
                self.append(timeframe, {'sections': [timeframes]})
def load(self, key, columns=None, sections=None, n_look_ahead_rows=0,
         chunksize=MAX_MEM_ALLOWANCE_IN_BYTES):
    file_path = self._key_to_abs_path(key)

    # Set `sections` variable
    sections = [TimeFrame()] if sections is None else sections
    sections = TimeFrameGroup(sections)

    self.all_sections_smaller_than_chunksize = True

    # Iterate through parameter sections; this requires one pass through
    # the file for each section.
    for section in sections:
        window_intersect = self.window.intersection(section)
        header_rows = [0, 1]
        text_file_reader = pd.read_csv(file_path,
                                       index_col=0,
                                       header=header_rows,
                                       parse_dates=True,
                                       chunksize=chunksize)

        # Iterate through all chunks in the file.
        for chunk_idx, chunk in enumerate(text_file_reader):

            # Filter dataframe by specified columns.
            if columns:
                chunk = chunk[columns]

            # Mask chunk by the intersection of window and section.
            subchunk_idx = [True] * len(chunk)
            if window_intersect.start:
                subchunk_idx = np.logical_and(
                    subchunk_idx, (chunk.index >= window_intersect.start))
            if window_intersect.end:
                subchunk_idx = np.logical_and(
                    subchunk_idx, (chunk.index < window_intersect.end))
            if window_intersect.empty:
                subchunk_idx = [False] * len(chunk)
            subchunk = chunk[subchunk_idx]

            if len(subchunk) > 0:
                subchunk_end = np.max(np.nonzero(subchunk_idx))
                subchunk.timeframe = TimeFrame(subchunk.index[0],
                                               subchunk.index[-1])
                # Load look ahead if necessary
                if n_look_ahead_rows > 0:
                    if len(subchunk.index) > 0:
                        rows_to_skip = ((len(header_rows) + 1) +
                                        (chunk_idx * chunksize) +
                                        subchunk_end + 1)
                        try:
                            subchunk.look_ahead = pd.read_csv(
                                file_path,
                                index_col=0,
                                header=None,
                                parse_dates=True,
                                skiprows=rows_to_skip,
                                nrows=n_look_ahead_rows)
                        except ValueError:
                            subchunk.look_ahead = pd.DataFrame()
                    else:
                        subchunk.look_ahead = pd.DataFrame()

                yield subchunk
def test_u(self, ts1, ts2, ts3, ts4):
    ts1 = pd.Timestamp(ts1)
    ts2 = pd.Timestamp(ts2)
    ts3 = pd.Timestamp(ts3)
    ts4 = pd.Timestamp(ts4)
    tf1 = TimeFrame(ts1, ts2)
    tf2 = TimeFrame(ts3, ts4)
    union = tf1.union(tf2)
    self.assertEqual(union.start, ts1)
    self.assertEqual(union.end, ts4)
def get_timeframe(self):
    '''Returns the timeframe from the start of the first section to the
    end of the last section.

    Returns
    -------
    timeframe: outer timeframe of this TimeFrameGroup
    '''
    if self._df.empty:
        return TimeFrame(start=None, end=None)
    idx = self._df.index
    return TimeFrame(start=self._df.loc[idx[0], 'section_start'],
                     end=self._df.loc[idx[-1], 'section_end'])
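# --- Illustrative usage sketch (not part of the original source) ---------
# A minimal example of get_timeframe() on the DataFrame-backed
# TimeFrameGroup fork above; it assumes TimeFrame and TimeFrameGroup are
# the classes defined in this module.
import pandas as pd

def _demo_get_timeframe():
    group = TimeFrameGroup([
        TimeFrame(pd.Timestamp("2014-01-01"), pd.Timestamp("2014-01-02")),
        TimeFrame(pd.Timestamp("2014-01-05"), pd.Timestamp("2014-01-06")),
    ])
    outer = group.get_timeframe()
    # The outer timeframe spans from the first section's start to the
    # last section's end.
    assert outer.start == pd.Timestamp("2014-01-01")
    assert outer.end == pd.Timestamp("2014-01-06")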
def test_merge_timeframes(self):
    tfs = [TimeFrame("2010-01-01", "2011-01-01"),
           TimeFrame("2011-01-01", "2011-06-01"),
           TimeFrame("2012-01-01", "2013-01-01")]
    merged = merge_timeframes(tfs)
    correct_answer = [TimeFrame("2010-01-01", "2011-06-01"),
                      TimeFrame("2012-01-01", "2013-01-01")]
    self.assertEqual(merged, correct_answer)
def __init__(self, timeframes=None):
    if isinstance(timeframes, pd.PeriodIndex):
        periods = timeframes
        timeframes = [TimeFrame(period.start_time, period.end_time)
                      for period in periods]
    args = [timeframes] if timeframes else []
    super(TimeFrameGroup, self).__init__(*args)
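# --- Illustrative sketch (not part of the original source) ---------------
# The constructor above also accepts a pandas PeriodIndex, converting each
# period into a TimeFrame.
import pandas as pd

def _demo_from_periodindex():
    periods = pd.period_range("2014-01-01", periods=3, freq="D")
    group = TimeFrameGroup(periods)  # one TimeFrame per day
    return group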
def import_from_cache(self, cached_stat, sections):
    # we (deliberately) use duplicate indices to cache GoodSectionResults
    grouped_by_index = cached_stat.groupby(level=0)
    tz = get_tz(cached_stat)
    for name, group in grouped_by_index:
        assert group['end'].unique().size == 1
        end = tz_localize_naive(group['end'].iloc[0], tz)
        timeframe = TimeFrame(name, end)
        if timeframe in sections:
            timeframes = []
            for _, row in group.iterrows():
                section_start = tz_localize_naive(row['section_start'], tz)
                section_end = tz_localize_naive(row['section_end'], tz)
                timeframes.append(TimeFrame(section_start, section_end))
            self.append(timeframe, {'sections': [timeframes]})
def __init__(self):
    """Initialises the datastore with an open-ended window, so no
    time-range restriction is applied until `self.window` is set.
    """
    self.window = TimeFrame()
def _add_external_data(self, chunk, ext_dataset, external_features,
                       horizon=None):
    '''Currently coming from 820 (for all the meters I consider).

    Parameters
    ----------
    chunk: pd.DataFrame or pd.DatetimeIndex
        The input which has to be augmented.
    ext_dataset: nilmtk.DataSet
        The dataset where the external data can be found.
    horizon: pd.Timedelta
        The timeframe in the future for which external data shall be
        retrieved. This lies outside the chunk area, so the chunk is
        extended. Necessary for forecasts with external data included.
    external_features: [indexes, ...]
        The indexes which shall be retrieved.

    Returns
    -------
    chunk: pd.DataFrame
        The input chunk extended by the features given in
        `external_features`.
    '''
    if horizon is not None and not isinstance(horizon, pd.Timedelta):
        raise Exception("horizon has to be a pd.Timedelta")
    if external_features is not None and not isinstance(external_features,
                                                        list):
        external_features = [external_features]

    # So that a bare index is also supported.
    if isinstance(chunk, pd.DatetimeIndex):
        chunk = pd.DataFrame(index=chunk)

    extData = None
    if external_features:
        section = TimeFrame(start=chunk.index[0], end=chunk.index[-1])
        if horizon is not None:
            section.end = section.end + horizon
        extData = ext_dataset.get_data_for_group(
            '820', section, 60 * 15, external_features)[1:]
    return pd.concat([chunk, extData], axis=1)
def __iter__(self):
    '''Enables iteration over the TimeFrameGroup.'''
    if len(self._df) == 0:
        return iter([])
    else:
        for _, row in self._df.iterrows():
            yield TimeFrame(start=row['section_start'],
                            end=row['section_end'])
def import_from_cache(self, cached_stat, sections):
    # we (deliberately) use duplicate indices to cache GoodSectionResults
    grouped_by_index = cached_stat.groupby(level=0)
    tz = get_tz(cached_stat)
    for tf_start, df_grouped_by_index in grouped_by_index:
        grouped_by_end = df_grouped_by_index.groupby('end')
        for tf_end, sections_df in grouped_by_end:
            end = tz_localize_naive(tf_end, tz)
            timeframe = TimeFrame(tf_start, end)
            if timeframe in sections:
                timeframes = []
                for _, row in sections_df.iterrows():
                    section_start = tz_localize_naive(
                        row['section_start'], tz)
                    section_end = tz_localize_naive(row['section_end'], tz)
                    timeframes.append(TimeFrame(section_start, section_end))
                self.append(timeframe, {'sections': [timeframes]})
def get_timeframe(self, key):
    """
    Returns
    -------
    nilmtk.TimeFrame of entire table after intersecting with self.window.
    """
    data_start_date = self.store.select(key, [0]).index[0]
    data_end_date = self.store.select(key, start=-1).index[0]
    timeframe = TimeFrame(data_start_date, data_end_date)
    return self.window.intersection(timeframe)
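# --- Illustrative usage sketch (not part of the original source) ---------
# 'ukdale.h5' and the key are placeholders for whatever converted dataset
# is at hand.
from nilmtk.datastore import HDFDataStore

def _demo_store_timeframe():
    store = HDFDataStore('ukdale.h5')
    tf = store.get_timeframe('/building1/elec/meter1')
    print(tf.start, tf.end)  # bounds after clipping to store.window
    store.close()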
def test_date_setting(self):
    TimeFrame()
    TimeFrame("2012-01-01", "2013-01-01")

    # test identical start and end dates
    with self.assertRaises(ValueError):
        TimeFrame("2012-01-01", "2012-01-01")

    TimeFrame(start="2011-01-01")
    TimeFrame(end="2011-01-01")

    # test end date before start date
    with self.assertRaises(ValueError):
        TimeFrame("2012-01-01", "2011-01-01")

    tf = TimeFrame()
    tf.end = "2011-01-01"
    tf.start = "2010-01-01"
    with self.assertRaises(ValueError):
        tf.start = "2012-01-01"
def _classify_activation_quality(self, nilmtk_activations):

    def get_stale_seconds(act):
        actdiff = act.resample(
            "{:d}S".format(self.sample_period)).mean().ffill().diff()
        return (actdiff == 0.0).sum() * self.sample_period

    def activation_filter(tf, building_data):
        start_time = tf.start
        end_time = tf.end
        df = building_data[start_time:end_time]
        if df.empty:
            return False
        act_duration = (end_time - start_time).total_seconds()
        act_stale_pct = get_stale_seconds(df['target']) / act_duration
        mains_stale_pct = get_stale_seconds(df['mains']) / act_duration
        return ((act_stale_pct < self.activation_max_stale_pct) and
                (mains_stale_pct < self.mains_max_stale_pct))

    good_timeframes = {}
    bad_timeframes = {}
    all_timeframes = {}
    for fold, buildings_per_appliances in nilmtk_activations.items():
        good_timeframes[fold] = {}
        bad_timeframes[fold] = {}
        all_timeframes[fold] = {}
        for appliance, activations_per_building in \
                buildings_per_appliances.items():
            good_timeframes[fold][appliance] = {}
            bad_timeframes[fold][appliance] = {}
            all_timeframes[fold][appliance] = {}
            for building, activations in activations_per_building.items():
                building_data = self.data[fold][building]
                good_timeframes_per_building = TimeFrameGroup()
                bad_timeframes_per_building = TimeFrameGroup()
                all_timeframes_per_building = TimeFrameGroup()
                for i, activation in enumerate(activations):
                    tf = TimeFrame(
                        start=activation.index[0],
                        end=activation.index[-1] + pd.Timedelta(
                            seconds=self.sample_period))
                    all_timeframes_per_building.append(tf)
                    if activation_filter(tf, building_data):
                        good_timeframes_per_building.append(tf)
                    else:
                        bad_timeframes_per_building.append(tf)
                good_timeframes[fold][appliance][building] = \
                    good_timeframes_per_building
                bad_timeframes[fold][appliance][building] = \
                    bad_timeframes_per_building
                all_timeframes[fold][appliance][building] = \
                    all_timeframes_per_building

    # self.clean_active_timeframes = good_timeframes
    self.all_active_timeframes = all_timeframes
    self.phony_active_timeframes = bad_timeframes
def pop(self, i):
    '''Pops a certain TimeFrame from the TimeFrameGroup.

    The TimeFrame at position i is removed from the group and returned.

    Parameters
    ----------
    i: int
        The location of the event to remove. Defaults to the last
        element when None is passed.
    '''
    if i is None:
        i = -1
    last = self._df.iloc[i, :]
    self._df.drop(self._df.index[i], inplace=True)
    return TimeFrame(last['section_start'], last['section_end'])
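# --- Illustrative sketch (not part of the original source) ---------------
# pop() behaves like list.pop(); passing None removes and returns the last
# section.
import pandas as pd

def _demo_pop():
    group = TimeFrameGroup([
        TimeFrame(pd.Timestamp("2014-01-01"), pd.Timestamp("2014-01-02")),
        TimeFrame(pd.Timestamp("2014-01-05"), pd.Timestamp("2014-01-06")),
    ])
    last = group.pop(None)  # returns TimeFrame(2014-01-05, 2014-01-06)
    print(last)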
def import_from_cache(self, cached_stat, sections):
    '''As explained in `export_to_cache`, the sections have to be stored
    row-wise. This function parses the lines and rearranges them into a
    proper AboveFreqSectionsResult again.
    '''
    # we (deliberately) use duplicate indices to cache AboveFreqSectionResults
    grouped_by_index = cached_stat.groupby(level=0)
    tz = get_tz(cached_stat)
    for tf_start, df_grouped_by_index in grouped_by_index:
        grouped_by_end = df_grouped_by_index.groupby('end')
        for tf_end, sections_df in grouped_by_end:
            end = tz_localize_naive(tf_end, tz)
            timeframe = TimeFrame(tf_start, end)
            if timeframe in sections:
                timeframes = []
                for _, row in sections_df.iterrows():
                    section_start = tz_localize_naive(
                        row['section_start'], tz)
                    section_end = tz_localize_naive(row['section_end'], tz)
                    timeframes.append(TimeFrame(section_start, section_end))
                self.append(timeframe, {'sections': [timeframes]})
def __getitem__(self, i):
    '''Enables access to the TimeFrameGroup as a list.

    Parameters
    ----------
    i: int
        Position to return.

    Returns
    -------
    element: nilmtk.TimeFrame
        The element at position i.
    '''
    elements = self._df.iloc[i, :]
    return TimeFrame(elements['section_start'], elements['section_end'])
def get_timeframe(self, key):
    file_path = self._key_to_abs_path(key)
    text_file_reader = pd.read_csv(file_path,
                                   index_col=0,
                                   header=[0, 1],
                                   parse_dates=True,
                                   chunksize=MAX_MEM_ALLOWANCE_IN_BYTES)
    start = None
    end = None
    for df in text_file_reader:
        if start is None:
            start = df.index[0]
        end = df.index[-1]
    timeframe = TimeFrame(start, end)
    return self.window.intersection(timeframe)
def get_timeframe(self, key):
    """The key is immediately in the correct form required to perform the
    request. It has the form: {lat}/{lng}/{deviceKey}/{deviceType}

    Returns
    -------
    nilmtk.TimeFrame covering the entire table.
    """
    timeframe = self._execute_request("get_timeframe", type="GET",
                                      parameters={"url": key})
    start = pd.Timestamp(timeframe[0])
    end = pd.Timestamp(timeframe[1])
    timeframe = TimeFrame(start, end)
    return timeframe
def invert(self, start=None, end=None):
    '''Returns a TimeFrameGroup with inverted rectangles.
    That means that where there was a gap before, there is now a
    TimeFrame, and vice versa.

    Parameters
    ----------
    start, end: pd.Timestamp
        Define the start and end of the region to invert.

    Returns
    -------
    inversion: TimeFrameGroup
        The inverted TimeFrameGroup, whose sections are the former gaps
        and vice versa.
    '''
    if self._df.empty:
        if start is not None and end is not None:
            return TimeFrameGroup([TimeFrame(start=start, end=end)])
        return TimeFrameGroup()

    inversion = self._df.copy()
    if end is not None and self._df.iloc[-1, :]["section_end"] < end:
        # The trailing gap runs from the last section's end up to `end`,
        # so remember the last section_end before it is shifted away.
        val_to_append = self._df.iloc[-1, :]["section_end"]
        inversion['section_end'] = inversion['section_end'].shift(1)
        row = len(inversion)
        inversion.loc[row, :] = [start, start]
        inversion.loc[row, 'section_start'] = end
        inversion.loc[row, 'section_end'] = val_to_append
    else:
        inversion['section_end'] = inversion['section_end'].shift(1)
        if start is not None and \
                start < self._df.iloc[-1, :]['section_start']:
            inversion.loc[0, 'section_end'] = start

    inversion = inversion.dropna().rename(
        columns={"section_end": "section_start",
                 "section_start": "section_end"})

    # Use positional index labels: row 0 may have been dropped above.
    if not inversion.empty:
        if start is not None and \
                inversion.loc[inversion.index[0], 'section_start'] < start:
            inversion.loc[inversion.index[0], 'section_start'] = start
        if end is not None and \
                inversion.loc[inversion.index[-1], 'section_end'] > end:
            inversion.loc[inversion.index[-1], 'section_end'] = end
    return TimeFrameGroup(inversion)
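# --- Illustrative sketch (not part of the original source) ---------------
# With the fork above, inverting two sections inside a bounding region
# should yield the gaps between them.
import pandas as pd

def _demo_invert():
    group = TimeFrameGroup([
        TimeFrame(pd.Timestamp("2014-01-02"), pd.Timestamp("2014-01-03")),
        TimeFrame(pd.Timestamp("2014-01-05"), pd.Timestamp("2014-01-06")),
    ])
    gaps = group.invert(start=pd.Timestamp("2014-01-01"),
                        end=pd.Timestamp("2014-01-06"))
    # Expected gaps: 01-01 -> 01-02 and 01-03 -> 01-05.
    for tf in gaps:
        print(tf)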
def _timeframe_for_chunk(there_are_more_subchunks, chunk_i,
                         window_intersect, index):
    start = None
    end = None

    # Test if there are any more subchunks
    if there_are_more_subchunks:
        if chunk_i == 0:
            start = window_intersect.start
    elif chunk_i > 0:
        # This is the last subchunk
        end = window_intersect.end
    else:
        # Just a single 'subchunk'
        start = window_intersect.start
        end = window_intersect.end

    if start is None:
        start = index[0]
    if end is None:
        end = index[-1]

    return TimeFrame(start, end)
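# --- Illustrative sketch (not part of the original source) ---------------
# For the first of several chunks, the timeframe inherits the window's
# start, while its end falls back to the last index entry.
import pandas as pd

def _demo_timeframe_for_chunk():
    idx = pd.date_range("2014-01-01", periods=10, freq="6S")
    window = TimeFrame(pd.Timestamp("2013-12-31"),
                       pd.Timestamp("2014-02-01"))
    tf = _timeframe_for_chunk(there_are_more_subchunks=True, chunk_i=0,
                              window_intersect=window, index=idx)
    assert tf.start == window.start and tf.end == idx[-1]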
def load(self, key, cols=None, sections=None, n_look_ahead_rows=0,
         chunksize=MAX_MEM_ALLOWANCE_IN_BYTES, verbose=False):
    # TODO: calculate chunksize default based on physical
    # memory installed and number of columns

    # Make sure key has a slash at the front but not at the end.
    if key[0] != '/':
        key = '/' + key
    if len(key) > 1 and key[-1] == '/':
        key = key[:-1]

    # Make sure chunksize is an int otherwise `range` complains later.
    chunksize = np.int64(chunksize)

    # Set `sections` variable
    sections = [TimeFrame()] if sections is None else sections
    if isinstance(sections, pd.PeriodIndex):
        sections = timeframes_from_periodindex(sections)

    if verbose:
        print("HDFDataStore.load. key='{}'".format(key))

    self.all_sections_smaller_than_chunksize = True

    for section in sections:
        if verbose:
            print("   ", section)
        window_intersect = self.window.intersection(section)

        if window_intersect.empty:
            data = pd.DataFrame()
            data.timeframe = section
            yield data
            continue

        terms = window_intersect.query_terms('window_intersect')
        if terms is None:
            section_start_i = 0
            section_end_i = self.store.get_storer(key).nrows
            if section_end_i <= 1:
                data = pd.DataFrame()
                data.timeframe = section
                yield data
                continue
        else:
            try:
                coords = self.store.select_as_coordinates(key=key,
                                                          where=terms)
            except AttributeError as e:
                if str(e) == ("'NoneType' object has no attribute "
                              "'read_coordinates'"):
                    raise KeyError("key '{}' not found".format(key))
                else:
                    raise
            n_coords = len(coords)
            if n_coords == 0:
                data = pd.DataFrame()
                data.timeframe = window_intersect
                yield data
                continue
            section_start_i = coords[0]
            section_end_i = coords[-1]
            del coords

        slice_starts = range(section_start_i, section_end_i, chunksize)
        n_chunks = int(np.ceil((section_end_i - section_start_i) /
                               chunksize))
        if n_chunks > 1:
            self.all_sections_smaller_than_chunksize = False
            if verbose:
                print("n_chunks", n_chunks)

        for chunk_i, chunk_start_i in enumerate(slice_starts):
            chunk_end_i = chunk_start_i + chunksize
            there_are_more_subchunks = (chunk_i < n_chunks - 1)

            if chunk_end_i > section_end_i:
                chunk_end_i = section_end_i
            chunk_end_i += 1

            data = self.store.select(key=key, columns=cols,
                                     start=chunk_start_i, stop=chunk_end_i)

            # if len(data) <= 2:
            #     yield pd.DataFrame()

            # Load look ahead if necessary
            if n_look_ahead_rows > 0:
                if len(data.index) > 0:
                    look_ahead_start_i = chunk_end_i
                    look_ahead_end_i = (look_ahead_start_i +
                                        n_look_ahead_rows)
                    try:
                        data.look_ahead = self.store.select(
                            key=key, columns=cols,
                            start=look_ahead_start_i,
                            stop=look_ahead_end_i)
                    except ValueError:
                        data.look_ahead = pd.DataFrame()
                else:
                    data.look_ahead = pd.DataFrame()

            data.timeframe = _timeframe_for_chunk(there_are_more_subchunks,
                                                  chunk_i, window_intersect,
                                                  data.index)
            yield data
            del data
def _save_metadata_for_disaggregation(self, output_datastore,
                                      sample_period, measurement,
                                      timeframes, building, meters=None,
                                      num_meters=None, supervised=True):
    """Add metadata for disaggregated appliance estimates to datastore.

    This method returns nothing. It sets the metadata in
    `output_datastore`.

    Note that `self.MODEL_NAME` needs to be set to a string before
    calling this method. For example, we use `self.MODEL_NAME = 'CO'`
    for Combinatorial Optimisation.

    Parameters
    ----------
    output_datastore : nilmtk.DataStore subclass object
        The datastore to write metadata into.
    sample_period : int
        The sample period, in seconds, used for both the mains and the
        disaggregated appliance estimates.
    measurement : 2-tuple of strings
        In the form (<physical_quantity>, <type>) e.g.
        ("power", "active")
    timeframes : list of nilmtk.TimeFrames or nilmtk.TimeFrameGroup
        The TimeFrames over which this data is valid for.
    building : int
        The building instance number (starting from 1)
    supervised : bool, defaults to True
        Is this a supervised NILM algorithm?
    meters : list of nilmtk.ElecMeters, optional
        Required if `supervised=True`
    num_meters : int
        Required if `supervised=False`
    """

    # TODO: `preprocessing_applied` for all meters
    # TODO: submeter measurement should probably be the mains
    #       measurement we used to train on, not the mains measurement.

    # DataSet and MeterDevice metadata:
    building_path = "/building{}".format(building)
    mains_data_location = building_path + "/elec/meter1"

    meter_devices = {
        self.MODEL_NAME: {
            "model": self.MODEL_NAME,
            "sample_period": sample_period,
            "max_sample_period": sample_period,
            "measurements": [{"physical_quantity": measurement[0],
                              "type": measurement[1]}],
        },
        "mains": {
            "model": "mains",
            "sample_period": sample_period,
            "max_sample_period": sample_period,
            "measurements": [{"physical_quantity": measurement[0],
                              "type": measurement[1]}],
        },
    }

    merged_timeframes = merge_timeframes(timeframes, gap=sample_period)
    total_timeframe = TimeFrame(merged_timeframes[0].start,
                                merged_timeframes[-1].end)

    date_now = datetime.now().isoformat().split(".")[0]
    dataset_metadata = {
        "name": self.MODEL_NAME,
        "date": date_now,
        "meter_devices": meter_devices,
        "timeframe": total_timeframe.to_dict(),
    }
    output_datastore.save_metadata("/", dataset_metadata)

    # Building metadata

    # Mains meter:
    elec_meters = {
        1: {
            "device_model": "mains",
            "site_meter": True,
            "data_location": mains_data_location,
            "preprocessing_applied": {},  # TODO
            "statistics": {"timeframe": total_timeframe.to_dict()},
        }
    }

    def update_elec_meters(meter_instance):
        elec_meters.update({
            meter_instance: {
                "device_model": self.MODEL_NAME,
                "submeter_of": 1,
                "data_location": ("{}/elec/meter{}".format(
                    building_path, meter_instance)),
                "preprocessing_applied": {},  # TODO
                "statistics": {"timeframe": total_timeframe.to_dict()},
            }
        })

    # Appliances and submeters:
    appliances = []
    if supervised:
        for meter in meters:
            meter_instance = meter.instance()
            update_elec_meters(meter_instance)

            for app in meter.appliances:
                appliance = {
                    "meters": [meter_instance],
                    "type": app.identifier.type,
                    "instance": app.identifier.instance
                    # TODO this `instance` will only be correct when the
                    # model is trained on the same house as it is tested on
                    # https://github.com/nilmtk/nilmtk/issues/194
                }
                appliances.append(appliance)

            # Setting the name if it exists
            if meter.name:
                if len(meter.name) > 0:
                    elec_meters[meter_instance]["name"] = meter.name
    else:  # Unsupervised
        # Submeters:
        # Starts at 2 because meter 1 is mains.
        for chan in range(2, num_meters + 2):
            update_elec_meters(meter_instance=chan)
            appliance = {
                "meters": [chan],
                "type": "unknown",
                "instance": chan - 1
                # TODO this `instance` will only be correct when the
                # model is trained on the same house as it is tested on
                # https://github.com/nilmtk/nilmtk/issues/194
            }
            appliances.append(appliance)

    building_metadata = {"instance": building,
                         "elec_meters": elec_meters,
                         "appliances": appliances}

    output_datastore.save_metadata(building_path, building_metadata)
def _get_good_sections(df, sample_period):
    """
    Code copied from nilmtk[1]/nilmtk/stats/goodsections.py

    [1] https://github.com/nilmtk/nilmtk/
    """
    index = df.dropna().sort_index().index
    df_time_end = df.index[-1] + pd.Timedelta(seconds=sample_period)
    del df
    if len(index) < 2:
        return []

    timedeltas_sec = timedelta64_to_secs(np.diff(index.values))
    timedeltas_check = timedeltas_sec <= sample_period

    # Memory management
    del timedeltas_sec
    gc.collect()

    timedeltas_check = np.concatenate([[False], timedeltas_check])
    transitions = np.diff(timedeltas_check.astype(int))

    # Memory management
    last_timedeltas_check = timedeltas_check[-1]
    del timedeltas_check
    gc.collect()

    good_sect_starts = list(index[:-1][transitions == 1])
    good_sect_ends = list(index[:-1][transitions == -1])

    # Memory management
    last_index = index[-1]
    del index
    gc.collect()

    # Work out if this chunk ends with an open ended good section
    if len(good_sect_ends) == 0:
        ends_with_open_ended_good_section = (len(good_sect_starts) > 0)
    elif len(good_sect_starts) > 0:
        # We have good_sect_ends and good_sect_starts
        ends_with_open_ended_good_section = (
            good_sect_ends[-1] < good_sect_starts[-1])
    else:
        # We have good_sect_ends but no good_sect_starts
        ends_with_open_ended_good_section = False

    if ends_with_open_ended_good_section:
        good_sect_ends += [df_time_end]

    assert len(good_sect_starts) == len(good_sect_ends)

    sections = [TimeFrame(start, end)
                for start, end in zip(good_sect_starts, good_sect_ends)
                if not (start == end and start is not None)]

    # Memory management
    del good_sect_starts
    del good_sect_ends
    gc.collect()

    return sections
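# --- Illustrative sketch (not part of the original source) ---------------
# A 6 s-sampled series with one dropped sample splits into two good
# sections.
import pandas as pd

def _demo_good_sections():
    idx = pd.date_range("2014-01-01", periods=10, freq="6S").delete(5)
    df = pd.DataFrame({'power': 1.0}, index=idx)
    for section in _get_good_sections(df, sample_period=6):
        print(section)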
def add_metadata(self, output_datastore, measurement, timeframes, mains,
                 timezone, load_kwargs):
    date_now = datetime.now().isoformat().split('.')[0]
    output_name = load_kwargs.pop('output_name', 'NILMTK_CO_' + date_now)
    resample_seconds = load_kwargs.pop('resample_seconds', 60)

    building_path = '/building{}'.format(mains.building())
    mains_data_location = '{}/elec/meter1'.format(building_path)

    # DataSet and MeterDevice metadata:
    meter_devices = {
        'CO': {
            'model': 'CO',
            'sample_period': resample_seconds,
            'max_sample_period': resample_seconds,
            'measurements': [{
                'physical_quantity': measurement[0],
                'type': measurement[1]
            }]
        },
        'mains': {
            'model': 'mains',
            'sample_period': resample_seconds,
            'max_sample_period': resample_seconds,
            'measurements': [{
                'physical_quantity': measurement[0],
                'type': measurement[1]
            }]
        }
    }

    merged_timeframes = merge_timeframes(timeframes, gap=resample_seconds)
    total_timeframe = TimeFrame(merged_timeframes[0].start,
                                merged_timeframes[-1].end)

    dataset_metadata = {'name': output_name,
                        'date': date_now,
                        'meter_devices': meter_devices,
                        'timeframe': total_timeframe.to_dict(),
                        'timezone': timezone}
    output_datastore.save_metadata('/', dataset_metadata)

    # Building metadata

    # Mains meter:
    elec_meters = {
        1: {
            'device_model': 'mains',
            'site_meter': True,
            'data_location': mains_data_location,
            'preprocessing_applied': {},  # TODO
            'statistics': {
                'timeframe': total_timeframe.to_dict(),
                'good_sections': list_of_timeframe_dicts(merged_timeframes)
            }
        }
    }

    # Appliances and submeters:
    appliances = []
    for model in self.model:
        meter = model['training_metadata']
        meter_instance = meter.instance()

        for app in meter.appliances:
            meters = app.metadata['meters']
            appliance = {
                'meters': [meter_instance],
                'type': app.identifier.type,
                'instance': app.identifier.instance
            }
            appliances.append(appliance)

        elec_meters.update({
            meter_instance: {
                'device_model': 'CO',
                'submeter_of': 1,
                'data_location': ('{}/elec/meter{}'
                                  .format(building_path, meter_instance)),
                'preprocessing_applied': {},  # TODO
                'statistics': {
                    'timeframe': total_timeframe.to_dict(),
                    'good_sections':
                        list_of_timeframe_dicts(merged_timeframes)
                }
            }
        })

        # Setting the name if it exists
        if meter.name:
            if len(meter.name) > 0:
                elec_meters[meter_instance]['name'] = meter.name

    building_metadata = {
        'instance': mains.building(),
        'elec_meters': elec_meters,
        'appliances': appliances
    }

    output_datastore.save_metadata(building_path, building_metadata)
def _positionActivation(self, activation, application, building, windowLen,
                        activationIndex, isReal=True):
    startTime = activation.index[0]
    endTime = activation.index[-1]
    if len(activation) < windowLen:
        addnum = windowLen - len(activation)
        an = self.rng.randint(0, addnum)
        bn = addnum - an
        positioned_activation = np.pad(activation.values,
                                       pad_width=(an, 0),
                                       mode='constant')
        positioned_activation = np.pad(positioned_activation,
                                       pad_width=(0, bn),
                                       mode='constant')
        seq_start_time = activation.index[0] - timedelta(seconds=an * 6)
        index = pd.date_range(seq_start_time,
                              periods=windowLen,
                              freq="{:d}S".format(6))
        if isReal:
            intersections = []
            activationsnum = len(self.activationsApp[application][building])
            if an > 0 and activationIndex >= 1:
                beforeStart = TimeFrame(
                    startTime - timedelta(seconds=an * 6), startTime)
                ai = activationIndex - 1
                beforeActivation = self.activationsApp[application][
                    building][ai]
                beforeSection = TimeFrame(beforeActivation.index[0],
                                          beforeActivation.index[-1])
                intersection = beforeSection.intersection(beforeStart)
                while intersection.start is not None and \
                        intersection.end is not None:
                    intersections.append(intersection)
                    ai = ai - 1
                    if ai < 0:
                        break
                    beforeActivation = self.activationsApp[application][
                        building][ai]
                    beforeSection = TimeFrame(beforeActivation.index[0],
                                              beforeActivation.index[-1])
                    intersection = beforeSection.intersection(beforeStart)
            if bn > 0 and activationIndex < activationsnum - 1:
                afterEnd = TimeFrame(endTime,
                                     endTime + timedelta(seconds=bn * 6))
                bi = activationIndex + 1
                afterActivation = self.activationsApp[application][
                    building][bi]
                afterSection = TimeFrame(afterActivation.index[0],
                                         afterActivation.index[-1])
                intersection = afterSection.intersection(afterEnd)
                while intersection.start is not None and \
                        intersection.end is not None:
                    intersections.append(intersection)
                    bi = bi + 1
                    if bi >= activationsnum:
                        break
                    afterActivation = self.activationsApp[application][
                        building][bi]
                    afterSection = TimeFrame(afterActivation.index[0],
                                             afterActivation.index[-1])
                    intersection = afterSection.intersection(afterEnd)
            for intersection in intersections:
                intersectionStart = intersection.start
                intersectionEnd = intersection.end
                length = int((intersectionEnd -
                              intersectionStart).total_seconds() / 6) + 1
                offset = int((intersectionStart -
                              seq_start_time).total_seconds() / 6)
                positioned_activation[offset:offset + length] = \
                    positioned_activation[offset:offset + length] + \
                    self.elecApp[application][building][
                        intersectionStart:intersectionEnd].values
        positioned_activation_series = pd.Series(positioned_activation,
                                                 index=index)
    else:
        positioned_activation_series = activation[:windowLen]
    if len(positioned_activation_series) != windowLen:
        logger.error("positioned activation has unexpected length")
    return positioned_activation_series
def __init__(self, **config):
    if 'filename' not in config.keys():
        self.dataSet = nilmtk.DataSet("ukdale.h5")
    else:
        self.dataSet = nilmtk.DataSet(config['filename'])
    if 'startTime' not in config.keys() or 'endTime' not in config.keys():
        self.dataSet.set_window("2012-11-01", "2015-01-31")
    else:
        self.dataSet.set_window(config['startTime'], config['endTime'])
    if 'trainBuildings' not in config.keys():
        self.trainBuildings = [1, 3, 4, 5]
    else:
        self.trainBuildings = config['trainBuildings']
    if 'testBuildings' not in config.keys():
        self.testBuildings = [2]
    else:
        self.testBuildings = config['testBuildings']
    if 'applications' not in config.keys():
        raise KeyError("please input applications")
    self.applications = config['applications']
    if 'targetapplication' not in config.keys():
        raise KeyError("please input targetapplication")
    self.targetApplication = config['targetapplication']
    if 'randSeed' not in config.keys():
        randSeed = 0
    else:
        randSeed = config['randSeed']
    self.otherApplications = [
        i for i in self.applications if i not in [self.targetApplication]
    ]
    self.allBuildings = set(self.trainBuildings + self.testBuildings)
    self.window = 599
    self.inputSeqs = []
    self.targetSeqs = []
    self.rng = np.random.RandomState(randSeed)

    activationConfig = {
        'fridge': {
            'min_off_duration': 18,  # 12 in paper here
            'min_on_duration': 60,
            'on_power_threshold': 50,
            'sample_period': 6,
        },
        'kettle': {
            'min_off_duration': 18,  # 0 in paper here
            'min_on_duration': 12,
            'on_power_threshold': 2000,
            'sample_period': 6,
        },
        'washing machine': {
            'min_off_duration': 160,
            'min_on_duration': 1800,
            'on_power_threshold': 20,
            'sample_period': 6,
        },
        'microwave': {
            'min_off_duration': 30,
            'min_on_duration': 12,
            'on_power_threshold': 200,
            'sample_period': 6,
        },
        'dish washer': {
            'min_off_duration': 1800,
            'min_on_duration': 1800,
            'on_power_threshold': 10,
            'sample_period': 6,
        }
    }

    self.elecMains = {}
    self.goodSections = {}
    for building in self.allBuildings:
        self.goodSections[building] = self.dataSet.buildings[
            building].elec.mains().good_sections()
        self.elecMains[building] = self.dataSet.buildings[
            building].elec.mains().power_series_all_data(
                sample_period=6,
                sections=self.goodSections[building]).dropna()

    self.numApp = {}
    self.elecApp = {}
    self.activationsApp = {}
    self.activationAppSections = {}
    for app in self.applications:
        self.elecApp[app] = {}
        self.activationsApp[app] = {}
        self.numApp[app] = 0
        self.activationAppSections[app] = {}
        for building in self.allBuildings:
            try:
                self.elecApp[app][building] = self.dataSet.buildings[
                    building].elec[app].power_series_all_data(
                        sample_period=6).dropna()
                self.activationsApp[app][building] = self.dataSet.buildings[
                    building].elec[app].get_activations(
                        **activationConfig[app])
                self.activationsApp[app][building] = [
                    activation.astype(np.float32)
                    for activation in self.activationsApp[app][building]
                ]
                self.numApp[app] += len(self.activationsApp[app][building])
                self.activationAppSections[app][building] = TimeFrameGroup()
                for activation in self.activationsApp[app][building]:
                    self.activationAppSections[app][building].append(
                        TimeFrame(activation.index[0],
                                  activation.index[-1]))
            except KeyError as exception:
                logger.info(
                    str(building) + " has no " + app +
                    ". Full exception: {}".format(exception))
                continue
    logger.info("Done loading NILMTK data.")

    for building in self.allBuildings:
        activationsToRemove = []
        try:
            activations = self.activationsApp[
                self.targetApplication][building]
            mains = self.elecMains[building]
            for i, activation in enumerate(activations):
                activationDuration = (activation.index[-1] -
                                      activation.index[0])
                start = activation.index[0] - activationDuration
                end = activation.index[-1] + activationDuration
                if start < mains.index[0] or end > mains.index[-1]:
                    activationsToRemove.append(i)
                else:
                    mainsForAct = mains[start:end]
                    if not self._hasSufficientSamples(start, end,
                                                      mainsForAct):
                        activationsToRemove.append(i)
            activationsToRemove.reverse()
            for i in activationsToRemove:
                activations.pop(i)
            self.activationsApp[
                self.targetApplication][building] = activations
        except KeyError as exception:
            continue

    self.sectionsWithNoTarget = {}
    for building in self.allBuildings:
        try:
            activationsTarget = self.activationsApp[
                self.targetApplication][building]
            mainGoodSections = self.goodSections[building]
            mains = self.elecMains[building]
            gapsBetweenActivations = TimeFrameGroup()
            prev = mains.index[0]
            for activation in activationsTarget:
                try:
                    p2 = prev
                    gapsBetweenActivations.append(
                        TimeFrame(prev, activation.index[0]))
                    prev = activation.index[-1]
                    p1 = activation.index[0]
                except ValueError:
                    logger.debug("----------------------")
                    logger.debug(p1)
                    logger.debug(p2)
                    logger.debug(activation.index[0])
                    logger.debug(activation.index[-1])
            gapsBetweenActivations.append(TimeFrame(prev, mains.index[-1]))
            intersection = gapsBetweenActivations.intersection(
                mainGoodSections)
            intersection = intersection.remove_shorter_than(
                6 * self.window)
            self.sectionsWithNoTarget[building] = intersection
        except KeyError:
            continue
def load_nilmtk_activations(dataset_paths, target_appliance_name,
                            appliance_names, on_power_threshold,
                            min_on_duration, min_off_duration,
                            sample_period, windows, sanity_check=1):
    """
    Parameters
    ----------
    windows : dict
        Structure example:
        {
            'UKDALE': {
                'train': {<building_i>: <window>},
                'unseen_activations_of_seen_appliances': {
                    <building_i>: <window>},
                'unseen_appliances': {<building_i>: <window>}
            }
        }

    Returns
    -------
    all_activations : dict
        Structure example:
        {<train | unseen_appliances | unseen_activations_of_seen_appliances>: {
             <appliance>: {
                 <building_name>: [<activations>]
        }}}
        Each activation is a pd.Series with DatetimeIndex and the
        following metadata attributes: building, appliance, fold.
    """
    logger.info("Loading NILMTK activations...")

    if sanity_check:
        # Sanity check
        for dataset in windows:
            check_windows(windows[dataset])

    all_activations = {}
    for dataset_name, folds in windows.items():
        # Load dataset
        dataset = nilmtk.DataSet(dataset_paths[dataset_name])
        appliance_aliases = appliance_names[dataset_name][
            target_appliance_name]

        for fold, buildings_and_windows in folds.items():
            logger.info(
                "Loading activations for fold {}.....".format(fold))
            for building_i, windows_for_building in \
                    buildings_and_windows.items():
                #dataset.set_window(*window)
                elec = dataset.buildings[building_i].elec
                building_name = (
                    dataset.metadata['name'] +
                    '_building_{}'.format(building_i))

                appliance_meters = []
                for meter in elec.meters:
                    if meter.is_site_meter():
                        continue
                    append_meter = False
                    for a in meter.appliances:
                        if a.type['type'] in appliance_aliases:
                            append_meter = True
                    if append_meter:
                        appliance_meters.append(meter)
                        print(meter.appliances)

                if not appliance_meters:
                    logger.info("No {} found in {}".format(
                        target_appliance_name, building_name))
                    continue
                #if appliance_meters:
                if len(appliance_meters) > 1:
                    meter = nilmtk.MeterGroup(meters=appliance_meters)
                else:
                    meter = appliance_meters[0]
                logger.info("Loading {} for {}...".format(
                    target_appliance_name, building_name))

                meter_activations = []
                for window in windows_for_building:
                    if dataset_name == "ECO":
                        dataset.store.window = TimeFrame(start=window[0],
                                                         end=window[1],
                                                         tz='GMT')
                    else:
                        dataset.set_window(*window)  # does not work for ECO

                    # Get activations_for_fold and process them
                    meter_activations_for_building = meter.get_activations(
                        sample_period=sample_period,
                        min_off_duration=min_off_duration,
                        min_on_duration=min_on_duration,
                        on_power_threshold=on_power_threshold,
                        resample_kwargs={'fill_method': 'ffill',
                                         'how': 'mean', 'limit': 20})
                    #meter_activations_for_building = [
                    #    activation.astype(np.float32)
                    #    for activation in meter_activations_for_building]
                    meter_activations.extend(meter_activations_for_building)

                # Attach metadata
                #for activation in meter_activations:
                #    activation._metadata = copy(activation._metadata)
                #    activation._metadata.extend(
                #        ["building", "appliance", "fold"])
                #    activation.building = building_name
                #    activation.appliance = appliance
                #    activation.fold = fold

                # Save
                if meter_activations:
                    all_activations.setdefault(
                        fold, {}).setdefault(
                        target_appliance_name, {})[building_name] = \
                        meter_activations
                logger.info("Loaded {} {} activations from {}.".format(
                    len(meter_activations), target_appliance_name,
                    building_name))
        dataset.store.close()

    logger.info("Done loading NILMTK activations.")
    return all_activations
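# --- Illustrative sketch (not part of the original source) ---------------
# The shape of the `windows` argument the loader above iterates over.
# Note that each building maps to a *list* of (start, end) windows;
# dataset names, building numbers, and dates are placeholders.
example_windows = {
    'UKDALE': {
        'train': {1: [("2013-04-01", "2013-10-01")]},
        'unseen_activations_of_seen_appliances': {
            1: [("2013-10-01", "2013-11-01")]},
        'unseen_appliances': {2: [("2013-06-01", "2013-07-01")]},
    }
}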
def load_data_from_nilmtk_datasets(windows, dataset_paths, appliances,
                                   target_appliance_name, sample_period):
    data = {}
    data_good_sections = {}
    logger.info("Loading NILMTK data...")
    for dataset_name, folds in windows.items():
        # Load dataset
        dataset = nilmtk.DataSet(dataset_paths[dataset_name])
        for fold, buildings_and_windows in folds.items():
            for building_i, windows_for_building in \
                    buildings_and_windows.items():
                dataset.set_window(None, None)
                elec = dataset.buildings[building_i].elec
                building_name = (
                    dataset.metadata['name'] +
                    '_building_{}'.format(building_i))
                logger.info("Loading data for {}...".format(building_name))

                mains_meter = elec.mains()
                good_sections = get_effective_good_sections(mains_meter)

                appliance_aliases = appliances[dataset_name][
                    target_appliance_name]
                appliance_meters = []
                for meter in elec.meters:
                    if meter.is_site_meter():
                        continue
                    if len(meter.appliances) == 1:
                        appliancetype = meter.appliances[0].type['type']
                        if appliancetype in appliance_aliases:
                            appliance_meters.append(meter)
                    else:
                        append_meter = False
                        for a in meter.appliances:
                            if a.type['type'] in appliance_aliases:
                                append_meter = True
                        if append_meter:
                            appliance_meters.append(meter)
                            print(meter.appliances)

                if not appliance_meters:
                    logger.info("No {} found in {}".format(
                        target_appliance_name, building_name))
                    continue
                if len(appliance_meters) > 1:
                    appliance_metergroup = nilmtk.MeterGroup(
                        meters=appliance_meters)
                else:
                    appliance_metergroup = appliance_meters[0]

                data_good_sections.setdefault(fold, {})[building_name] = \
                    good_sections

                def load_data(meter):
                    df = meter.power_series_all_data(
                        sample_period=sample_period)
                    if df is not None:
                        return df.astype(np.float32).dropna()
                    else:
                        return None

                dfs = []
                for window in windows_for_building:
                    if dataset_name == "ECO":
                        dataset.store.window = TimeFrame(start=window[0],
                                                         end=window[1],
                                                         tz='GMT')
                    else:
                        if window is None:
                            # Something has gone wrong... see what happened!
                            ipdb.set_trace()
                        dataset.set_window(*window)  # does not work for ECO
                    #ipdb.set_trace()
                    mains_data = load_data(mains_meter)
                    appliance_data = load_data(appliance_metergroup)
                    if (mains_data is None) or (appliance_data is None):
                        continue
                    df = pd.DataFrame(
                        {'mains': mains_data, 'target': appliance_data},
                        dtype=np.float32).dropna()
                    del mains_data
                    del appliance_data
                    if not df.empty:
                        dfs.append(df)
                df = pd.concat(dfs, axis=0)

                dfs = []
                for gs in good_sections:
                    dfslice = gs.slice(df)
                    if not dfslice.empty:
                        dfs.append(dfslice)
                df = pd.concat(dfs, axis=0)

                if not df.empty:
                    data.setdefault(fold, {})[building_name] = df
                    logger.info(
                        "Loaded data from building {} for fold {}"
                        " from {} to {}.".format(
                            building_name, fold,
                            df.index[0], df.index[-1]))
        dataset.store.close()
    logger.info("Done loading NILMTK data.")
    return data, data_good_sections
def test_intersection(self):
    tf = TimeFrame("2012-01-01 00:00:00", "2013-01-01 00:00:00")
    self.assertFalse(tf.empty)

    new_tf = tf.intersection(tf)
    self.assertEqual(tf, new_tf)
    self.assertFalse(new_tf.empty)

    new_tf = tf.intersection(TimeFrame())
    self.assertEqual(tf, new_tf)
    self.assertFalse(new_tf.empty)

    new_tf = tf.intersection(TimeFrame(start="1990-01-01"))
    self.assertEqual(tf, new_tf)
    self.assertFalse(new_tf.empty)

    new_tf = tf.intersection(TimeFrame(end="2100-01-01"))
    self.assertEqual(tf, new_tf)
    self.assertFalse(new_tf.empty)

    small_tf = TimeFrame("2012-01-05 00:00:00", "2012-01-06 00:00:00")
    new_tf = tf.intersection(small_tf)
    self.assertEqual(small_tf, new_tf)
    self.assertFalse(new_tf.empty)

    large_tf = TimeFrame("2010-01-01 00:00:00", "2014-01-01 00:00:00")
    new_tf = tf.intersection(large_tf)
    self.assertEqual(tf, new_tf)
    self.assertFalse(new_tf.empty)

    disjoint = TimeFrame("2015-01-01", "2016-01-01")
    new_tf = tf.intersection(disjoint)
    self.assertTrue(new_tf.empty)

    # try intersecting with empty TimeFrame
    new_tf = tf.intersection(new_tf)
    self.assertTrue(new_tf.empty)

    disjoint = TimeFrame("2015-01-01", "2016-01-01")
    tf.enabled = False
    new_tf = tf.intersection(disjoint)
    self.assertEqual(new_tf, disjoint)
    self.assertFalse(new_tf.empty)
    tf.enabled = True

    # crop into the start of tf
    new_start = "2012-01-05 04:05:06"
    new_tf = tf.intersection(TimeFrame(start=new_start, end="2014-01-01"))
    self.assertEqual(new_tf, TimeFrame(start=new_start, end=tf.end))
    self.assertFalse(new_tf.empty)

    # crop into the end of tf
    new_end = "2012-01-07 04:05:06"
    new_tf = tf.intersection(TimeFrame(start="2011-01-01", end=new_end))
    self.assertEqual(new_tf, TimeFrame(start=tf.start, end=new_end))
    self.assertFalse(new_tf.empty)
def load(self, key, columns=None, sections=None, n_look_ahead_rows=0,
         chunksize=MAX_MEM_ALLOWANCE_IN_BYTES, verbose=False):
    # TODO: calculate chunksize default based on physical
    # memory installed and number of columns

    # Make sure key has a slash at the front but not at the end.
    if key[0] != '/':
        key = '/' + key
    if len(key) > 1 and key[-1] == '/':
        key = key[:-1]

    # Make sure chunksize is an int otherwise `range` complains later.
    chunksize = np.int64(chunksize)

    # Set `sections` variable
    sections = [TimeFrame()] if sections is None else sections
    sections = TimeFrameGroup(sections)

    # Replace any Nones with '' in columns:
    if columns is not None:
        columns = [('' if pq is None else pq, '' if ac is None else ac)
                   for pq, ac in columns]

    if verbose:
        print("HDFDataStore.load(key='{}', columns='{}', sections='{}',"
              " n_look_ahead_rows='{}', chunksize='{}')".format(
                  key, columns, sections, n_look_ahead_rows, chunksize))

    self.all_sections_smaller_than_chunksize = True

    for section in sections:
        if verbose:
            print("   ", section)
        window_intersect = self.window.intersection(section)

        if window_intersect.empty:
            data = pd.DataFrame()
            data.timeframe = section
            yield data
            continue

        terms = window_intersect.query_terms('window_intersect')
        if terms is None:
            section_start_i = 0
            section_end_i = self.store.get_storer(key).nrows
            if section_end_i <= 1:
                data = pd.DataFrame()
                data.timeframe = section
                yield data
                continue
        else:
            try:
                coords = self.store.select_as_coordinates(key=key,
                                                          where=terms)
            except AttributeError as e:
                if str(e) == ("'NoneType' object has no attribute "
                              "'read_coordinates'"):
                    raise KeyError("key '{}' not found".format(key))
                else:
                    raise
            n_coords = len(coords)
            if n_coords == 0:
                data = pd.DataFrame()
                data.timeframe = window_intersect
                yield data
                continue
            section_start_i = coords[0]
            section_end_i = coords[-1]
            del coords

        slice_starts = range(section_start_i, section_end_i, chunksize)
        n_chunks = int(np.ceil((section_end_i - section_start_i) /
                               chunksize))
        if n_chunks > 1:
            self.all_sections_smaller_than_chunksize = False

        for chunk_i, chunk_start_i in enumerate(slice_starts):
            chunk_end_i = chunk_start_i + chunksize
            there_are_more_subchunks = (chunk_i < n_chunks - 1)

            if chunk_end_i > section_end_i:
                chunk_end_i = section_end_i
            chunk_end_i += 1

            data = self.store.select(key=key, columns=columns,
                                     start=chunk_start_i, stop=chunk_end_i)

            # if len(data) <= 2:
            #     yield pd.DataFrame()

            # Load look ahead if necessary
            if n_look_ahead_rows > 0:
                if len(data.index) > 0:
                    look_ahead_start_i = chunk_end_i
                    look_ahead_end_i = (look_ahead_start_i +
                                        n_look_ahead_rows)
                    try:
                        look_ahead = self.store.select(
                            key=key, columns=columns,
                            start=look_ahead_start_i,
                            stop=look_ahead_end_i)
                    except ValueError:
                        look_ahead = pd.DataFrame()
                else:
                    look_ahead = pd.DataFrame()

                with warnings.catch_warnings():
                    # Silence "Pandas doesn't allow columns to be created
                    # via a new attribute name" since we're not adding a
                    # column.
                    warnings.filterwarnings(
                        'ignore', category=UserWarning,
                        message=".*Pandas doesn't allow columns.*")
                    setattr(data, 'look_ahead', look_ahead)

            data.timeframe = _timeframe_for_chunk(there_are_more_subchunks,
                                                  chunk_i, window_intersect,
                                                  data.index)
            yield data
            del data
def test_adjacent(self):
    # touching (tf1 ends exactly where tf2 starts)
    tf1 = TimeFrame("2011-01-01 00:00:00", "2011-02-01 00:00:00")
    tf2 = TimeFrame("2011-02-01 00:00:00", "2011-03-01 00:00:00")
    self.assertTrue(tf1.adjacent(tf2))
    self.assertTrue(tf2.adjacent(tf1))

    # not touching
    tf1 = TimeFrame("2011-01-01 00:00:00", "2011-02-01 00:00:00")
    tf2 = TimeFrame("2011-02-01 00:00:01", "2011-03-01 00:00:00")
    self.assertFalse(tf1.adjacent(tf2))
    self.assertFalse(tf2.adjacent(tf1))

    # not touching but gap specified
    tf1 = TimeFrame("2011-01-01 00:00:00", "2011-02-01 00:00:00")
    tf2 = TimeFrame("2011-02-01 00:00:01", "2011-03-01 00:00:00")
    self.assertTrue(tf1.adjacent(tf2, gap=1))
    self.assertTrue(tf2.adjacent(tf1, gap=1))
    self.assertTrue(tf1.adjacent(tf2, gap=100))
    self.assertTrue(tf2.adjacent(tf1, gap=100))
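# --- Illustrative sketch (not part of the original source) ---------------
# Adjacency tolerates a hole of up to `gap` seconds, mirroring the test
# above.
def _demo_adjacent():
    tf1 = TimeFrame("2011-01-01", "2011-02-01")
    tf2 = TimeFrame("2011-02-01 00:00:01", "2011-03-01")
    assert not tf1.adjacent(tf2)     # one-second hole
    assert tf1.adjacent(tf2, gap=1)  # tolerated when gap >= 1 second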