def finalize(self): """ Merges together any nonzero sections which span multiple segments. Whether there are gaps in between does not matter. Returns ------- sections : TimeFrameGroup (a subclass of Python's list class) """ # Merge the results of all chunks starts = [] ends = [] for index, row in self._data.iterrows(): starts.append(row['sections']['start']) ends.append(row['sections']['end']) if len(starts) == 0 == len(ends): self._data = TimeFrameGroup() return starts = pd.concat(starts) ends = pd.concat(ends) rate = pd.Timedelta(seconds=self.max_sample_rate) self._data = TimeFrameGroup(starts_and_ends={ 'starts': starts, 'ends': ends }) #.merge_shorter_gaps_than(rate) TODO: Merge needed?
def import_from_cache(self, cached_stat, sections):
    '''Restores the statistic from the cache of the nilmtk.DataStore.

    Note
    ----
    I do not know whether this is still an issue: this is where the problem
    with reading statistics arises. They are stored chunk-wise, but this code
    trusts that the whole section is present in the cache as a single piece.
    '''
    self._data = TimeFrameGroup(cached_stat)
def finalize(self): """Merges together any good sections which span multiple segments, as long as those segments are adjacent (previous.end - max_sample_period <= next.start <= previous.end). This may happen if we merge cached sections and noncached sections. Returns ------- sections : TimeFrameGroup (a subclass of Python's list class) """ sections = TimeFrameGroup() end_date_of_prev_row = None for index, row in self._data.iterrows(): row_sections = row['sections'] # Check if first TimeFrame of row_sections needs to be merged with # last TimeFrame of previous section if (end_date_of_prev_row is not None): rows_are_adjacent = ( (end_date_of_prev_row - self.max_sample_period_td) <= index <= end_date_of_prev_row) if rows_are_adjacent and row_sections[0].start is None: assert sections[-1].end is None sections._df.iloc[-1, 1] = row_sections[ 0].end # HIER MUSSTE ICH AUFPASSEN, DASS ICH UEBERSCHREIBE! row_sections.pop(0) else: # row_sections[0] and sections[-1] were not in adjacent chunks # so check if they are both open-ended and close them... if sections and sections[-1].end is None: try: sections[-1].end = end_date_of_prev_row except ValueError: # end_date_of_prev_row before sections[-1].start pass if row_sections and row_sections[0].start is None: try: row_sections[0].start = index except ValueError: pass end_date_of_prev_row = row['end'] sections.extend(row_sections) if sections and sections.count() > 0: sections[-1].include_end = True if sections[-1].end is None: sections[ -1, 1] = end_date_of_prev_row # HIER MUSSTE ICH AUFPASSEN, DASS ICH UEBERSCHREIBE! sections._df.reset_index(drop=True, inplace=True) self._data = sections
def combined(self): """Merges together any good sections which span multiple segments, as long as those segments are adjacent (previous.end - max_sample_period <= next.start <= previous.end). Returns ------- sections : TimeFrameGroup (a subclass of Python's list class) """ sections = TimeFrameGroup() end_date_of_prev_row = None for index, row in self._data.iterrows(): row_sections = row['sections'] # Check if first TimeFrame of row_sections needs to be merged with # last TimeFrame of previous section if (end_date_of_prev_row is not None): rows_are_adjacent = ( (end_date_of_prev_row - self.max_sample_period_td) <= index <= end_date_of_prev_row) if rows_are_adjacent and row_sections[0].start is None: assert sections[-1].end is None sections[-1].end = row_sections[0].end row_sections.pop(0) else: # row_sections[0] and sections[-1] were not in adjacent chunks # so check if they are both open-ended and close them... if sections and sections[-1].end is None: try: sections[-1].end = end_date_of_prev_row except ValueError: # end_date_of_prev_row before sections[-1].start pass if row_sections and row_sections[0].start is None: try: row_sections[0].start = index except ValueError: pass end_date_of_prev_row = row['end'] sections.extend(row_sections) if sections: sections[-1].include_end = True if sections[-1].end is None: sections[-1].end = end_date_of_prev_row return sections
def load(self, key, columns=None, sections=None, n_look_ahead_rows=0, chunksize=MAX_MEM_ALLOWANCE_IN_BYTES): file_path = self._key_to_abs_path(key) # Set `sections` variable sections = [TimeFrame()] if sections is None else sections sections = TimeFrameGroup(sections) self.all_sections_smaller_than_chunksize = True # iterate through parameter sections # requires 1 pass through file for each section for section in sections: window_intersect = self.window.intersection(section) header_rows = [0,1] text_file_reader = pd.read_csv(file_path, index_col=0, header=header_rows, parse_dates=True, chunksize=chunksize) # iterate through all chunks in file for chunk_idx, chunk in enumerate(text_file_reader): # filter dataframe by specified columns if columns: chunk = chunk[columns] # mask chunk by window and section intersect subchunk_idx = [True]*len(chunk) if window_intersect.start: subchunk_idx = np.logical_and(subchunk_idx, (chunk.index>=window_intersect.start)) if window_intersect.end: subchunk_idx = np.logical_and(subchunk_idx, (chunk.index<window_intersect.end)) if window_intersect.empty: subchunk_idx = [False]*len(chunk) subchunk = chunk[subchunk_idx] if len(subchunk)>0: subchunk_end = np.max(np.nonzero(subchunk_idx)) subchunk.timeframe = TimeFrame(subchunk.index[0], subchunk.index[-1]) # Load look ahead if necessary if n_look_ahead_rows > 0: if len(subchunk.index) > 0: rows_to_skip = (len(header_rows)+1)+(chunk_idx*chunksize)+subchunk_end+1 try: subchunk.look_ahead = pd.read_csv(file_path, index_col=0, header=None, parse_dates=True, skiprows=rows_to_skip, nrows=n_look_ahead_rows) except ValueError: subchunk.look_ahead = pd.DataFrame() else: subchunk.look_ahead = pd.DataFrame() yield subchunk
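# Minimal sketch of the window/section masking used in the CSV loader above:
# a chunk is filtered to the half-open interval [window.start, window.end).
# The frame and the window bounds below are made up.
import numpy as np
import pandas as pd

idx = pd.date_range('2014-01-01', periods=6, freq='6S')
chunk = pd.DataFrame({'power': [0, 10, 20, 30, 40, 50]}, index=idx)
window_start, window_end = idx[1], idx[4]

mask = np.logical_and(chunk.index >= window_start, chunk.index < window_end)
subchunk = chunk[mask]
print(subchunk)  # rows at idx[1], idx[2] and idx[3] only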
def _find_sections_with_no_target(self): """Finds the intersections of the mains good sections with the gaps between target appliance activations. """ self.sections_with_no_target = {} seq_length_secs = self.seq_length * self.sample_period for fold, sects_per_building in self.mains_good_sections.items(): for building, good_sections in sects_per_building.items(): activations = ( self.activations[fold][self.target_appliance][building]) mains = self.mains[fold][building] mains_good_sections = self.mains_good_sections[fold][building] gaps_between_activations = TimeFrameGroup() prev_end = mains.index[0] for activation in activations: gap = TimeFrame(prev_end, activation.index[0]) gaps_between_activations.append(gap) prev_end = activation.index[-1] gap = TimeFrame(prev_end, mains.index[-1]) gaps_between_activations.append(gap) intersection = ( gaps_between_activations.intersection(mains_good_sections)) intersection = intersection.remove_shorter_than( seq_length_secs) self.sections_with_no_target.setdefault( fold, {})[building] = (intersection) logger.info( "Found {} sections without target for {} {}.".format( len(intersection), fold, building))
def _find_sections_with_no_target(self): """Finds the intersections of the mains good sections with the gaps between target appliance activations. """ self.sections_with_no_target = {} seq_length_secs = self.seq_length * self.sample_period for fold, sects_per_building in self.data_good_sections.items(): for building, good_sections in sects_per_building.items(): if building not in self.all_activations[fold][self.target_appliance]: continue activations = ( self.all_activations[fold][self.target_appliance][building]) data = self.data[fold][building] data_good_sections = good_sections gaps_between_activations = TimeFrameGroup() prev_end = data.index[0] for activation in activations: activation_start = activation.start if prev_end < activation_start: gap = TimeFrame(prev_end, activation_start) gaps_between_activations.append(gap) prev_end = activation.end data_end = data.index[-1] if prev_end < data_end: gap = TimeFrame(prev_end, data_end) gaps_between_activations.append(gap) intersection = ( gaps_between_activations.intersection(data_good_sections)) intersection = intersection.remove_shorter_than( seq_length_secs) self.sections_with_no_target.setdefault(fold, {})[building] = ( intersection) logger.info("Found {} sections without target for {} {}." .format(len(intersection), fold, building))
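# Sketch of the gap construction in `_find_sections_with_no_target` above,
# using plain (start, end) tuples in place of nilmtk.TimeFrame. The data range
# and the activation boundaries are made up.
import pandas as pd

data_start = pd.Timestamp('2014-01-01 00:00')
data_end = pd.Timestamp('2014-01-01 12:00')
activations = [(pd.Timestamp('2014-01-01 02:00'), pd.Timestamp('2014-01-01 03:00')),
               (pd.Timestamp('2014-01-01 07:00'), pd.Timestamp('2014-01-01 08:00'))]

gaps = []
prev_end = data_start
for act_start, act_end in activations:
    if prev_end < act_start:
        gaps.append((prev_end, act_start))
    prev_end = act_end
if prev_end < data_end:
    gaps.append((prev_end, data_end))
print(gaps)  # three gaps: before, between and after the activations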
def append(self, timeframe, new_results): """Append a single result. Parameters ---------- timeframe : nilmtk.TimeFrame new_results : {'sections': list of TimeFrame objects} """ new_results['sections'] = [TimeFrameGroup(new_results['sections'][0])] super(AboveFreqSectionsResults, self).append(timeframe, new_results)
def _delete_phony_sections(self): filtered_data = {} for fold, data_per_building in self.data.items(): for building, data in data_per_building.items(): if building not in self.phony_active_timeframes[fold][self.target_appliance]: continue activations = ( self.phony_active_timeframes[fold][self.target_appliance][building]) data_between_phony_activations = TimeFrameGroup() prev_end = data.index[0] for activation in activations: activation_start = activation.start if prev_end < activation_start: gap = TimeFrame(prev_end, activation_start) data_between_phony_activations.append(gap) prev_end = activation.end data_end = data.index[-1] + pd.Timedelta(seconds=self.sample_period) if prev_end < data_end: gap = TimeFrame(prev_end, data_end) data_between_phony_activations.append(gap) dfs = [] for section in data_between_phony_activations: dfs.append(section.slice(data)) data = pd.concat(dfs) filtered_data.setdefault(fold, {})[building] = ( data) logger.info("Found {} good sections for {} {}." .format(len(data_between_phony_activations), fold, building)) self.data = filtered_data
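# Minimal sketch of the filtering step in `_delete_phony_sections` above: keep
# only the samples inside the given sections and stitch them back together.
# Note that label slicing in pandas is inclusive on both ends, whereas
# nilmtk's TimeFrame.slice is half-open; the data below is made up.
import pandas as pd

idx = pd.date_range('2014-01-01', periods=10, freq='6S')
data = pd.Series(range(10), index=idx)
keep_sections = [(idx[0], idx[3]), (idx[6], idx[9])]

kept = pd.concat([data[start:end] for start, end in keep_sections])
print(kept)  # samples 0-3 and 6-9 survive the filter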
def _find_sections_with_no_target(self): """Finds the intersections of the mains good sections with the gaps between target appliance activations. """ self.sections_with_no_target = {} seq_length_secs = self.seq_length * self.sample_period for fold, sects_per_building in self.mains_good_sections.iteritems(): for building, good_sections in sects_per_building.iteritems(): activations = ( self.activations[fold][self.target_appliance][building]) mains = self.mains[fold][building] mains_good_sections = self.mains_good_sections[fold][building] gaps_between_activations = TimeFrameGroup() prev_end = mains.index[0] for activation in activations: gap = TimeFrame(prev_end, activation.index[0]) gaps_between_activations.append(gap) prev_end = activation.index[-1] gap = TimeFrame(prev_end, mains.index[-1]) gaps_between_activations.append(gap) intersection = ( gaps_between_activations.intersection(mains_good_sections)) intersection = intersection.remove_shorter_than( seq_length_secs) self.sections_with_no_target.setdefault(fold, {})[building] = ( intersection) logger.info("Found {} sections without target for {} {}." .format(len(intersection), fold, building))
def _load_mains_into_memory(self): logger.info("Loading NILMTK mains...") # Load dataset #dataset = nilmtk.DataSet(self.filename) self.mains = {} self.mains_good_sections = {} for fold, buildings_and_windows in self.windows.items(): for building_i, window in buildings_and_windows.items(): dataset=load_mains_dataset(int(building_i)) dataset=set_window(dataset, window[0], window[1]) #elec = dataset.buildings[building_i].elec building_name = ( 'REDD' + '_building_{}'.format(building_i)) logger.info( "Loading mains for {}...".format(building_name)) #mains_meter = elec.mains() good_sections_interval = find_good_sections(dataset,30) good_sections_interval = TimeFrameGroup(good_sections_interval) #meter = dataset resample_kwargs={} resample_kwargs['rule'] = '{:d}S'.format(self.sample_period) dataset=dataset['power']['apparent'] dataset=safe_resample(dataset,resample_kwargs) dataset=dataset.agg(np.mean) dataset=dataset.interpolate() mains_data = power_series_all_data(dataset, good_sections_interval).dropna() print(mains_data.index[0]) print(fold) print(building_i) def set_mains_data(dictionary, data): dictionary.setdefault(fold, {})[building_name] = data if not mains_data.empty: set_mains_data(self.mains, mains_data) set_mains_data(self.mains_good_sections, good_sections_interval) logger.info( "Loaded mains data from building {} for fold {}" " from {} to {}." .format(building_name, fold, mains_data.index[0], mains_data.index[-1])) #dataset.store.close() logger.info("Done loading NILMTK mains data.")
def __init__(self, **config): if 'filename' not in config.keys(): self.dataSet = nilmtk.DataSet("ukdale.h5") else: self.dataSet = nilmtk.DataSet(config['fileName']) if 'startTime' not in config.keys() or 'endTime' not in config.keys(): self.dataSet.set_window("2012-11-01", "2015-01-31") else: self.dataSet.set_window(config['startTime'], config['endTime']) if 'trainBuildings' not in config.keys(): self.trainBuildings = [1, 3, 4, 5] else: self.trainBuildings = config['trainBuildings'] if 'testBuildings' not in config.keys(): self.testBuildings = [2] else: self.testBuildings = config['testBuildings'] if 'applications' not in config.keys(): raise KeyError("please input applications") self.applications = config['applications'] if 'targetapplication' not in config.keys(): raise KeyError("please input targetapplication") self.targetApplication = config['targetapplication'] if 'randSeed' not in config.keys(): randSeed = 0 else: randSeed = config['randSeed'] self.otherApplications = [ i for i in self.applications if i not in [self.targetApplication] ] self.allBuildings = set(self.trainBuildings + self.testBuildings) self.window = 599 self.inputSeqs = [] self.targetSeqs = [] self.rng = np.random.RandomState(randSeed) activationConfig = { 'fridge': { 'min_off_duration': 18, # 12 in paper here 'min_on_duration': 60, 'on_power_threshold': 50, 'sample_period': 6, }, 'kettle': { 'min_off_duration': 18, # 0 in paper here 'min_on_duration': 12, 'on_power_threshold': 2000, 'sample_period': 6, }, 'washing machine': { 'min_off_duration': 160, 'min_on_duration': 1800, 'on_power_threshold': 20, 'sample_period': 6, }, 'microwave': { 'min_off_duration': 30, 'min_on_duration': 12, 'on_power_threshold': 200, 'sample_period': 6, }, 'dish washer': { 'min_off_duration': 1800, 'min_on_duration': 1800, 'on_power_threshold': 10, 'sample_period': 6, } } self.elecMains = {} self.goodSections = {} for building in self.allBuildings: self.goodSections[building] = self.dataSet.buildings[ building].elec.mains().good_sections() self.elecMains[building] = self.dataSet.buildings[ building].elec.mains().power_series_all_data( sample_period=6, sections=self.goodSections[building]).dropna() self.numApp = {} self.elecApp = {} self.activationsApp = {} self.activationAppSections = {} for app in self.applications: self.elecApp[app] = {} self.activationsApp[app] = {} self.numApp[app] = 0 self.activationAppSections[app] = {} for building in self.allBuildings: try: self.elecApp[app][building] = self.dataSet.buildings[ building].elec[app].power_series_all_data( sample_period=6).dropna() self.activationsApp[app][ building] = self.dataSet.buildings[building].elec[ app].get_activations(**activationConfig[app]) self.activationsApp[app][building] = [ activation.astype(np.float32) for activation in self.activationsApp[app][building] ] self.numApp[app] += len(self.activationsApp[app][building]) self.activationAppSections[app][building] = TimeFrameGroup( ) for activation in self.activationsApp[app][building]: self.activationAppSections[app][building].append( TimeFrame(activation.index[0], activation.index[-1])) except KeyError as exception: logger.info( str(building) + " has no " + app + ". 
Full exception: {}".format(exception)) continue logger.info("Done loading NILMTK data.") for building in self.allBuildings: activationsToRemove = [] try: activations = self.activationsApp[ self.targetApplication][building] mains = self.elecMains[building] for i, activation in enumerate(activations): activationDuration = (activation.index[-1] - activation.index[0]) start = (activation.index[0] - activationDuration) end = (activation.index[-1] + activationDuration) if start < mains.index[0] or end > mains.index[-1]: activationsToRemove.append(i) else: mainsForAct = mains[start:end] if not self._hasSufficientSamples( start, end, mainsForAct): activationsToRemove.append(i) activationsToRemove.reverse() for i in activationsToRemove: activations.pop(i) self.activationsApp[ self.targetApplication][building] = activations except KeyError as exception: continue self.sectionsWithNoTarget = {} for building in self.allBuildings: try: activationsTarget = self.activationsApp[ self.targetApplication][building] mainGoodSections = self.goodSections[building] mains = self.elecMains[building] gapsBetweenActivations = TimeFrameGroup() prev = mains.index[0] for activation in activationsTarget: try: p2 = prev gapsBetweenActivations.append( TimeFrame(prev, activation.index[0])) prev = activation.index[-1] p1 = activation.index[0] except ValueError: logger.debug("----------------------") logger.debug(p1) logger.debug(p2) logger.debug(activation.index[0]) logger.debug(activation.index[-1]) gapsBetweenActivations.append(TimeFrame(prev, mains.index[-1])) intersection = gapsBetweenActivations.intersection( mainGoodSections) intersection = intersection.remove_shorter_than(6 * self.window) self.sectionsWithNoTarget[building] = intersection except KeyError: continue
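# Hedged sketch of the activation-padding check performed above: an activation
# is only kept if a window of one activation-length on each side still lies
# inside the mains series. All timestamps are made up.
import pandas as pd

mains_start = pd.Timestamp('2014-01-01 00:00')
mains_end = pd.Timestamp('2014-01-02 00:00')
act_start = pd.Timestamp('2014-01-01 00:30')
act_end = pd.Timestamp('2014-01-01 01:00')

duration = act_end - act_start
keep = (act_start - duration >= mains_start) and (act_end + duration <= mains_end)
print(keep)  # True: the padded window [00:00, 01:30] fits inside the mains range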
def _get_stat_from_cache_or_compute(self, nodes, results_obj, loader_kwargs):
    """General function for computing statistics and/or loading them from cache.

    Cached statistics live in the DataStore at
    'building<I>/elec/cache/meter<K>/<statistic_name>'
    e.g. 'building1/elec/cache/meter1/total_energy'.

    We store the 'full' statistic, i.e. we store a representation of the
    `Results._data` DataFrame. Sometimes we need to do some conversion to
    store `Results._data` on disk. The logic for doing this conversion lives
    in the `Results` class or subclass. The cache can be cleared by calling
    `ElecMeter.clear_cache()`.

    When 'preprocessing' is set, the cache is not used because the cache is
    only valid for the version without preprocessing.

    Parameters
    ----------
    nodes : list of nilmtk.Node classes
    results_obj : instance of nilmtk.Results subclass
        This is the result instance which is afterwards filled with all the
        results coming from the different chunks.
    loader_kwargs : dict

    Returns
    -------
    if `full_results` is True then return nilmtk.Results subclass instance
    otherwise return nilmtk.Results.simple().

    See Also
    --------
    clear_cache
    _compute_stat
    key_for_cached_stat
    get_cached_stat
    """
    full_results = loader_kwargs.pop('full_results', False)
    verbose = loader_kwargs.get('verbose')
    if 'ac_type' in loader_kwargs or 'physical_quantity' in loader_kwargs:
        loader_kwargs = self._convert_physical_quantity_and_ac_type_to_cols(**loader_kwargs)
    columns = loader_kwargs.get('columns', [])
    ac_types = set([m[1] for m in columns if m[1]])
    results_obj_copy = deepcopy(results_obj)

    # Prepare `sections` list
    sections = loader_kwargs.get('sections')
    if sections is None:
        tf = self.get_timeframe()
        tf.include_end = True
        sections = [tf]
    sections = TimeFrameGroup(sections)  # ensure we have a NILMTK TimeFrameGroup
    sections = [s for s in sections if not s.empty]

    # Retrieve usable stats from cache
    key_for_cached_stat = self.key_for_cached_stat(results_obj.name)
    cached_stat = None
    if loader_kwargs.get('preprocessing') is None:
        cached_stat = self.get_cached_stat(key_for_cached_stat)
        #results_obj.import_from_cache(cached_stat, sections)  # Fill results_obj with cache

        #def find_sections_to_compute():
        #    # Get sections_to_compute
        #    results_obj_timeframes = results_obj.timeframes()
        #    sections_to_compute = set(sections) - set(results_obj_timeframes)
        #    t1 = TimeFrameGroup(sections)
        #    t2 = TimeFrameGroup(results_obj_timeframes)
        #    # This is the diff I newly built; but why does it not work any more right now?
        #    sections_to_compute = t1.diff(t2)
        #    sections_to_compute = sorted(sections_to_compute)
        #    return sections_to_compute
        #try:
        #    ac_type_keys = results_obj.keys()  # .simple().keys()
        #except:
        #    sections_to_compute = find_sections_to_compute()
        #else:
        #    if ac_types.issubset(ac_type_keys):
        #        # If ac_type is in the cache, only calculate the remaining sections
        #        sections_to_compute = find_sections_to_compute()
        #    else:
        #        # If the wrong ac_type is cached, we still have to compute everything
        #        sections_to_compute = sections
        #        results_obj = results_obj_copy
    #else:
    #    sections_to_compute = sections

    if verbose and cached_stat is not None:  # ._data.empty
        print("Using cached result.")

    # If necessary compute stats for missing sections
    if cached_stat is None:  # sections_to_compute
        # If we need everything either way, we don't need an expensive index lookup during load
        #if not self.get_timeframe() in sections_to_compute:
        #    loader_kwargs['sections'] = sections_to_compute
        #computed_result = self._compute_stat(nodes, loader_kwargs)

        # Merge newly computed stats into the main stat result.
        # It would be better to build this directly into the node so that its
        # result gets extended; one could then still take the result from the
        # cache and merge it in. The results of all node elements would be put
        # into the pipeline elements, so they can be pulled out at the end and
        # the whole computation can be done in one pass.
        # => That is in fact how it is done, just once per section.
        # => The only extension needed would be passing the Results through.
        #results_obj.update(computed_result.results)
        results_obj = self._compute_stat(nodes, loader_kwargs).results

        # For nonzero/over-basepower sections, exclude periods which are not good sections
        if results_obj.name == 'nonzero_sections' or results_obj.name == 'overbasepower_sections':
            good_sections = self.good_sections(**loader_kwargs)  # _data
            results_obj._data = results_obj._data.intersection(good_sections)

        # Save newly computed stats to disk
        stat_for_store = results_obj.export_to_cache()
        try:
            #self.store.remove(key_for_cached_stat)
            self.store.put(key_for_cached_stat, stat_for_store, fixed=True)
            # Temporary workaround to store the good sections for the other meters as well. TODO
            if results_obj.name == 'good_sections':
                for i in range(2, 4):
                    self.store.put(key_for_cached_stat.replace('meter1', 'meter' + str(i)),
                                   stat_for_store, fixed=True)
        except ValueError:
            # the old table probably had different columns
            self.store.remove(key_for_cached_stat)
            self.store.put(key_for_cached_stat, results_obj.export_to_cache())
    else:
        results_obj.import_from_cache(cached_stat, sections)  # Fill results_obj with cache

    # Return the correct value depending on options
    if full_results:
        return results_obj
    res = results_obj  # .simple()
    if ac_types:
        try:
            ac_type_keys = res.keys()
        except:
            return res
        else:
            if res.empty:
                return res
            else:
                return pd.Series(res[ac_types], index=ac_types)
    return res._data
class OverBasepowerSectionsResults(Results):
    """The result of the over-basepower sections statistic.

    Attributes
    ----------
    _data : pd.DataFrame
        index is start date for the whole chunk
        `end` is end date for the whole chunk
        `sections` is a TimeFrameGroup object (a list of nilmtk.TimeFrame objects)
    """

    name = "overbasepower_sections"

    def __init__(self, max_sample_rate):
        # Used to know when to combine
        self.max_sample_rate = max_sample_rate
        super(OverBasepowerSectionsResults, self).__init__()

    def append(self, timeframe, new_results):
        """Append a single result.

        Parameters
        ----------
        timeframe : nilmtk.TimeFrame
        new_results : {'sections': list of TimeFrame objects}
        """
        super(OverBasepowerSectionsResults, self).append(timeframe, new_results)

    def finalize(self):
        """Merges together any nonzero sections which span multiple segments.
        Whether there are gaps in between does not matter.

        Returns
        -------
        sections : TimeFrameGroup (a subclass of Python's list class)
        """
        # Merge the results of all chunks
        starts = []
        ends = []
        for index, row in self._data.iterrows():
            starts.append(row['sections']['start'])
            ends.append(row['sections']['end'])
        if len(starts) == 0 == len(ends):
            self._data = TimeFrameGroup()
            return
        starts = pd.concat(starts)
        ends = pd.concat(ends)
        rate = pd.Timedelta(seconds=self.max_sample_rate)
        self._data = TimeFrameGroup(starts_and_ends={'starts': starts,
                                                     'ends': ends})
        # .merge_shorter_gaps_than(rate)  TODO: Merge needed?

    def unify(self, other):
        raise Exception("Did not try this yet for the new nonzeroresults")
        super(OverBasepowerSectionsResults, self).unify(other)
        for start, row in self._data.iterrows():
            other_sections = other._data['sections'].loc[start]
            intersection = row['sections'].intersection(other_sections)
            self._data['sections'].loc[start] = intersection

    def to_dict(self):
        overbasepower_sections = self._data
        overbasepower_sections_list_of_dicts = [
            timeframe.to_dict() for timeframe in overbasepower_sections]
        return {'statistics': {
            'overbasepower_sections': overbasepower_sections_list_of_dicts}}

    def plot(self, **plot_kwargs):
        timeframes = self
        return timeframes.plot(**plot_kwargs)

    def import_from_cache(self, cached_stat, sections):
        '''Restores the statistic from the cache of the nilmtk.DataStore.

        Note
        ----
        I do not know whether this is still an issue: this is where the
        problem with reading statistics arises. They are stored chunk-wise,
        but this code trusts that the whole section is present in the cache
        as a single piece.
        '''
        self._data = TimeFrameGroup(cached_stat)

    def export_to_cache(self):
        """Returns the DataFrame to be written into cache.

        Returns
        -------
        df : pd.DataFrame
            With three columns: 'end', 'section_end', 'section_start'.
        """
        return self._data._df
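# Illustrative shape of the cached representation handled by `export_to_cache`
# / `import_from_cache` above. Column names follow the docstring; the
# timestamps are made up, and the TimeFrameGroup round-trip is fork-specific,
# so it is only indicated in a comment.
import pandas as pd

cached_stat = pd.DataFrame({
    'end':           pd.to_datetime(['2014-01-01 06:00', '2014-01-01 12:00']),
    'section_start': pd.to_datetime(['2014-01-01 00:00', '2014-01-01 08:00']),
    'section_end':   pd.to_datetime(['2014-01-01 06:00', '2014-01-01 12:00']),
})
# restored = TimeFrameGroup(cached_stat)   # as done in import_from_cache above
print(cached_stat)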
def import_from_cache(self, cached_stat, sections):
    '''Restores the statistic from the cached DataFrame representation.'''
    self._data = TimeFrameGroup(cached_stat)
def _save_metadata_for_disaggregation(self, output_datastore, sample_period,
                                      measurement, timeframes, building,
                                      meters=None, num_meters=None,
                                      supervised=True,
                                      original_building_meta=None,
                                      rest_powerflow_included=False):
    """Add metadata for disaggregated appliance estimates to datastore.

    REMINDER: Originally I wanted to do this differently and also store the
    metadata along with the data. For lack of time I left it and do it the
    way it was done before.

    This function first checks whether there is already meta-information in
    the file. If yes, it extends it; otherwise it creates it anew.

    Note that `self.MODEL_NAME` needs to be set to a string before calling
    this method. For example, we use `self.MODEL_NAME = 'CO'` for
    Combinatorial Optimisation.

    TODO: `preprocessing_applied` for all meters
    TODO: submeter measurement should probably be the mains measurement we
    used to train on, not the mains measurement.

    Parameters
    ----------
    output_datastore : nilmtk.DataStore subclass object
        The datastore to write metadata into.
    sample_period : int
        The sample period, in seconds, used for both the mains and the
        disaggregated appliance estimates.
    measurement : 2-tuple of strings
        In the form (<physical_quantity>, <type>) e.g. ("power", "active")
    timeframes : list of nilmtk.TimeFrames or nilmtk.TimeFrameGroup
        The TimeFrames over which this data is valid for.
    building : int
        The building instance number (starting from 1)
    supervised : bool, defaults to True
        Is this a supervised NILM algorithm?
    meters : list of nilmtk.ElecMeters, optional
        Required if `supervised=True`
    num_meters : [int]
        Required if `supervised=False`. Gives the number of meters for each
        phase.
    """

    # DataSet and MeterDevice metadata only when not already available
    try:
        metadata = output_datastore.load_metadata()
        timeframes.append(TimeFrame(start=metadata["timeframe"]["start"],
                                    end=metadata["timeframe"]["end"]))
        total_timeframe = TimeFrameGroup(timeframes).get_timeframe()
        dataset_metadata = {
            'name': metadata["name"],
            'date': metadata["date"],
            'meter_devices': metadata["meter_devices"],
            'timeframe': total_timeframe.to_dict()
        }
        output_datastore.save_metadata('/', dataset_metadata)
    except:
        pq = 3
        meter_devices = {
            'disaggregate': {
                'model': type(self),  # self.model.MODEL_NAME,
                # A sample_period of 0 makes it possible to use special load functionality
                'sample_period': sample_period if rest_powerflow_included else 0,
                'max_sample_period': sample_period,
                'measurements': [{
                    'physical_quantity': 'power',  # measurement.levels[0][0],
                    'type': 'active'  # measurement.levels[1][0]
                }]
            }
        }
        if rest_powerflow_included:
            meter_devices['rest'] = {
                'model': 'rest',
                'sample_period': sample_period,
                'max_sample_period': sample_period,
                'measurements': [{
                    'physical_quantity': 'power',  # measurement.levels[0][0],
                    'type': 'active'  # measurement.levels[1][0]
                }]
            }
        total_timeframe = TimeFrameGroup(timeframes).get_timeframe()

        date_now = datetime.now().isoformat().split('.')[0]
        dataset_metadata = {
            'name': type(self),
            'date': date_now,
            'meter_devices': meter_devices,
            'timeframe': total_timeframe.to_dict()
        }
        output_datastore.save_metadata('/', dataset_metadata)

    # Building metadata always stored for the new buildings
    for i in range(len(num_meters)):
        phase_building = building * 10 + i
        building_path = '/building{}'.format(phase_building)
        mains_data_location = building_path + '/elec/meter1'

        # Rest meter:
        elec_meters = {}
        if rest_powerflow_included:
            elec_meters[1] = {
                'device_model': 'rest',
                #'site_meter': True,
                'data_location': mains_data_location,
                'preprocessing_applied': {},  # TODO
'statistics': { 'timeframe': total_timeframe.to_dict() } } def update_elec_meters(meter_instance): elec_meters.update({ meter_instance: { 'device_model': 'disaggregate', # self.MODEL_NAME, 'submeter_of': 1, 'data_location': ('{}/elec/meter{}'.format(building_path, meter_instance)), 'preprocessing_applied': {}, # TODO 'statistics': { 'timeframe': total_timeframe.to_dict() } } }) # Appliances and submeters: appliances = [] if supervised: for meter in meters: meter_instance = meter.instance() update_elec_meters(meter_instance) for app in meter.appliances: appliance = { 'meters': [meter_instance], 'type': app.identifier.type, 'instance': app.identifier.instance # TODO this `instance` will only be correct when the # model is trained on the same house as it is tested on # https://github.com/nilmtk/nilmtk/issues/194 } appliances.append(appliance) # Setting the name if it exists if meter.name: if len(meter.name) > 0: elec_meters[meter_instance]['name'] = meter.name else: # Unsupervised # Submeters: # Starts at 2 because meter 1 is mains. for chan in range(2, num_meters[i] + 1): # Additional + 1 because index 0 skipped update_elec_meters(meter_instance=chan) appliance = { 'meters': [chan], 'type': 'unknown', 'instance': chan - 1 # TODO this `instance` will only be correct when the # model is trained on the same house as it is tested on # https://github.com/nilmtk/nilmtk/issues/194 } appliances.append(appliance) if len(appliances) == 0: continue building_metadata = { 'instance': (phase_building), 'elec_meters': elec_meters, 'appliances': appliances, 'original_name': original_building_meta['original_name'] if 'original_name' in original_building_meta else None, 'geo_location': original_building_meta['geo_location'] if 'geo_location' in original_building_meta else None, 'zip': original_building_meta['zip'] if 'zip' in original_building_meta else None, } print(building_path) output_datastore.save_metadata(building_path, building_metadata)
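# Hedged example of the per-building metadata block assembled by
# `_save_metadata_for_disaggregation` above. The keys mirror the dictionaries
# built in that function; instance numbers, paths and the appliance entry are
# made up.
building_metadata_example = {
    'instance': 11,  # building * 10 + phase index
    'elec_meters': {
        1: {'device_model': 'rest',
            'data_location': '/building11/elec/meter1'},
        2: {'device_model': 'disaggregate',
            'submeter_of': 1,
            'data_location': '/building11/elec/meter2'},
    },
    'appliances': [{'meters': [2], 'type': 'unknown', 'instance': 1}],
}
print(building_metadata_example['elec_meters'][2]['data_location'])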
def load(self, key, columns=None, sections=None, n_look_ahead_rows=0, chunksize=MAX_MEM_ALLOWANCE_IN_BYTES, verbose=False, **additionalLoaderKwargs): # TODO: calculate chunksize default based on physical # memory installed and number of columns # Make sure key has a slash at the front but not at the end. if key[0] != '/': key = '/' + key if len(key) > 1 and key[-1] == '/': key = key[:-1] # Make sure chunksize is an int otherwise `range` complains later. chunksize = np.int64(chunksize) # Set `sections` variable sections = [TimeFrame()] if sections is None else sections sections = TimeFrameGroup(sections) # Replace any Nones with '' in cols: if columns is not None: columns = [('' if pq is None else pq, '' if ac is None else ac) for pq, ac in columns] cols_idx = pd.MultiIndex.from_tuples( columns, names=['physical_quantity', 'type']) if verbose: print("HDFDataStore.load(key='{}', columns='{}', sections='{}'," " n_look_ahead_rows='{}', chunksize='{}')".format( key, columns, sections, n_look_ahead_rows, chunksize)) self.all_sections_smaller_than_chunksize = True for section in sections: if verbose: print(" ", section) window_intersect = self.window.intersection(section) if window_intersect.empty: # Wenn der abgefragte Zeitabschnitt nicht in der Datenreihe enthalten ist data = pd.DataFrame(columns=cols_idx) data.timeframe = section yield data continue terms = window_intersect.query_terms('window_intersect') if terms is None: section_start_i = 0 section_end_i = self.store.get_storer(key).nrows if section_end_i <= 1: data = pd.DataFrame(columns=cols_idx) data.timeframe = section yield data continue else: try: coords = self.store.select_as_coordinates(key=key, where=terms) except AttributeError as e: if str(e) == ("'NoneType' object has no attribute " "'read_coordinates'"): raise KeyError("key '{}' not found".format(key)) else: raise n_coords = len(coords) if n_coords == 0: data = pd.DataFrame(columns=cols_idx) data.timeframe = window_intersect yield data continue section_start_i = coords[0] section_end_i = coords[-1] if section_start_i == section_end_i: # For corner cases where there is really only a single entry. section_end_i += 1 del coords slice_starts = range(section_start_i, section_end_i, chunksize) n_chunks = int( np.ceil((section_end_i - section_start_i) / chunksize)) if n_chunks > 1: self.all_sections_smaller_than_chunksize = False for chunk_i, chunk_start_i in enumerate(slice_starts): chunk_end_i = chunk_start_i + chunksize there_are_more_subchunks = (chunk_i < n_chunks - 1) if chunk_end_i > section_end_i: chunk_end_i = section_end_i chunk_end_i += 1 data = self.store.select(key=key, columns=cols_idx, start=chunk_start_i, stop=chunk_end_i) if len(data) <= 2: data = pd.DataFrame(columns=cols_idx) data.timeframe = section yield data # Load look ahead if necessary if n_look_ahead_rows > 0: if len(data.index) > 0: look_ahead_start_i = chunk_end_i look_ahead_end_i = look_ahead_start_i + n_look_ahead_rows try: data.look_ahead = self.store.select( key=key, columns=columns, start=look_ahead_start_i, stop=look_ahead_end_i) except ValueError: data.look_ahead = pd.DataFrame() else: data.look_ahead = pd.DataFrame() data.timeframe = _timeframe_for_chunk(there_are_more_subchunks, chunk_i, window_intersect, data.index) yield data del data
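# Sketch of the chunking arithmetic used in the HDF loader above: the row
# range of a section is split into slices of at most `chunksize` rows. The
# numbers are made up.
import numpy as np

section_start_i, section_end_i, chunksize = 0, 250000, 100000
slice_starts = list(range(section_start_i, section_end_i, chunksize))
n_chunks = int(np.ceil((section_end_i - section_start_i) / chunksize))
print(slice_starts, n_chunks)  # [0, 100000, 200000] 3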
def _get_stat_from_cache_or_compute(self, nodes, results_obj, loader_kwargs): """General function for computing statistics and/or loading them from cache. Cached statistics lives in the DataStore at 'building<I>/elec/cache/meter<K>/<statistic_name>' e.g. 'building1/elec/cache/meter1/total_energy'. We store the 'full' statistic... i.e we store a representation of the `Results._data` DataFrame. Some times we need to do some conversion to store `Results._data` on disk. The logic for doing this conversion lives in the `Results` class or subclass. The cache can be cleared by calling `ElecMeter.clear_cache()`. Parameters ---------- nodes : list of nilmtk.Node classes results_obj : instance of nilmtk.Results subclass loader_kwargs : dict Returns ------- if `full_results` is True then return nilmtk.Results subclass instance otherwise return nilmtk.Results.simple(). See Also -------- clear_cache _compute_stat key_for_cached_stat get_cached_stat """ full_results = loader_kwargs.pop('full_results', False) verbose = loader_kwargs.get('verbose') if 'ac_type' in loader_kwargs or 'physical_quantity' in loader_kwargs: loader_kwargs = self._convert_physical_quantity_and_ac_type_to_cols( **loader_kwargs) columns = loader_kwargs.get('columns', []) ac_types = set([m[1] for m in columns if m[1]]) results_obj_copy = deepcopy(results_obj) # Prepare `sections` list sections = loader_kwargs.get('sections') if sections is None: tf = self.get_timeframe() tf.include_end = True sections = [tf] sections = TimeFrameGroup(sections) sections = [s for s in sections if not s.empty] # Retrieve usable stats from cache key_for_cached_stat = self.key_for_cached_stat(results_obj.name) if loader_kwargs.get('preprocessing') is None: cached_stat = self.get_cached_stat(key_for_cached_stat) results_obj.import_from_cache(cached_stat, sections) def find_sections_to_compute(): # Get sections_to_compute results_obj_timeframes = results_obj.timeframes() sections_to_compute = set(sections) - set( results_obj_timeframes) sections_to_compute = sorted(sections_to_compute) return sections_to_compute try: ac_type_keys = results_obj.simple().keys() except: sections_to_compute = find_sections_to_compute() else: if ac_types.issubset(ac_type_keys): sections_to_compute = find_sections_to_compute() else: sections_to_compute = sections results_obj = results_obj_copy else: sections_to_compute = sections if verbose and not results_obj._data.empty: print("Using cached result.") # If we get to here then we have to compute some stats if sections_to_compute: loader_kwargs['sections'] = sections_to_compute computed_result = self._compute_stat(nodes, loader_kwargs) # Merge cached results with newly computed results_obj.update(computed_result.results) # Save to disk newly computed stats stat_for_store = computed_result.results.export_to_cache() try: self.store.append(key_for_cached_stat, stat_for_store) except ValueError: # the old table probably had different columns self.store.remove(key_for_cached_stat) self.store.put(key_for_cached_stat, results_obj.export_to_cache()) if full_results: return results_obj else: res = results_obj.simple() if ac_types: try: ac_type_keys = res.keys() except: return res else: if res.empty: return res else: return pd.Series(res[ac_types], index=ac_types) else: return res
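# Hedged sketch of the cache layout described in the docstring above
# ('building<I>/elec/cache/meter<K>/<statistic_name>'). The helper function is
# hypothetical and only reproduces the documented key pattern.
def cache_key(building_i, meter_i, stat_name):
    return 'building{}/elec/cache/meter{}/{}'.format(building_i, meter_i, stat_name)

print(cache_key(1, 1, 'total_energy'))  # -> building1/elec/cache/meter1/total_energy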
def load(self, key, columns=None, sections=None, n_look_ahead_rows=0,
         chunksize=MAX_MEM_ALLOWANCE_IN_BYTES, verbose=False,
         **additionalLoaderKwargs):
    '''Load measurements over a certain period of time.
    The resampling takes place on the server side to save bandwidth. This is
    different from the HDF datastore, where all data is always loaded and
    then resampled.
    '''
    # TODO: calculate chunksize default based on physical
    # memory installed and number of columns

    # Make sure key has a slash at the front but not at the end.
    if key[0] != '/':
        key = '/' + key
    if len(key) > 1 and key[-1] == '/':
        key = key[:-1]

    sample_period = additionalLoaderKwargs["sample_period"]

    # Make sure chunksize is an int otherwise `range` complains later.
    chunksize = np.int64(chunksize)

    # Set `sections` variable
    sections = [TimeFrame()] if sections is None else sections
    sections = TimeFrameGroup(sections)

    # Replace any Nones with '' in cols:
    if columns is not None:
        columns = [('' if pq is None else pq, '' if ac is None else ac)
                   for pq, ac in columns]
        cols_idx = pd.MultiIndex.from_tuples(
            columns, names=['physical_quantity', 'type'])
    columnsStr = []
    for i, val in enumerate(columns):
        columnsStr.append(str(columns[i]))
    columnsStr = str(columnsStr)

    if verbose:
        print("HDFDataStore.load(key='{}', columns='{}', sections='{}',"
              " n_look_ahead_rows='{}', chunksize='{}')".format(
                  key, columns, sections, n_look_ahead_rows, chunksize))

    self.all_sections_smaller_than_chunksize = True

    for section in sections:
        if verbose:
            print(" ", section)
        window_intersect = self.window.intersection(section)

        if window_intersect.empty:
            # The requested time section is not contained in the data series
            data = pd.DataFrame(columns=cols_idx)
            data.timeframe = section
            yield data
            continue

        # The estimation of fitting slices is avoided
        delta = section.end - section.start
        n_chunks = int(np.ceil((delta.total_seconds() / sample_period / chunksize)))
        delta = delta / n_chunks
        slice_starts = []
        for i in range(n_chunks):
            slice_starts.append(section.start + delta * i)

        if n_chunks > 1:
            self.all_sections_smaller_than_chunksize = False

        # Load the sections
        for chunk_i, chunk_start_i in enumerate(slice_starts):
            chunk_end_i = chunk_start_i + datetime.timedelta(seconds=int(chunksize))
            there_are_more_subchunks = (chunk_i < n_chunks - 1)

            if chunk_end_i > section.end:
                chunk_end_i = section.end

            # The required parameter form is:
            # base={lat}/{lng}/{deviceKey}/{deviceType} + {start}/{end}/{columns}/{sample_rate}
            iso_chunk_start = chunk_start_i.isoformat()
            iso_chunk_end = chunk_end_i.isoformat()
            data = self._execute_request("load", type="GET",
                                         parameters={"url": key,
                                                     "start": iso_chunk_start,
                                                     "end": iso_chunk_end,
                                                     "columns": columnsStr,
                                                     "sample_period": str(sample_period)})
            data = self._jsonDataToPandasDF(columns, data)

            if len(data) <= 2:
                data = pd.DataFrame(columns=cols_idx)
                data.timeframe = section
                yield data
                # Assumption: nothing usable in this chunk, so skip to the next one
                continue

            # Load look ahead if necessary
            if n_look_ahead_rows > 0:
                if len(data.index) > 0:
                    look_ahead_start_i = chunk_end_i
                    # Assumption: each look-ahead row corresponds to one sample
                    # period, so the look-ahead window spans n_look_ahead_rows
                    # sample periods.
                    look_ahead_end_i = look_ahead_start_i + datetime.timedelta(
                        seconds=int(n_look_ahead_rows * sample_period))
                    try:
                        #data.look_ahead = self.store.select(
                        #    key=key, columns=columns,
                        #    start=look_ahead_start_i,
                        #    stop=look_ahead_end_i)
                        look_ahead = self._execute_request(
                            "load", type="GET",
                            parameters={"url": key,
                                        "start": look_ahead_start_i.isoformat(),
                                        "end": look_ahead_end_i.isoformat(),
                                        "columns": columnsStr,
                                        "sample_period": str(sample_period)})
                        # Store the look-ahead separately instead of overwriting
                        # the chunk itself.
                        data.look_ahead = self._jsonDataToPandasDF(columns, look_ahead)
                    except ValueError:
                        data.look_ahead = pd.DataFrame()
                else:
                    data.look_ahead = pd.DataFrame()

            data.timeframe = _timeframe_for_chunk(there_are_more_subchunks,
                                                  chunk_i, window_intersect,
                                                  data.index)
            yield data
            del data
def load(self, key, columns=None, sections=None, n_look_ahead_rows=0, chunksize=MAX_MEM_ALLOWANCE_IN_BYTES, verbose=False): # TODO: calculate chunksize default based on physical # memory installed and number of columns # Make sure key has a slash at the front but not at the end. if key[0] != '/': key = '/' + key if len(key) > 1 and key[-1] == '/': key = key[:-1] # Make sure chunksize is an int otherwise `range` complains later. chunksize = np.int64(chunksize) # Set `sections` variable sections = [TimeFrame()] if sections is None else sections sections = TimeFrameGroup(sections) # Replace any Nones with '' in columns: if columns is not None: columns = [('' if pq is None else pq, '' if ac is None else ac) for pq, ac in columns] if verbose: print("HDFDataStore.load(key='{}', columns='{}', sections='{}'," " n_look_ahead_rows='{}', chunksize='{}')".format( key, columns, sections, n_look_ahead_rows, chunksize)) self.all_sections_smaller_than_chunksize = True for section in sections: if verbose: print(" ", section) window_intersect = self.window.intersection(section) if window_intersect.empty: data = pd.DataFrame() data.timeframe = section yield data continue terms = window_intersect.query_terms('window_intersect') if terms is None: section_start_i = 0 section_end_i = self.store.get_storer(key).nrows if section_end_i <= 1: data = pd.DataFrame() data.timeframe = section yield data continue else: try: coords = self.store.select_as_coordinates(key=key, where=terms) except AttributeError as e: if str(e) == ("'NoneType' object has no attribute " "'read_coordinates'"): raise KeyError("key '{}' not found".format(key)) else: raise n_coords = len(coords) if n_coords == 0: data = pd.DataFrame() data.timeframe = window_intersect yield data continue section_start_i = coords[0] section_end_i = coords[-1] del coords slice_starts = range(section_start_i, section_end_i, chunksize) n_chunks = int( np.ceil((section_end_i - section_start_i) / chunksize)) if n_chunks > 1: self.all_sections_smaller_than_chunksize = False for chunk_i, chunk_start_i in enumerate(slice_starts): chunk_end_i = chunk_start_i + chunksize there_are_more_subchunks = (chunk_i < n_chunks - 1) if chunk_end_i > section_end_i: chunk_end_i = section_end_i chunk_end_i += 1 data = self.store.select(key=key, columns=columns, start=chunk_start_i, stop=chunk_end_i) # if len(data) <= 2: # yield pd.DataFrame() # Load look ahead if necessary if n_look_ahead_rows > 0: if len(data.index) > 0: look_ahead_start_i = chunk_end_i look_ahead_end_i = look_ahead_start_i + n_look_ahead_rows try: look_ahead = self.store.select( key=key, columns=columns, start=look_ahead_start_i, stop=look_ahead_end_i) except ValueError: look_ahead = pd.DataFrame() else: look_ahead = pd.DataFrame() with warnings.catch_warnings(): # Silence "Pandas doesn't allow columns to be created via a new attribute name" # since we're not adding a column warnings.filterwarnings( 'ignore', category=UserWarning, message=".*Pandas doesn't allow columns.*") setattr(data, 'look_ahead', look_ahead) data.timeframe = _timeframe_for_chunk(there_are_more_subchunks, chunk_i, window_intersect, data.index) yield data del data
def _classify_activation_quality(self, nilmtk_activations): def get_stale_seconds(act): actdiff = act.resample("{:d}S".format(self.sample_period)).mean().ffill().diff() return (actdiff == 0.0).sum() * self.sample_period def activation_filter(tf, building_data): start_time = tf.start end_time = tf.end df = building_data[start_time:end_time] if df.empty: return False else: act_stale_seconds = get_stale_seconds(df['target']) act_duration = (end_time - start_time).total_seconds() act_stale_pct = act_stale_seconds / act_duration mains_stale_seconds = get_stale_seconds(df['mains']) mains_stale_pct = get_stale_seconds(df['mains']) / act_duration if (act_stale_pct < self.activation_max_stale_pct) & (mains_stale_pct < self.mains_max_stale_pct): return True else: return False good_timeframes = {} bad_timeframes = {} all_timeframes = {} for fold, buildings_per_appliances in nilmtk_activations.items(): good_timeframes[fold] = {} bad_timeframes[fold] = {} all_timeframes[fold] = {} for appliance, activations_per_building in buildings_per_appliances.items(): good_timeframes[fold][appliance] = {} bad_timeframes[fold][appliance] = {} all_timeframes[fold][appliance] = {} for building, activations in activations_per_building.items(): building_data = self.data[fold][building] good_timeframes_per_building = TimeFrameGroup() bad_timeframes_per_building = TimeFrameGroup() all_timeframes_per_building = TimeFrameGroup() for i, activation in enumerate(activations): tf = TimeFrame( start=activation.index[0], end=activation.index[-1] + pd.Timedelta(seconds=self.sample_period)) all_timeframes_per_building.append(tf) if activation_filter(tf, building_data): good_timeframes_per_building.append(tf) else: bad_timeframes_per_building.append(tf) good_timeframes[fold][appliance][building] = good_timeframes_per_building bad_timeframes[fold][appliance][building] = bad_timeframes_per_building all_timeframes[fold][appliance][building] = all_timeframes_per_building # self.clean_active_timeframes = good_timeframes self.all_active_timeframes = all_timeframes self.phony_active_timeframes = bad_timeframes
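# Minimal sketch of the staleness measure in `get_stale_seconds` above:
# resample, forward-fill, and count the samples whose value did not change.
# The series and the 6 s sample period are made up.
import pandas as pd

sample_period = 6
idx = pd.date_range('2014-01-01', periods=5, freq='6S')
act = pd.Series([100.0, 100.0, 100.0, 120.0, 125.0], index=idx)

actdiff = act.resample('6S').mean().ffill().diff()
stale_seconds = int((actdiff == 0.0).sum()) * sample_period
print(stale_seconds)  # 12 -> two repeated readings count as stale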