def _find_sections_with_no_target(self):
     """Finds the intersections of the mains good sections with the gaps
     between target appliance activations.

     Populates `self.sections_with_no_target` as a nested dict
     {fold: {building: TimeFrameGroup}}, keeping only gaps that are at
     least `seq_length` samples long.
     """
     self.sections_with_no_target = {}
     # Minimum useful gap duration, in seconds.
     seq_length_secs = self.seq_length * self.sample_period
     for fold, sects_per_building in self.mains_good_sections.items():
         for building, good_sections in sects_per_building.items():
             activations = (
                 self.activations[fold][self.target_appliance][building])
             mains = self.mains[fold][building]
             mains_good_sections = self.mains_good_sections[fold][building]
             # Build the set of gaps between consecutive activations,
             # spanning from the first to the last mains timestamp.
             gaps_between_activations = TimeFrameGroup()
             prev_end = mains.index[0]
             for activation in activations:
                 # NOTE(review): assumes activations are sorted and each
                 # starts after `prev_end`; TimeFrame may reject
                 # end < start — confirm upstream ordering.
                 gap = TimeFrame(prev_end, activation.index[0])
                 gaps_between_activations.append(gap)
                 prev_end = activation.index[-1]
             # Trailing gap from the last activation to the end of mains.
             gap = TimeFrame(prev_end, mains.index[-1])
             gaps_between_activations.append(gap)
             # Keep only gap portions with good mains data, long enough
             # to cut a full training sequence from.
             intersection = (
                 gaps_between_activations.intersection(mains_good_sections))
             intersection = intersection.remove_shorter_than(
                 seq_length_secs)
             self.sections_with_no_target.setdefault(
                 fold, {})[building] = (intersection)
             logger.info(
                 "Found {} sections without target for {} {}.".format(
                     len(intersection), fold, building))
Пример #2
0
    def _delete_phony_sections(self):
        """Removes data lying inside 'phony' (bad-quality) activation timeframes.

        For each fold/building, keeps only the data sections that lie
        *between* the phony activation timeframes and rebuilds `self.data`
        from those sections.
        """
        filtered_data = {}
        for fold, data_per_building in self.data.items():
            for building, data in data_per_building.items():
                # Skip buildings with no phony activations recorded.
                if building not in self.phony_active_timeframes[fold][self.target_appliance]:
                    continue
                activations = (
                    self.phony_active_timeframes[fold][self.target_appliance][building])
                # Collect the gaps between consecutive phony activations.
                data_between_phony_activations = TimeFrameGroup()
                prev_end = data.index[0]
                for activation in activations:
                    activation_start = activation.start
                    if prev_end < activation_start:
                        gap = TimeFrame(prev_end, activation_start)
                        data_between_phony_activations.append(gap)
                    prev_end = activation.end
                # Trailing gap: extend one sample past the last timestamp so
                # the final sample is covered by the half-open slice.
                data_end = data.index[-1] + pd.Timedelta(seconds=self.sample_period)
                if prev_end < data_end:
                    gap = TimeFrame(prev_end, data_end)
                    data_between_phony_activations.append(gap)
                dfs = [section.slice(data)
                       for section in data_between_phony_activations]
                # Fix: pd.concat([]) raises ValueError; drop buildings whose
                # data is entirely covered by phony activations.
                if not dfs:
                    logger.info("No good sections for {} {}; dropping it."
                                .format(fold, building))
                    continue
                data = pd.concat(dfs)
                filtered_data.setdefault(fold, {})[building] = data
                logger.info("Found {} good sections for {} {}."
                            .format(len(data_between_phony_activations), fold, building))

        self.data = filtered_data
Пример #3
0
 def _find_sections_with_no_target(self):
     """Finds the intersections of the data good sections with the gaps
     between target appliance activations.

     Populates `self.sections_with_no_target` as
     {fold: {building: TimeFrameGroup}}, keeping only gaps at least
     `seq_length` samples long.
     """
     self.sections_with_no_target = {}
     seq_length_secs = self.seq_length * self.sample_period
     for fold, sects_per_building in self.data_good_sections.items():
         for building, good_sections in sects_per_building.items():
             if building not in self.all_activations[fold][self.target_appliance]:
                 continue
             activations = (
                 self.all_activations[fold][self.target_appliance][building])
             data = self.data[fold][building]
             data_good_sections = good_sections
             gaps_between_activations = TimeFrameGroup()
             prev_end = data.index[0]
             for activation in activations:
                 activation_start = activation.start
                 if prev_end < activation_start:
                     gap = TimeFrame(prev_end, activation_start)
                     gaps_between_activations.append(gap)
                 prev_end = activation.end
             data_end = data.index[-1]
             if prev_end < data_end:
                 gap = TimeFrame(prev_end, data_end)
                 # Fix: append only when a trailing gap actually exists.
                 # Previously this append sat outside the `if`, which
                 # re-appended the last loop gap (or raised NameError when
                 # there were no activations) whenever prev_end >= data_end.
                 gaps_between_activations.append(gap)
             intersection = (
                 gaps_between_activations.intersection(data_good_sections))
             intersection = intersection.remove_shorter_than(
                 seq_length_secs)
             self.sections_with_no_target.setdefault(fold, {})[building] = (
                 intersection)
             logger.info("Found {} sections without target for {} {}."
                         .format(len(intersection), fold, building))
Пример #4
0
 def import_from_cache(self, cached_stat, sections):
     """Rebuild GoodSection results from their row-wise cached form.

     `cached_stat` deliberately uses duplicate indices: each cached chunk
     timeframe is spread over several rows (one per good section), all
     sharing the same start index.  (Removed the block of commented-out
     debug prints that used to live in the inner loop.)
     """
     grouped_by_index = cached_stat.groupby(level=0)
     tz = get_tz(cached_stat)
     for tf_start, df_grouped_by_index in grouped_by_index:
         grouped_by_end = df_grouped_by_index.groupby('end')
         for tf_end, sections_df in grouped_by_end:
             end = tz_localize_naive(tf_end, tz)
             timeframe = TimeFrame(tf_start, end)
             # Only re-import chunks the caller actually asked for.
             if timeframe in sections:
                 timeframes = []
                 for _, row in sections_df.iterrows():
                     # Positional access: iloc[2] is 'section_start' and
                     # iloc[1] is 'section_end' in the cached layout —
                     # TODO confirm against export_to_cache's column order.
                     section_start = tz_localize_naive(
                         row.iloc[2], tz)
                     section_end = tz_localize_naive(
                         row.iloc[1], tz)
                     timeframes.append(TimeFrame(section_start,
                                                 section_end))
                 self.append(timeframe, {'sections': [timeframes]})
Пример #5
0
    def load(self, key, columns=None, sections=None, n_look_ahead_rows=0,
             chunksize=MAX_MEM_ALLOWANCE_IN_BYTES):
        """Generator yielding DataFrame chunks of the CSV table behind `key`.

        Each section in `sections` (defaulting to one open-ended
        TimeFrame) is intersected with `self.window`, and matching rows
        are yielded chunk by chunk.  Every yielded subchunk gets a
        `.timeframe` attribute; when `n_look_ahead_rows` > 0 it also gets
        a `.look_ahead` DataFrame of the rows immediately following it.
        """
        file_path = self._key_to_abs_path(key)
        
        # Set `sections` variable
        sections = [TimeFrame()] if sections is None else sections
        sections = TimeFrameGroup(sections)

        self.all_sections_smaller_than_chunksize = True
        
        # iterate through parameter sections
        # requires 1 pass through file for each section
        for section in sections:
            window_intersect = self.window.intersection(section)
            # Two header rows: MultiIndex columns in the stored CSV.
            header_rows = [0,1]
            text_file_reader = pd.read_csv(file_path, 
                                            index_col=0, 
                                            header=header_rows, 
                                            parse_dates=True,
                                            chunksize=chunksize)
                                            
            # iterate through all chunks in file
            for chunk_idx, chunk in enumerate(text_file_reader):
                
                # filter dataframe by specified columns
                if columns:
                    chunk = chunk[columns]
                
                # mask chunk by window and section intersect
                subchunk_idx = [True]*len(chunk)
                if window_intersect.start:
                    subchunk_idx = np.logical_and(subchunk_idx, (chunk.index>=window_intersect.start))
                if window_intersect.end:
                    subchunk_idx = np.logical_and(subchunk_idx, (chunk.index<window_intersect.end))
                if window_intersect.empty:
                    subchunk_idx = [False]*len(chunk)
                subchunk = chunk[subchunk_idx]
                
                if len(subchunk)>0:
                    # Position (within this chunk) of the last selected row;
                    # used to locate the look-ahead rows in the file.
                    subchunk_end = np.max(np.nonzero(subchunk_idx))
                    # NOTE(review): setting attributes on a DataFrame slice
                    # may trigger pandas copy warnings — confirm intended.
                    subchunk.timeframe = TimeFrame(subchunk.index[0], subchunk.index[-1])
                    # Load look ahead if necessary
                    if n_look_ahead_rows > 0:
                        if len(subchunk.index) > 0:
                            # Skip the header rows, all previous chunks, and
                            # the rows up to and including the subchunk end.
                            rows_to_skip = (len(header_rows)+1)+(chunk_idx*chunksize)+subchunk_end+1
                            try:
                                subchunk.look_ahead = pd.read_csv(file_path, 
                                                index_col=0, 
                                                header=None, 
                                                parse_dates=True,
                                                skiprows=rows_to_skip,
                                                nrows=n_look_ahead_rows)
                            except ValueError:
                                subchunk.look_ahead = pd.DataFrame()
                        else:
                            subchunk.look_ahead = pd.DataFrame()
                    
                    yield subchunk
Пример #6
0
 def test_u(self, ts1, ts2, ts3, ts4):
     """Check the union of two TimeFrames spans first start to last end.

     Fix: restored the missing `self` parameter — the body calls
     `self.assertEqual`, so this is a TestCase method and previously
     raised NameError on `self`.
     """
     ts1 = pd.Timestamp(ts1)
     ts2 = pd.Timestamp(ts2)
     ts3 = pd.Timestamp(ts3)
     ts4 = pd.Timestamp(ts4)
     tf1 = TimeFrame(ts1, ts2)
     tf2 = TimeFrame(ts3, ts4)
     union = tf1.union(tf2)
     self.assertEqual(union.start, ts1)
     self.assertEqual(union.end, ts4)
Пример #7
0
 def test_u(self, ts1, ts2, ts3, ts4):
     """Check the union of two TimeFrames spans first start to last end.

     Fix: restored the missing `self` parameter — the body calls
     `self.assertEqual`, so this is a TestCase method and previously
     raised NameError on `self`.
     """
     ts1 = pd.Timestamp(ts1)
     ts2 = pd.Timestamp(ts2)
     ts3 = pd.Timestamp(ts3)
     ts4 = pd.Timestamp(ts4)
     tf1 = TimeFrame(ts1, ts2)
     tf2 = TimeFrame(ts3, ts4)
     union = tf1.union(tf2)
     self.assertEqual(union.start, ts1)
     self.assertEqual(union.end, ts4)
Пример #8
0
    def get_timeframe(self):
        """Return the outer TimeFrame spanning this whole TimeFrameGroup.

        Runs from the start of the first section to the end of the last
        section; an empty group yields a fully open TimeFrame.
        """
        frame = self._df
        if frame.empty:
            return TimeFrame(start=None, end=None)

        first_label = frame.index[0]
        last_label = frame.index[-1]
        return TimeFrame(start=frame.loc[first_label, 'section_start'],
                         end=frame.loc[last_label, 'section_end'])
Пример #9
0
    def test_merge_timeframes(self):
        """merge_timeframes fuses adjacent frames and keeps gapped ones apart."""
        adjacent_pair_plus_separate = [
            TimeFrame("2010-01-01", "2011-01-01"),
            TimeFrame("2011-01-01", "2011-06-01"),
            TimeFrame("2012-01-01", "2013-01-01"),
        ]
        expected = [
            TimeFrame("2010-01-01", "2011-06-01"),
            TimeFrame("2012-01-01", "2013-01-01"),
        ]
        self.assertEqual(merge_timeframes(adjacent_pair_plus_separate),
                         expected)
Пример #10
0
 def __init__(self, timeframes=None):
     """Build a TimeFrameGroup, optionally from a pandas PeriodIndex.

     Fix: use the public `pd.PeriodIndex` — the private
     `pd.tseries.period.PeriodIndex` path was removed in modern pandas
     (this also matches the check used elsewhere in this codebase).
     """
     if isinstance(timeframes, pd.PeriodIndex):
         periods = timeframes
         # One TimeFrame per period, spanning the period's full extent.
         timeframes = [TimeFrame(period.start_time, period.end_time)
                       for period in periods]
     args = [timeframes] if timeframes else []
     super(TimeFrameGroup, self).__init__(*args)
Пример #11
0
    def import_from_cache(self, cached_stat, sections):
        """Reassemble GoodSectionResults from their row-wise cached layout.

        The cache deliberately stores one row per good section, so a
        single result timeframe appears under duplicate index entries.
        """
        tz = get_tz(cached_stat)
        for chunk_start, group in cached_stat.groupby(level=0):
            # Every row of a group shares one chunk-end timestamp.
            assert group['end'].unique().size == 1
            chunk_end = tz_localize_naive(group['end'].iloc[0], tz)
            chunk_timeframe = TimeFrame(chunk_start, chunk_end)
            if chunk_timeframe not in sections:
                continue
            restored_sections = [
                TimeFrame(tz_localize_naive(row['section_start'], tz),
                          tz_localize_naive(row['section_end'], tz))
                for _, row in group.iterrows()]
            self.append(chunk_timeframe, {'sections': [restored_sections]})
Пример #12
0
 def __init__(self):
     """Initialise the datastore with an unrestricted time window.

     `self.window` starts as a fully open TimeFrame; callers may narrow
     it later to restrict which data gets loaded.  (The previous
     docstring documented a `filename` parameter that does not exist.)
     """
     self.window = TimeFrame()
Пример #13
0
    def _add_external_data(self,
                           chunk,
                           ext_dataset,
                           external_features,
                           horizon=None):
        '''Augment `chunk` with externally stored feature columns.

        Currently the external data comes from group '820' (for all the
        meters considered).

        Parameters
        ----------
        chunk: pd.DataFrame or pd.DatetimeIndex
            The input which has to be augmented.
        ext_dataset: nilmtk.Dataset
            The dataset where the external data can be found.
        external_features: list of indexes, a single index, or None
            The indexes which shall be retrieved; None means none.
        horizon: pd.Timedelta, optional
            How far into the future external data shall additionally be
            retrieved (needed for forecasting with external data).

        Returns
        -------
        chunk: pd.DataFrame
            The input chunk extended by the requested feature columns.
        '''
        if horizon is not None and not isinstance(horizon, pd.Timedelta):
            raise Exception("Horizon has to be a pd.Timedelta")
        # Normalise `external_features` to a list.  Fix: treat None as
        # "no features" instead of crashing on len(None) below.
        if external_features is None:
            external_features = []
        elif not isinstance(external_features, list):
            external_features = [external_features]

        # So that a bare index is also supported.
        if isinstance(chunk, pd.DatetimeIndex):
            chunk = pd.DataFrame(index=chunk)

        extData = None
        if len(external_features) > 0:
            section = TimeFrame(start=chunk.index[0], end=chunk.index[-1])
            if horizon is not None:
                section.end = section.end + horizon
            extData = ext_dataset.get_data_for_group('820', section, 60 * 15,
                                                     external_features)[1:]

        # pd.concat silently drops a None entry, so with no features this
        # returns `chunk` unchanged.
        return pd.concat([chunk, extData], axis=1)
Пример #14
0
 def __iter__(self):
     ''' Yield one TimeFrame per section stored in this TimeFrameGroup.
     '''
     if self._df.empty:
         return iter([])
     else:
         for _, section in self._df.iterrows():
             yield TimeFrame(start=section['section_start'],
                             end=section['section_end'])
Пример #15
0
 def import_from_cache(self, cached_stat, sections):
     """Rebuild GoodSectionResults from the row-wise cache layout.

     Duplicate indices are used on purpose: each cached chunk stores one
     row per good section, all sharing the chunk's start index.
     """
     tz = get_tz(cached_stat)
     for chunk_start, rows_for_start in cached_stat.groupby(level=0):
         for chunk_end, section_rows in rows_for_start.groupby('end'):
             timeframe = TimeFrame(chunk_start,
                                   tz_localize_naive(chunk_end, tz))
             if timeframe not in sections:
                 continue
             restored = []
             for _, row in section_rows.iterrows():
                 restored.append(TimeFrame(
                     tz_localize_naive(row['section_start'], tz),
                     tz_localize_naive(row['section_end'], tz)))
             self.append(timeframe, {'sections': [restored]})
Пример #16
0
 def get_timeframe(self, key):
     """
     Returns
     -------
     nilmtk.TimeFrame of entire table after intersecting with self.window.
     """
     # First and last index entries of the stored table.
     # NOTE(review): `select(key, [0])` passes [0] as the positional
     # `where` argument — presumably selecting the first row; confirm
     # against the pandas HDFStore.select signature in use.
     data_start_date = self.store.select(key, [0]).index[0]
     data_end_date = self.store.select(key, start=-1).index[0]
     timeframe = TimeFrame(data_start_date, data_end_date)
     return self.window.intersection(timeframe)
Пример #17
0
    def test_date_setting(self):
        """Exercise TimeFrame construction and start/end validation rules."""
        # Bare and fully-specified construction must both succeed.
        TimeFrame()
        TimeFrame("2012-01-01", "2013-01-01")

        # Identical start and end dates are rejected.
        with self.assertRaises(ValueError):
            TimeFrame("2012-01-01", "2012-01-01")

        # Open-ended frames are allowed on either side.
        TimeFrame(start="2011-01-01")
        TimeFrame(end="2011-01-01")

        # An end date before the start date is rejected.
        with self.assertRaises(ValueError):
            TimeFrame("2012-01-01", "2011-01-01")

        # Attribute assignment after construction is validated too.
        frame = TimeFrame()
        frame.end = "2011-01-01"
        frame.start = "2010-01-01"
        with self.assertRaises(ValueError):
            frame.start = "2012-01-01"
Пример #18
0
    def _classify_activation_quality(self, nilmtk_activations):
        """Split activations into clean vs phony based on signal 'staleness'.

        An activation is phony when too large a fraction of its target or
        mains signal is flat (unchanged between consecutive resampled
        samples).  Populates `self.clean_active_timeframes`,
        `self.all_active_timeframes` and `self.phony_active_timeframes`,
        each shaped {fold: {appliance: {building: TimeFrameGroup}}}.
        """
        def get_stale_seconds(act):
            # Seconds during which the resampled signal did not change.
            # NOTE(review): "S" is the legacy pandas alias for seconds —
            # newer pandas prefers lowercase "s"; confirm pandas version.
            actdiff = act.resample("{:d}S".format(self.sample_period)).mean().ffill().diff()
            return (actdiff == 0.0).sum() * self.sample_period

        def activation_filter(tf, building_data):
            # True when both target and mains are "fresh" enough inside tf.
            start_time = tf.start
            end_time = tf.end
            df = building_data[start_time:end_time]
            if df.empty:
                return False
            else:
                act_stale_seconds = get_stale_seconds(df['target'])
                act_duration = (end_time - start_time).total_seconds()
                act_stale_pct = act_stale_seconds / act_duration
                # NOTE(review): `mains_stale_seconds` is computed but the
                # pct below recomputes get_stale_seconds — redundant work.
                mains_stale_seconds = get_stale_seconds(df['mains'])
                mains_stale_pct = get_stale_seconds(df['mains']) / act_duration
                if (act_stale_pct < self.activation_max_stale_pct) & (mains_stale_pct < self.mains_max_stale_pct):
                    return True
                else:
                    return False

        good_timeframes = {}
        bad_timeframes = {}
        all_timeframes = {}
        for fold, buildings_per_appliances in nilmtk_activations.items():
            good_timeframes[fold] = {}
            bad_timeframes[fold] = {}
            all_timeframes[fold] = {}
            for appliance, activations_per_building in buildings_per_appliances.items():
                good_timeframes[fold][appliance] = {}
                bad_timeframes[fold][appliance] = {}
                all_timeframes[fold][appliance] = {}
                for building, activations in activations_per_building.items():
                    building_data = self.data[fold][building]
                    good_timeframes_per_building = TimeFrameGroup()
                    bad_timeframes_per_building = TimeFrameGroup()
                    all_timeframes_per_building = TimeFrameGroup()
                    for i, activation in enumerate(activations):
                        # Extend by one sample so the final sample is
                        # covered by the half-open interval.
                        tf = TimeFrame(
                            start=activation.index[0],
                            end=activation.index[-1] + pd.Timedelta(seconds=self.sample_period))
                        all_timeframes_per_building.append(tf)
                        if activation_filter(tf, building_data):
                            good_timeframes_per_building.append(tf)
                        else:
                            bad_timeframes_per_building.append(tf)
                    good_timeframes[fold][appliance][building] = good_timeframes_per_building
                    bad_timeframes[fold][appliance][building] = bad_timeframes_per_building
                    all_timeframes[fold][appliance][building] = all_timeframes_per_building
        #
        self.clean_active_timeframes = good_timeframes
        self.all_active_timeframes = all_timeframes
        self.phony_active_timeframes = bad_timeframes
Пример #19
0
 def pop(self, i):
     ''' Remove and return the TimeFrame at position `i`.

     Parameters
     ----------
     i: int or None
         Location of the section to remove; None means the last one.
     '''
     position = -1 if i is None else i
     popped = self._df.iloc[position, :]
     # Drop by label; assumes the group's index labels are unique.
     self._df.drop(self._df.index[position], inplace=True)
     return TimeFrame(popped['section_start'], popped['section_end'])
Пример #20
0
 def import_from_cache(self, cached_stat, sections):
     '''Parse row-wise cached lines back into an AboveFreqSectionsResult.

     As explained in 'export_to_cache', sections are cached one row each
     (with deliberately duplicated indices), so they have to be regrouped
     here before being re-appended.
     '''
     tz = get_tz(cached_stat)
     for chunk_start, rows_for_start in cached_stat.groupby(level=0):
         for chunk_end, section_rows in rows_for_start.groupby('end'):
             chunk_tf = TimeFrame(chunk_start,
                                  tz_localize_naive(chunk_end, tz))
             if chunk_tf not in sections:
                 continue
             rebuilt = [
                 TimeFrame(tz_localize_naive(r['section_start'], tz),
                           tz_localize_naive(r['section_end'], tz))
                 for _, r in section_rows.iterrows()]
             self.append(chunk_tf, {'sections': [rebuilt]})
Пример #21
0
    def __getitem__(self, i):
        ''' Index the TimeFrameGroup like a list.

        Parameters
        ----------
        i:  int
            Position of the section to return.

        Results
        -------
        nilmtk.TimeFrame
            The section stored at position i.
        '''
        row = self._df.iloc[i, :]
        return TimeFrame(row['section_start'], row['section_end'])
Пример #22
0
    def get_timeframe(self, key):
        """Return the full extent of the CSV table behind `key`,
        intersected with `self.window`."""
        file_path = self._key_to_abs_path(key)
        reader = pd.read_csv(file_path,
                             index_col=0,
                             header=[0, 1],
                             parse_dates=True,
                             chunksize=MAX_MEM_ALLOWANCE_IN_BYTES)
        # Walk the file chunk by chunk to find the first and last index.
        first_index = None
        last_index = None
        for chunk in reader:
            if first_index is None:
                first_index = chunk.index[0]
            last_index = chunk.index[-1]
        return self.window.intersect(TimeFrame(first_index, last_index))
Пример #23
0
    def get_timeframe(self, key):
        """Return the timeframe reported by the remote service for `key`.

        The key is already in the form required by the request:
        {lat}/{lng}/{deviceKey}/{deviceType}

        Returns
        -------
        nilmtk.TimeFrame spanning the remote table's first and last
        timestamps.
        """
        raw = self._execute_request("get_timeframe",
                                    type="GET",
                                    parameters={"url": key})
        first = pd.Timestamp(raw[0])
        last = pd.Timestamp(raw[1])
        return TimeFrame(first, last)
Пример #24
0
    def invert(self, start=None, end=None):
        ''' 
        Returns a TimeFrameGroup with inverted rectangles.
        That means where there was a gap before is now a 
        TimeFrame and vice versa.

        Parameters
        ----------
        start, end: pd.Timestamp
            Defining the start and end of the region to invert.

        Returns
        -------
        inversion: TimeFrameGroup
            The inverted TimeFrameGroup, whose sections are the former
            gaps and vice versa.
        '''
        if self._df.empty:
            # No sections: the inversion is one big section over the
            # requested region (or nothing when no region was given).
            if not start is None and not end is None:
                return TimeFrameGroup([TimeFrame(start=start, end=end)])
            return TimeFrameGroup()

        inversion = self._df.copy()
        # Shifting 'section_end' down one row pairs each gap's start
        # (previous section's end) with its end (next section's start).
        # NOTE(review): the boundary handling below looks fragile — it
        # compares against `end` which may be None, and uses a
        # `[start, start]` placeholder row — verify edge-case behaviour
        # before relying on exact boundaries.
        if self._df.iloc[-1, :]["section_end"] < end:
            val_to_append = self._df.iloc[-1, :]["section_start"]
            inversion['section_end'] = inversion['section_end'].shift(1)
            row = len(inversion)
            inversion.loc[row, :] = [start, start]
            inversion.loc[row, 'section_start'] = end
            inversion.loc[row, 'section_end'] = val_to_append

        else:
            inversion['section_end'] = inversion['section_end'].shift(1)
        if not start is None and start < self._df.iloc[-1, :]['section_start']:
            inversion.loc[0, 'section_end'] = start

        # Swap the column names back so each row reads
        # (section_start, section_end) for a gap; the NaN row produced
        # by the shift is dropped.
        inversion = inversion.dropna().rename(columns={
            "section_end": "section_start",
            "section_start": "section_end"
        })
        # Clamp the first and last inverted sections to [start, end].
        if not start is None and inversion.loc[0, 'section_start'] < start:
            inversion.loc[0, 'section_start'] = start
        if not end is None and inversion.loc[inversion.index[-1],
                                             'section_end'] > end:
            inversion.loc[inversion.index[-1], 'section_end'] = end

        return TimeFrameGroup(inversion)
Пример #25
0
def _timeframe_for_chunk(there_are_more_subchunks, chunk_i, window_intersect,
                         index):
    """Work out the TimeFrame covered by one yielded subchunk.

    The first subchunk inherits the window's start, the last inherits its
    end, and a lone subchunk inherits both; any bound still unset falls
    back to the chunk's own index extremes.
    """
    start = None
    end = None

    if there_are_more_subchunks:
        # Not the last subchunk: only the very first one pins the start.
        if chunk_i == 0:
            start = window_intersect.start
    elif chunk_i > 0:
        # The last of several subchunks.
        end = window_intersect.end
    else:
        # The one and only subchunk.
        start = window_intersect.start
        end = window_intersect.end

    if start is None:
        start = index[0]
    if end is None:
        end = index[-1]

    return TimeFrame(start, end)
Пример #26
0
    def load(self,
             key,
             cols=None,
             sections=None,
             n_look_ahead_rows=0,
             chunksize=MAX_MEM_ALLOWANCE_IN_BYTES,
             verbose=False):
        """Generator yielding DataFrame chunks of `key` restricted to `sections`.

        Each yielded DataFrame carries a `.timeframe` attribute (the span
        it covers) and, when `n_look_ahead_rows` > 0, a `.look_ahead`
        DataFrame with the rows immediately following the chunk.  Empty
        sections yield empty DataFrames so callers see one result per
        section.

        Fixes: `xrange` (Python 2 only — a NameError on Python 3, which
        this file otherwise targets) replaced with `range`; the debug
        print of `n_chunks` is now emitted only when `verbose` is set.
        """

        # TODO: calculate chunksize default based on physical
        # memory installed and number of columns

        # Make sure key has a slash at the front but not at the end.
        if key[0] != '/':
            key = '/' + key
        if len(key) > 1 and key[-1] == '/':
            key = key[:-1]

        # Make sure chunksize is an int otherwise `range` complains later.
        chunksize = np.int64(chunksize)

        # Set `sections` variable
        sections = [TimeFrame()] if sections is None else sections
        if isinstance(sections, pd.PeriodIndex):
            sections = timeframes_from_periodindex(sections)

        if verbose:
            print("HDFDataStore.load. key='{}'".format(key))

        self.all_sections_smaller_than_chunksize = True

        for section in sections:
            if verbose:
                print("   ", section)
            window_intersect = self.window.intersect(section)

            # Section entirely outside the window: yield an empty frame.
            if window_intersect.empty:
                data = pd.DataFrame()
                data.timeframe = section
                yield data
                continue

            terms = window_intersect.query_terms('window_intersect')
            if terms is None:
                # Open-ended window: read the whole table.
                section_start_i = 0
                section_end_i = self.store.get_storer(key).nrows
                if section_end_i <= 1:
                    data = pd.DataFrame()
                    data.timeframe = section
                    yield data
                    continue
            else:
                try:
                    coords = self.store.select_as_coordinates(key=key,
                                                              where=terms)
                except AttributeError as e:
                    if str(e) == ("'NoneType' object has no attribute "
                                  "'read_coordinates'"):
                        raise KeyError("key '{}' not found".format(key))
                    else:
                        raise
                n_coords = len(coords)
                if n_coords == 0:
                    data = pd.DataFrame()
                    data.timeframe = window_intersect
                    yield data
                    continue

                section_start_i = coords[0]
                section_end_i = coords[-1]
                del coords
            # Fix: `xrange` does not exist on Python 3.
            slice_starts = range(section_start_i, section_end_i, chunksize)
            n_chunks = int(
                np.ceil((section_end_i - section_start_i) / chunksize))

            if n_chunks > 1:
                self.all_sections_smaller_than_chunksize = False

            if verbose:
                print("n_chunks", n_chunks)
            for chunk_i, chunk_start_i in enumerate(slice_starts):
                chunk_end_i = chunk_start_i + chunksize
                there_are_more_subchunks = (chunk_i < n_chunks - 1)

                if chunk_end_i > section_end_i:
                    chunk_end_i = section_end_i
                chunk_end_i += 1

                data = self.store.select(key=key,
                                         columns=cols,
                                         start=chunk_start_i,
                                         stop=chunk_end_i)

                # Load look ahead if necessary
                if n_look_ahead_rows > 0:
                    if len(data.index) > 0:
                        look_ahead_start_i = chunk_end_i
                        look_ahead_end_i = look_ahead_start_i + n_look_ahead_rows
                        try:
                            data.look_ahead = self.store.select(
                                key=key,
                                columns=cols,
                                start=look_ahead_start_i,
                                stop=look_ahead_end_i)
                        except ValueError:
                            data.look_ahead = pd.DataFrame()
                    else:
                        data.look_ahead = pd.DataFrame()

                data.timeframe = _timeframe_for_chunk(there_are_more_subchunks,
                                                      chunk_i,
                                                      window_intersect,
                                                      data.index)
                yield data
                del data
    def _save_metadata_for_disaggregation(
        self,
        output_datastore,
        sample_period,
        measurement,
        timeframes,
        building,
        meters=None,
        num_meters=None,
        supervised=True,
    ):
        """Add metadata for disaggregated appliance estimates to datastore.

        This method returns nothing.  It sets the metadata
        in `output_datastore`.

        Note that `self.MODEL_NAME` needs to be set to a string before
        calling this method.  For example, we use `self.MODEL_NAME = 'CO'`
        for Combinatorial Optimisation.

        Parameters
        ----------
        output_datastore : nilmtk.DataStore subclass object
            The datastore to write metadata into.
        sample_period : int
            The sample period, in seconds, used for both the
            mains and the disaggregated appliance estimates.
        measurement : 2-tuple of strings
            In the form (<physical_quantity>, <type>) e.g.
            ("power", "active")
        timeframes : list of nilmtk.TimeFrames or nilmtk.TimeFrameGroup
            The TimeFrames over which this data is valid for.
        building : int
            The building instance number (starting from 1)
        supervised : bool, defaults to True
            Is this a supervised NILM algorithm?
        meters : list of nilmtk.ElecMeters, optional
            Required if `supervised=True`
        num_meters : int
            Required if `supervised=False`
        """

        # TODO: `preprocessing_applied` for all meters
        # TODO: submeter measurement should probably be the mains
        #       measurement we used to train on, not the mains measurement.

        # DataSet and MeterDevice metadata:
        building_path = "/building{}".format(building)
        mains_data_location = building_path + "/elec/meter1"

        # Both the model's virtual device and the mains share the same
        # sample period and measurement description.
        meter_devices = {
            self.MODEL_NAME: {
                "model": self.MODEL_NAME,
                "sample_period": sample_period,
                "max_sample_period": sample_period,
                "measurements": [{"physical_quantity": measurement[0], "type": measurement[1]}],
            },
            "mains": {
                "model": "mains",
                "sample_period": sample_period,
                "max_sample_period": sample_period,
                "measurements": [{"physical_quantity": measurement[0], "type": measurement[1]}],
            },
        }

        # Overall extent of the output: from the start of the first
        # merged timeframe to the end of the last.
        merged_timeframes = merge_timeframes(timeframes, gap=sample_period)
        total_timeframe = TimeFrame(merged_timeframes[0].start, merged_timeframes[-1].end)

        date_now = datetime.now().isoformat().split(".")[0]
        dataset_metadata = {
            "name": self.MODEL_NAME,
            "date": date_now,
            "meter_devices": meter_devices,
            "timeframe": total_timeframe.to_dict(),
        }
        output_datastore.save_metadata("/", dataset_metadata)

        # Building metadata

        # Mains meter:
        elec_meters = {
            1: {
                "device_model": "mains",
                "site_meter": True,
                "data_location": mains_data_location,
                "preprocessing_applied": {},  # TODO
                "statistics": {"timeframe": total_timeframe.to_dict()},
            }
        }

        def update_elec_meters(meter_instance):
            # Register one disaggregated submeter under the mains meter.
            elec_meters.update(
                {
                    meter_instance: {
                        "device_model": self.MODEL_NAME,
                        "submeter_of": 1,
                        "data_location": ("{}/elec/meter{}".format(building_path, meter_instance)),
                        "preprocessing_applied": {},  # TODO
                        "statistics": {"timeframe": total_timeframe.to_dict()},
                    }
                }
            )

        # Appliances and submeters:
        appliances = []
        if supervised:
            # NOTE(review): assumes `meters` is provided when
            # supervised=True; passing None here raises a TypeError.
            for meter in meters:
                meter_instance = meter.instance()
                update_elec_meters(meter_instance)

                for app in meter.appliances:
                    appliance = {
                        "meters": [meter_instance],
                        "type": app.identifier.type,
                        "instance": app.identifier.instance
                        # TODO this `instance` will only be correct when the
                        # model is trained on the same house as it is tested on
                        # https://github.com/nilmtk/nilmtk/issues/194
                    }
                    appliances.append(appliance)

                # Setting the name if it exists
                if meter.name:
                    if len(meter.name) > 0:
                        elec_meters[meter_instance]["name"] = meter.name
        else:  # Unsupervised
            # Submeters:
            # Starts at 2 because meter 1 is mains.
            for chan in range(2, num_meters + 2):
                update_elec_meters(meter_instance=chan)
                appliance = {
                    "meters": [chan],
                    "type": "unknown",
                    "instance": chan - 1
                    # TODO this `instance` will only be correct when the
                    # model is trained on the same house as it is tested on
                    # https://github.com/nilmtk/nilmtk/issues/194
                }
                appliances.append(appliance)

        building_metadata = {"instance": building, "elec_meters": elec_meters, "appliances": appliances}

        output_datastore.save_metadata(building_path, building_metadata)
Пример #28
0
def _get_good_sections(df, sample_period):
    """Return the list of TimeFrames over which `df` has contiguous samples.

    A "good section" is a maximal run of consecutive index timestamps whose
    spacing is <= `sample_period` seconds.

    Code adapted from nilmtk[1]/nilmtk/stats/goodsections.py

    [1] https://github.com/nilmtk/nilmtk/

    Parameters
    ----------
    df : pd.DataFrame or pd.Series with a DatetimeIndex.
    sample_period : number
        Expected sample period in seconds.

    Returns
    -------
    list of TimeFrame
    """
    index = df.dropna().sort_index().index
    df_time_end = df.index[-1] + pd.Timedelta(seconds=sample_period)
    del df

    if len(index) < 2:
        return []

    timedeltas_sec = timedelta64_to_secs(np.diff(index.values))
    timedeltas_check = timedeltas_sec <= sample_period

    # Memory management
    del timedeltas_sec
    gc.collect()

    # Prepend False so timedeltas_check aligns with `index`; then a +1
    # transition marks the start of a good section and -1 marks its end.
    timedeltas_check = np.concatenate(
        [[False],
         timedeltas_check])
    # `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin `int` is the documented replacement.
    transitions = np.diff(timedeltas_check.astype(int))

    # Memory management
    del timedeltas_check
    gc.collect()

    good_sect_starts = list(index[:-1][transitions ==  1])
    good_sect_ends   = list(index[:-1][transitions == -1])

    # Memory management
    del index
    gc.collect()

    # Work out if this chunk ends with an open-ended good section
    if len(good_sect_ends) == 0:
        ends_with_open_ended_good_section = (
            len(good_sect_starts) > 0)
    elif len(good_sect_starts) > 0:
        # We have good_sect_ends and good_sect_starts
        ends_with_open_ended_good_section = (
            good_sect_ends[-1] < good_sect_starts[-1])
    else:
        # We have good_sect_ends but no good_sect_starts
        ends_with_open_ended_good_section = False

    if ends_with_open_ended_good_section:
        good_sect_ends += [df_time_end]

    assert len(good_sect_starts) == len(good_sect_ends)

    sections = [TimeFrame(start, end)
                for start, end in zip(good_sect_starts, good_sect_ends)
                if not (start == end and start is not None)]

    # Memory management
    del good_sect_starts
    del good_sect_ends
    gc.collect()

    return sections
    def add_metadata(self, output_datastore, measurement, timeframes, mains, timezone, load_kwargs):
        """Write NILMTK dataset/building metadata for the CO disaggregation
        output into `output_datastore`.

        Parameters
        ----------
        output_datastore : nilmtk DataStore to receive the metadata.
        measurement : 2-tuple (physical_quantity, type), e.g. ('power', 'active').
        timeframes : list of TimeFrame covering the disaggregated data.
        mains : mains ElecMeter/MeterGroup; used for building id.
        timezone : str, timezone name stored in the dataset metadata.
        load_kwargs : dict; 'output_name' and 'resample_seconds' are popped
            from it (NOTE: this mutates the caller's dict).
        """
        date_now = datetime.now().isoformat().split('.')[0]
        output_name = load_kwargs.pop('output_name', 'NILMTK_CO_' + date_now)
        resample_seconds = load_kwargs.pop('resample_seconds', 60)

        building_path = '/building{}'.format(mains.building())
        mains_data_location = '{}/elec/meter1'.format(building_path)

        # DataSet and MeterDevice metadata:
        meter_devices = {
            'CO': {
                'model': 'CO',
                'sample_period': resample_seconds,
                'max_sample_period': resample_seconds,
                'measurements': [{
                    'physical_quantity': measurement[0],
                    'type': measurement[1]
                }]
            },
            'mains': {
                'model': 'mains',
                'sample_period': resample_seconds,
                'max_sample_period': resample_seconds,
                'measurements': [{
                    'physical_quantity': measurement[0],
                    'type': measurement[1]
                }]
            }
        }

        merged_timeframes = merge_timeframes(timeframes, gap=resample_seconds)
        total_timeframe = TimeFrame(merged_timeframes[0].start,
                                    merged_timeframes[-1].end)

        dataset_metadata = {'name': output_name, 'date': date_now,
                            'meter_devices': meter_devices,
                            'timeframe': total_timeframe.to_dict(),
                            'timezone': timezone}
        output_datastore.save_metadata('/', dataset_metadata)

        # Building metadata

        # Mains meter:
        elec_meters = {
            1: {
                'device_model': 'mains',
                'site_meter': True,
                'data_location': mains_data_location,
                'preprocessing_applied': {},  # TODO
                'statistics': {
                    'timeframe': total_timeframe.to_dict(),
                    'good_sections': list_of_timeframe_dicts(merged_timeframes)
                }
            }
        }

        # Appliances and submeters (one entry per trained meter):
        appliances = []
        for model in self.model:
            meter = model['training_metadata']

            meter_instance = meter.instance()

            for app in meter.appliances:
                appliance = {
                    'meters': [meter_instance],
                    'type': app.identifier.type,
                    'instance': app.identifier.instance
                }
                appliances.append(appliance)

            elec_meters.update({
                meter_instance: {
                    'device_model': 'CO',
                    'submeter_of': 1,
                    'data_location': ('{}/elec/meter{}'
                                      .format(building_path, meter_instance)),
                    'preprocessing_applied': {},  # TODO
                    'statistics': {
                        'timeframe': total_timeframe.to_dict(),
                        'good_sections': list_of_timeframe_dicts(merged_timeframes)
                    }
                }
            })

            # Setting the name if it exists
            if meter.name:
                if len(meter.name) > 0:
                    elec_meters[meter_instance]['name'] = meter.name

        building_metadata = {
            'instance': mains.building(),
            'elec_meters': elec_meters,
            'appliances': appliances
        }

        output_datastore.save_metadata(building_path, building_metadata)
Пример #30
0
    def _positionActivation(self,
                            activation,
                            application,
                            building,
                            windowLen,
                            activationIndex,
                            isReal=True):
        """Position an appliance activation inside a window of ``windowLen``
        samples.

        A short activation is zero-padded with a random split between left
        (``an``) and right (``bn``) padding.  When ``isReal``, neighbouring
        activations of the same appliance that overlap the padded region are
        added back in from the appliance's power series, so the padding is
        not artificially silent.  An activation longer than the window is
        truncated to its first ``windowLen`` samples.

        NOTE: a 6-second sample period is hard-coded throughout
        (``timedelta(seconds=an * 6)``, ``freq="6S"`` etc.).

        Returns
        -------
        pd.Series of length ``windowLen`` with a DatetimeIndex.
        """
        startTime = activation.index[0]
        endTime = activation.index[-1]
        if (len(activation) < windowLen):
            # Split the required padding randomly between the two sides.
            addnum = windowLen - len(activation)
            an = self.rng.randint(0, addnum)
            bn = addnum - an
            positioned_activation = np.pad(activation.values,
                                           pad_width=(an, 0),
                                           mode='constant')
            positioned_activation = np.pad(positioned_activation,
                                           pad_width=(0, bn),
                                           mode='constant')
            seq_start_time = activation.index[0] - timedelta(seconds=an * 6)
            index = pd.date_range(seq_start_time,
                                  periods=windowLen,
                                  freq="{:d}S".format(6))

            if isReal:
                intersections = []
                activationsnum = len(
                    self.activationsApp[application][building])
                # Walk backwards over earlier activations overlapping the
                # left padding.  (`is not None` replaces the original
                # `!= None` identity-comparison anti-pattern.)
                if an > 0 and activationIndex >= 1:
                    beforeStart = TimeFrame(
                        startTime - timedelta(seconds=an * 6), startTime)
                    ai = activationIndex - 1
                    beforeActivation = self.activationsApp[application][
                        building][ai]
                    beforeSection = TimeFrame(beforeActivation.index[0],
                                              beforeActivation.index[-1])
                    intersection = beforeSection.intersection(beforeStart)
                    while intersection.start is not None and intersection.end is not None:
                        intersections.append(intersection)
                        ai = ai - 1
                        if ai < 0:
                            break
                        beforeActivation = self.activationsApp[application][
                            building][ai]
                        beforeSection = TimeFrame(beforeActivation.index[0],
                                                  beforeActivation.index[-1])
                        intersection = beforeSection.intersection(beforeStart)
                # Walk forwards over later activations overlapping the
                # right padding.
                if bn > 0 and activationIndex < activationsnum - 1:
                    afterEnd = TimeFrame(endTime,
                                         endTime + timedelta(seconds=bn * 6))
                    bi = activationIndex + 1
                    afterActivation = self.activationsApp[application][
                        building][bi]
                    afterSection = TimeFrame(afterActivation.index[0],
                                             afterActivation.index[-1])
                    intersection = afterSection.intersection(afterEnd)
                    while intersection.start is not None and intersection.end is not None:
                        intersections.append(intersection)
                        bi = bi + 1
                        if bi >= activationsnum:
                            break
                        afterActivation = self.activationsApp[application][
                            building][bi]
                        afterSection = TimeFrame(afterActivation.index[0],
                                                 afterActivation.index[-1])
                        intersection = afterSection.intersection(afterEnd)

                # Add the real appliance power for each overlapping
                # neighbour into the padded region.
                for intersection in intersections:
                    intersectionStart = intersection.start
                    intersectionEnd = intersection.end
                    length = int(
                        (intersectionEnd - intersectionStart).total_seconds() /
                        6) + 1
                    offset = int(
                        (intersectionStart - seq_start_time).total_seconds() /
                        6)
                    positioned_activation[
                        offset:offset + length] = positioned_activation[
                            offset:offset +
                            length] + self.elecApp[application][building][
                                intersectionStart:intersectionEnd].values

            positioned_activation_series = pd.Series(positioned_activation,
                                                     index=index)
        else:
            positioned_activation_series = activation[:windowLen]
        if len(positioned_activation_series) != windowLen:
            logger.error("error")
        return positioned_activation_series
Пример #31
0
    def __init__(self, **config):
        """Load mains and appliance data from a NILMTK dataset and
        pre-compute per-building activations and activation-free sections.

        Keyword arguments (via ``config``)
        ----------------------------------
        filename : str, optional
            Path to the NILMTK HDF5 dataset (default "ukdale.h5").
        startTime, endTime : str, optional
            Data window (default "2012-11-01" .. "2015-01-31").
        trainBuildings : list of int, optional (default [1, 3, 4, 5]).
        testBuildings : list of int, optional (default [2]).
        applications : list of str, required.
        targetapplication : str, required.
        randSeed : int, optional (default 0).

        Raises
        ------
        KeyError
            If 'applications' or 'targetapplication' is missing.
        """
        # BUG FIX: the original tested for 'filename' but then read
        # config['fileName'] (capital N), so supplying a filename always
        # raised KeyError.  Both now use the same key.
        if 'filename' not in config:
            self.dataSet = nilmtk.DataSet("ukdale.h5")
        else:
            self.dataSet = nilmtk.DataSet(config['filename'])

        if 'startTime' not in config or 'endTime' not in config:
            self.dataSet.set_window("2012-11-01", "2015-01-31")
        else:
            self.dataSet.set_window(config['startTime'], config['endTime'])

        if 'trainBuildings' not in config:
            self.trainBuildings = [1, 3, 4, 5]
        else:
            self.trainBuildings = config['trainBuildings']
        if 'testBuildings' not in config:
            self.testBuildings = [2]
        else:
            self.testBuildings = config['testBuildings']

        if 'applications' not in config:
            raise KeyError("please input applications")
        self.applications = config['applications']

        if 'targetapplication' not in config:
            raise KeyError("please input targetapplication")
        self.targetApplication = config['targetapplication']

        if 'randSeed' not in config:
            randSeed = 0
        else:
            randSeed = config['randSeed']

        self.otherApplications = [
            i for i in self.applications if i not in [self.targetApplication]
        ]
        self.allBuildings = set(self.trainBuildings + self.testBuildings)
        self.window = 599
        self.inputSeqs = []
        self.targetSeqs = []
        self.rng = np.random.RandomState(randSeed)
        # Per-appliance activation-extraction parameters (6 s sample period).
        activationConfig = {
            'fridge': {
                'min_off_duration': 18,  # 12 in paper here
                'min_on_duration': 60,
                'on_power_threshold': 50,
                'sample_period': 6,
            },
            'kettle': {
                'min_off_duration': 18,  # 0 in paper here
                'min_on_duration': 12,
                'on_power_threshold': 2000,
                'sample_period': 6,
            },
            'washing machine': {
                'min_off_duration': 160,
                'min_on_duration': 1800,
                'on_power_threshold': 20,
                'sample_period': 6,
            },
            'microwave': {
                'min_off_duration': 30,
                'min_on_duration': 12,
                'on_power_threshold': 200,
                'sample_period': 6,
            },
            'dish washer': {
                'min_off_duration': 1800,
                'min_on_duration': 1800,
                'on_power_threshold': 10,
                'sample_period': 6,
            }
        }

        # Load mains power and its good sections for every building.
        self.elecMains = {}
        self.goodSections = {}
        for building in self.allBuildings:
            self.goodSections[building] = self.dataSet.buildings[
                building].elec.mains().good_sections()
            self.elecMains[building] = self.dataSet.buildings[
                building].elec.mains().power_series_all_data(
                    sample_period=6,
                    sections=self.goodSections[building]).dropna()

        # Load appliance power series and extract activations per building.
        self.numApp = {}
        self.elecApp = {}
        self.activationsApp = {}
        self.activationAppSections = {}
        for app in self.applications:
            self.elecApp[app] = {}
            self.activationsApp[app] = {}
            self.numApp[app] = 0
            self.activationAppSections[app] = {}
            for building in self.allBuildings:
                try:
                    self.elecApp[app][building] = self.dataSet.buildings[
                        building].elec[app].power_series_all_data(
                            sample_period=6).dropna()

                    self.activationsApp[app][
                        building] = self.dataSet.buildings[building].elec[
                            app].get_activations(**activationConfig[app])
                    self.activationsApp[app][building] = [
                        activation.astype(np.float32)
                        for activation in self.activationsApp[app][building]
                    ]
                    self.numApp[app] += len(self.activationsApp[app][building])
                    self.activationAppSections[app][building] = TimeFrameGroup(
                    )
                    for activation in self.activationsApp[app][building]:
                        self.activationAppSections[app][building].append(
                            TimeFrame(activation.index[0],
                                      activation.index[-1]))
                except KeyError as exception:
                    # The building simply has no meter for this appliance.
                    logger.info(
                        str(building) + " has no " + app +
                        ". Full exception: {}".format(exception))
                    continue
        logger.info("Done loading NILMTK data.")

        # Drop target-appliance activations that fall outside the mains
        # data, or whose surrounding mains context is too sparse.
        for building in self.allBuildings:
            activationsToRemove = []
            try:
                activations = self.activationsApp[
                    self.targetApplication][building]
                mains = self.elecMains[building]
                for i, activation in enumerate(activations):
                    activationDuration = (activation.index[-1] -
                                          activation.index[0])
                    start = (activation.index[0] - activationDuration)
                    end = (activation.index[-1] + activationDuration)
                    if start < mains.index[0] or end > mains.index[-1]:
                        activationsToRemove.append(i)
                    else:
                        mainsForAct = mains[start:end]
                        if not self._hasSufficientSamples(
                                start, end, mainsForAct):
                            activationsToRemove.append(i)
                # Pop from the end so earlier indices stay valid.
                activationsToRemove.reverse()
                for i in activationsToRemove:
                    activations.pop(i)
                self.activationsApp[
                    self.targetApplication][building] = activations
            except KeyError:
                continue

        # Find mains good sections that contain no target activation and
        # are long enough to hold a full window.
        self.sectionsWithNoTarget = {}
        for building in self.allBuildings:
            try:
                activationsTarget = self.activationsApp[
                    self.targetApplication][building]
                mainGoodSections = self.goodSections[building]
                mains = self.elecMains[building]
                gapsBetweenActivations = TimeFrameGroup()
                prev = mains.index[0]
                for activation in activationsTarget:
                    try:
                        p2 = prev
                        gapsBetweenActivations.append(
                            TimeFrame(prev, activation.index[0]))
                        prev = activation.index[-1]
                        p1 = activation.index[0]
                    except ValueError:
                        # Overlapping/out-of-order activations make an
                        # invalid TimeFrame; log the context and carry on.
                        logger.debug("----------------------")
                        logger.debug(p1)
                        logger.debug(p2)
                        logger.debug(activation.index[0])
                        logger.debug(activation.index[-1])

                gapsBetweenActivations.append(TimeFrame(prev, mains.index[-1]))

                intersection = gapsBetweenActivations.intersection(
                    mainGoodSections)
                intersection = intersection.remove_shorter_than(6 *
                                                                self.window)
                self.sectionsWithNoTarget[building] = intersection
            except KeyError:
                continue
Пример #32
0
def load_nilmtk_activations( dataset_paths,
                             target_appliance_name,
                             appliance_names,
                             on_power_threshold,
                             min_on_duration,
                             min_off_duration,
                             sample_period,
                             windows,
                             sanity_check=1 ):
    """Load target-appliance activations from one or more NILMTK datasets.

    Parameters
    ----------
    dataset_paths : dict mapping dataset name -> path of its HDF5 file.
    target_appliance_name : str, canonical appliance name.
    appliance_names : dict mapping dataset name -> {target name: [aliases]}.
    on_power_threshold, min_on_duration, min_off_duration, sample_period :
        passed through to nilmtk's ``get_activations``.
    windows : dict
        Structure example:
        {
            'UKDALE': {
                'train': {<building_i>: <window>},
                'unseen_activations_of_seen_appliances': {<building_i>: <window>},
                'unseen_appliances': {<building_i>: <window>}
            }
        }
    sanity_check : truthy, optional
        When truthy (the default), validate `windows` with check_windows.

    Returns
    -------
    all_activations : dict
        Structure example:
        {<train | unseen_appliances | unseen_activations_of_seen_appliances>: {
             <appliance>: {
                 <building_name>: [<activations>]
        }}}
        Each activation is a pd.Series with DatetimeIndex and the following
        metadata attributes: building, appliance, fold.
    """
    logger.info("Loading NILMTK activations...")

    if sanity_check:
        # Sanity check
        for dataset in windows:
            check_windows(windows[dataset])

    all_activations = {}
    for dataset_name, folds in windows.items():
        # Load dataset
        dataset = nilmtk.DataSet(dataset_paths[dataset_name])
        appliance_aliases = appliance_names[dataset_name][target_appliance_name]

        for fold, buildings_and_windows in folds.items():
            logger.info(
                "Loading activations for fold {}.....".format(fold))
            for building_i, windows_for_building in buildings_and_windows.items():
                elec = dataset.buildings[building_i].elec
                building_name = (
                    dataset.metadata['name'] + '_building_{}'.format(building_i))

                # Collect every submeter whose appliance matches an alias.
                appliance_meters = []
                for meter in elec.meters:
                    if meter.is_site_meter():
                        continue

                    append_meter = False
                    for a in meter.appliances:
                        if a.type['type'] in appliance_aliases:
                            append_meter = True
                    if append_meter:
                        appliance_meters.append(meter)
                        # Was a bare print(); route through the module logger.
                        logger.debug("Matched appliance meter: %s",
                                     meter.appliances)

                if not appliance_meters:
                    logger.info(
                        "No {} found in {}".format(target_appliance_name, building_name))
                    continue

                # Several matching meters are combined into a MeterGroup.
                if len(appliance_meters) > 1:
                    meter = nilmtk.MeterGroup(meters=appliance_meters)
                else:
                    meter = appliance_meters[0]
                logger.info(
                    "Loading {} for {}...".format(target_appliance_name, building_name))

                meter_activations = []
                for window in windows_for_building:
                    if dataset_name == "ECO":
                        # set_window() does not work for ECO; set the store
                        # window directly instead.
                        dataset.store.window = TimeFrame(start=window[0], end=window[1], tz='GMT')
                    else:
                        dataset.set_window(*window)
                    # Get activations_for_fold and process them
                    meter_activations_for_building = meter.get_activations(
                        sample_period=sample_period,
                        min_off_duration=min_off_duration,
                        min_on_duration=min_on_duration,
                        on_power_threshold=on_power_threshold,
                        resample_kwargs={'fill_method': 'ffill', 'how': 'mean', 'limit': 20})
                    meter_activations.extend(meter_activations_for_building)

                # Save
                if meter_activations:
                    all_activations.setdefault(
                        fold, {}).setdefault(
                        target_appliance_name, {})[building_name] = meter_activations
                logger.info(
                    "Loaded {} {} activations from {}."
                    .format(len(meter_activations), target_appliance_name, building_name))

        dataset.store.close()

    logger.info("Done loading NILMTK activations.")
    return all_activations
Пример #33
0
def load_data_from_nilmtk_datasets(windows, dataset_paths, appliances, target_appliance_name, sample_period):
    """Load aligned mains and target-appliance power data from NILMTK
    datasets.

    Parameters
    ----------
    windows : dict mapping dataset name -> {fold: {building_i: [windows]}}.
    dataset_paths : dict mapping dataset name -> path of its HDF5 file.
    appliances : dict mapping dataset name -> {target name: [aliases]}.
    target_appliance_name : str, canonical appliance name.
    sample_period : number, resample period in seconds.

    Returns
    -------
    (data, data_good_sections) : pair of dicts keyed by fold then building
        name; `data` holds DataFrames with 'mains' and 'target' columns,
        `data_good_sections` holds the mains good sections.
    """
    data = {}
    data_good_sections = {}

    logger.info("Loading NILMTK data...")

    for dataset_name, folds in windows.items():
        # Load dataset
        dataset = nilmtk.DataSet(dataset_paths[dataset_name])

        for fold, buildings_and_windows in folds.items():
            for building_i, windows_for_building in buildings_and_windows.items():
                dataset.set_window(None, None)
                elec = dataset.buildings[building_i].elec

                building_name = (
                    dataset.metadata['name'] +
                    '_building_{}'.format(building_i))
                logger.info(
                    "Loading data for {}...".format(building_name))
                mains_meter = elec.mains()
                good_sections = get_effective_good_sections(mains_meter)

                # Collect every submeter whose appliance matches an alias.
                appliance_aliases = appliances[dataset_name][target_appliance_name]
                appliance_meters = []
                for meter in elec.meters:
                    if meter.is_site_meter():
                        continue

                    if len(meter.appliances) == 1:
                        appliancetype = meter.appliances[0].type['type']
                        if appliancetype in appliance_aliases:
                            appliance_meters.append(meter)
                    else:
                        append_meter = False
                        for a in meter.appliances:
                            if a.type['type'] in appliance_aliases:
                                append_meter = True
                        if append_meter:
                            appliance_meters.append(meter)
                            # Was a bare print(); route through the logger.
                            logger.debug("Matched multi-appliance meter: %s",
                                         meter.appliances)

                if not appliance_meters:
                    logger.info(
                        "No {} found in {}".format(target_appliance_name, building_name))
                    continue

                if len(appliance_meters) > 1:
                    appliance_metergroup = nilmtk.MeterGroup(meters=appliance_meters)
                else:
                    appliance_metergroup = appliance_meters[0]
                data_good_sections.setdefault(fold, {})[building_name] = good_sections

                def load_data(meter):
                    # Resampled power series as float32, or None if the
                    # meter has no data.
                    df = meter.power_series_all_data(
                        sample_period=sample_period
                        )
                    if df is not None:
                        return df.astype(np.float32).dropna()
                    else:
                        return None

                dfs = []
                for window in windows_for_building:
                    if dataset_name == "ECO":
                        # set_window() does not work for ECO; set the store
                        # window directly instead.
                        dataset.store.window = TimeFrame(start=window[0], end=window[1], tz='GMT')
                    else:
                        if window is None:
                            # Was `ipdb.set_trace()`: fail loudly instead of
                            # dropping into a debugger in production.
                            raise ValueError(
                                "window is None for {} in fold {}"
                                .format(building_name, fold))
                        dataset.set_window(*window)
                    mains_data = load_data(mains_meter)
                    appliance_data = load_data(appliance_metergroup)
                    if (mains_data is None) or (appliance_data is None):
                        continue
                    df = pd.DataFrame(
                        {'mains': mains_data, 'target': appliance_data},
                        dtype=np.float32).dropna()
                    del mains_data
                    del appliance_data
                    if not df.empty:
                        dfs.append(df)

                # pd.concat raises ValueError on an empty list; skip the
                # building instead of crashing.
                if not dfs:
                    logger.info(
                        "No overlapping data for {} in fold {}."
                        .format(building_name, fold))
                    continue
                df = pd.concat(dfs, axis=0)

                # Keep only the rows falling inside mains good sections.
                dfs = []
                for gs in good_sections:
                    dfslice = gs.slice(df)
                    if not dfslice.empty:
                        dfs.append(dfslice)
                if not dfs:
                    logger.info(
                        "No good-section data for {} in fold {}."
                        .format(building_name, fold))
                    continue
                df = pd.concat(dfs, axis=0)

                if not df.empty:
                    data.setdefault(fold, {})[building_name] = df
                    # Only log the index range when there is data to index.
                    logger.info(
                        "Loaded data from building {} for fold {}"
                        " from {} to {}."
                        .format(building_name, fold, df.index[0], df.index[-1]))

        dataset.store.close()

    logger.info("Done loading NILMTK data.")
    return data, data_good_sections
Пример #34
0
    def test_intersection(self):
        """Exercise TimeFrame.intersection: identity, empty/unbounded
        frames, containment in both directions, disjoint frames, disabled
        frames, and partial overlaps cropping either end.
        """
        tf = TimeFrame("2012-01-01 00:00:00", "2013-01-01 00:00:00")
        self.assertFalse(tf.empty)

        # Intersection with itself is the identity.
        new_tf = tf.intersection(tf)
        self.assertEqual(tf, new_tf)
        self.assertFalse(new_tf.empty)

        # An unbounded TimeFrame acts as a neutral element.
        new_tf = tf.intersection(TimeFrame())
        self.assertEqual(tf, new_tf)
        self.assertFalse(new_tf.empty)

        new_tf = tf.intersection(TimeFrame(start="1990-01-01"))
        self.assertEqual(tf, new_tf)
        self.assertFalse(new_tf.empty)

        new_tf = tf.intersection(TimeFrame(end="2100-01-01"))
        self.assertEqual(tf, new_tf)
        self.assertFalse(new_tf.empty)

        # A frame fully inside tf is returned unchanged.
        small_tf = TimeFrame("2012-01-05 00:00:00", "2012-01-06 00:00:00")
        new_tf = tf.intersection(small_tf)
        self.assertEqual(small_tf, new_tf)
        self.assertFalse(new_tf.empty)

        # A frame fully containing tf yields tf.
        large_tf = TimeFrame("2010-01-01 00:00:00", "2014-01-01 00:00:00")
        new_tf = tf.intersection(large_tf)
        self.assertEqual(tf, new_tf)
        self.assertFalse(new_tf.empty)

        # Disjoint frames intersect to an empty TimeFrame.
        disjoint = TimeFrame("2015-01-01", "2016-01-01")
        new_tf = tf.intersection(disjoint)
        self.assertTrue(new_tf.empty)

        # try intersecting with empty TF
        new_tf = tf.intersection(new_tf)
        self.assertTrue(new_tf.empty)

        # A disabled TimeFrame behaves as unbounded: the other frame wins.
        disjoint = TimeFrame("2015-01-01", "2016-01-01")
        tf.enabled = False
        new_tf = tf.intersection(disjoint)
        self.assertEqual(new_tf, disjoint)
        self.assertFalse(new_tf.empty)
        tf.enabled = True

        # crop into the start of tf
        new_start = "2012-01-05 04:05:06"
        new_tf = tf.intersection(TimeFrame(start=new_start, end="2014-01-01"))
        self.assertEqual(new_tf, TimeFrame(start=new_start, end=tf.end))
        self.assertFalse(new_tf.empty)

        # crop into the end of tf
        new_end = "2012-01-07 04:05:06"
        new_tf = tf.intersection(TimeFrame(start="2011-01-01", end=new_end))
        self.assertEqual(new_tf, TimeFrame(start=tf.start, end=new_end))
        self.assertFalse(new_tf.empty)
Пример #35
0
    def load(self,
             key,
             columns=None,
             sections=None,
             n_look_ahead_rows=0,
             chunksize=MAX_MEM_ALLOWANCE_IN_BYTES,
             verbose=False):
        # TODO: calculate chunksize default based on physical
        # memory installed and number of columns

        # Make sure key has a slash at the front but not at the end.
        if key[0] != '/':
            key = '/' + key
        if len(key) > 1 and key[-1] == '/':
            key = key[:-1]

        # Make sure chunksize is an int otherwise `range` complains later.
        chunksize = np.int64(chunksize)

        # Default to one unbounded TimeFrame ("load everything") and
        # normalise to a TimeFrameGroup so iteration below is uniform.
        sections = [TimeFrame()] if sections is None else sections
        sections = TimeFrameGroup(sections)

        # Replace any Nones with '' in columns:
        if columns is not None:
            columns = [('' if pq is None else pq, '' if ac is None else ac)
                       for pq, ac in columns]

        if verbose:
            print("HDFDataStore.load(key='{}', columns='{}', sections='{}',"
                  " n_look_ahead_rows='{}', chunksize='{}')".format(
                      key, columns, sections, n_look_ahead_rows, chunksize))

        # Flag readable by callers after iteration: set False below if any
        # section had to be split into more than one chunk.
        self.all_sections_smaller_than_chunksize = True

        for section in sections:
            if verbose:
                print("   ", section)
            # Restrict the requested section to this store's window.
            window_intersect = self.window.intersection(section)

            # No part of this section lies inside the window: yield an
            # empty frame tagged with the requested section and move on.
            if window_intersect.empty:
                data = pd.DataFrame()
                data.timeframe = section
                yield data
                continue

            terms = window_intersect.query_terms('window_intersect')
            if terms is None:
                # No query terms means "the whole table".
                section_start_i = 0
                section_end_i = self.store.get_storer(key).nrows
                if section_end_i <= 1:
                    data = pd.DataFrame()
                    data.timeframe = section
                    yield data
                    continue
            else:
                try:
                    coords = self.store.select_as_coordinates(key=key,
                                                              where=terms)
                except AttributeError as e:
                    # PyTables surfaces a missing key as this exact
                    # AttributeError; translate it into a clearer KeyError.
                    if str(e) == ("'NoneType' object has no attribute "
                                  "'read_coordinates'"):
                        raise KeyError("key '{}' not found".format(key))
                    else:
                        raise
                n_coords = len(coords)
                if n_coords == 0:
                    # Window intersects the section but matches no rows.
                    data = pd.DataFrame()
                    data.timeframe = window_intersect
                    yield data
                    continue

                # NOTE: coords[-1] is the last matching row index
                # (inclusive); the +1 applied per-chunk below compensates.
                section_start_i = coords[0]
                section_end_i = coords[-1]
                del coords  # may be large; release before loading data

            # Split [section_start_i, section_end_i) into chunksize strides.
            slice_starts = range(section_start_i, section_end_i, chunksize)
            n_chunks = int(
                np.ceil((section_end_i - section_start_i) / chunksize))

            if n_chunks > 1:
                self.all_sections_smaller_than_chunksize = False

            for chunk_i, chunk_start_i in enumerate(slice_starts):
                chunk_end_i = chunk_start_i + chunksize
                there_are_more_subchunks = (chunk_i < n_chunks - 1)

                if chunk_end_i > section_end_i:
                    chunk_end_i = section_end_i
                # HDFStore.select's `stop` is exclusive, so +1 makes the
                # chunk include its last row.  NOTE(review): because this
                # is unconditional, consecutive chunks appear to overlap by
                # one row — looks intentional upstream; confirm.
                chunk_end_i += 1

                data = self.store.select(key=key,
                                         columns=columns,
                                         start=chunk_start_i,
                                         stop=chunk_end_i)

                # Load look ahead if necessary
                if n_look_ahead_rows > 0:
                    if len(data.index) > 0:
                        look_ahead_start_i = chunk_end_i
                        look_ahead_end_i = look_ahead_start_i + n_look_ahead_rows
                        try:
                            look_ahead = self.store.select(
                                key=key,
                                columns=columns,
                                start=look_ahead_start_i,
                                stop=look_ahead_end_i)
                        except ValueError:
                            # Reading past the end of the table: no look-ahead.
                            look_ahead = pd.DataFrame()
                    else:
                        look_ahead = pd.DataFrame()

                    with warnings.catch_warnings():
                        # Silence "Pandas doesn't allow columns to be created via a new attribute name"
                        # since we're not adding a column
                        warnings.filterwarnings(
                            'ignore',
                            category=UserWarning,
                            message=".*Pandas doesn't allow columns.*")
                        setattr(data, 'look_ahead', look_ahead)

                # Tag the chunk with the timeframe it covers so downstream
                # code knows the span even when the chunk itself is empty.
                data.timeframe = _timeframe_for_chunk(there_are_more_subchunks,
                                                      chunk_i,
                                                      window_intersect,
                                                      data.index)
                yield data
                del data  # release the chunk before loading the next one
# Example #36 (0 votes)
    def test_adjacent(self):
        """TimeFrame.adjacent(): True when one frame starts where the other
        ends, optionally within a `gap` tolerance; symmetric in its
        arguments.
        """
        # Touching end-points: tf1 ends at the exact instant tf2 starts.
        # (They share only that boundary instant — they do not overlap.)
        tf1 = TimeFrame("2011-01-01 00:00:00", "2011-02-01 00:00:00")
        tf2 = TimeFrame("2011-02-01 00:00:00", "2011-03-01 00:00:00")
        self.assertTrue(tf1.adjacent(tf2))
        self.assertTrue(tf2.adjacent(tf1))

        # One-second gap and no `gap` allowance: not adjacent.
        tf1 = TimeFrame("2011-01-01 00:00:00", "2011-02-01 00:00:00")
        tf2 = TimeFrame("2011-02-01 00:00:01", "2011-03-01 00:00:00")
        self.assertFalse(tf1.adjacent(tf2))
        self.assertFalse(tf2.adjacent(tf1))

        # Same one-second gap, but with a `gap` tolerance >= 1 the frames
        # count as adjacent (gap is apparently in seconds — gap=1 accepts
        # the one-second spacing).
        tf1 = TimeFrame("2011-01-01 00:00:00", "2011-02-01 00:00:00")
        tf2 = TimeFrame("2011-02-01 00:00:01", "2011-03-01 00:00:00")
        self.assertTrue(tf1.adjacent(tf2, gap=1))
        self.assertTrue(tf2.adjacent(tf1, gap=1))
        self.assertTrue(tf1.adjacent(tf2, gap=100))
        self.assertTrue(tf2.adjacent(tf1, gap=100))