    def store_input(
        self,
        filepath_or_buffer,
        df_input=None,
        src_spec=None,
        dest_spec=None,
        file_extension=None,
    ):
        """For usage capturing input data for unit tests."""
        if not df_input:
            df_input = self.get_full_input()

        if not src_spec:
            src_spec = self.internal_spec

        if not dest_spec:
            dest_spec = self.destination.data_spec

        if not file_extension:
            file_extension = self.destination.file_extension

        _df = convert_spec(df=df_input,
                           src_spec=src_spec,
                           dest_spec=dest_spec,
                           copy=True)

        self.destination.write_data_by_extension(
            _df,
            filepath_or_buffer,
            data_spec=dest_spec,
            file_extension=file_extension,
        )
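A minimal usage sketch for store_input, assuming the same test-class layout as the examples below (self.data_client, get_sim_name); the fixture directory is hypothetical:

    def test_store_input(self):
        sim_name = self.get_sim_name()
        # hypothetical fixture path; the remaining arguments fall back to the
        # defaults resolved inside store_input above
        _fpath = os.path.join(
            "tests", "fixtures",
            sim_name + "." + self.data_client.destination.file_extension,
        )
        self.data_client.store_input(_fpath)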
Example #2
    def test_put_data(self):
        sim_name = self.get_sim_name()
        _df = self.data_client.get_full_input()
        self.data_client.destination.put_data(_df,
                                              sim_name,
                                              src_spec=Internal())
        _gcs_uri = self.data_client.destination.get_gcs_uri(sim_name)

        r_df = pd.read_parquet(_gcs_uri)
        cr_df = convert_spec(
            r_df,
            src_spec=self.data_client.destination.data_spec,
            dest_spec=Internal(),
            src_nullable=True,
            dest_nullable=False,
        )

        # remove states not in dest spec
        _dest_states = {
            v["internal_state"] for v in
            self.data_client.destination.data_spec.full.spec.values()
        }
        _df = _df.drop(
            columns=[_col for _col in _df.columns if _col not in _dest_states])

        pd.testing.assert_frame_equal(_df, cr_df)
    def test_put_data(self):
        sim_name = self.get_sim_name()
        _df = self.data_client.get_full_input()
        self.data_client.destination.put_data(_df, sim_name, src_spec=Internal())
        _fpath = os.path.join(
            self.data_client.destination.local_cache,
            self.data_client.destination.operator_name,
            sim_name + "." + self.data_client.destination.file_extension,
        )
        r_df = pd.read_parquet(_fpath)
        cr_df = convert_spec(
            r_df,
            src_spec=self.data_client.destination.data_spec,
            dest_spec=Internal(),
            src_nullable=True,
            dest_nullable=False,
        )

        # remove states not in dest spec
        _dest_states = {
            v["internal_state"] for v in
            self.data_client.destination.data_spec.full.spec.values()
        }
        _df = _df.drop(
            columns=[_col for _col in _df.columns if _col not in _dest_states])

        pd.testing.assert_frame_equal(_df, cr_df)
    def put_data(self, df, sim_name, src_spec):
        """Convert df to the destination data_spec and write the local cache."""
        _df = convert_spec(df=df,
                           src_spec=src_spec,
                           dest_spec=self.data_spec,
                           copy=True)
        local_cache_file = self.get_local_cache_file(sim_name)
        self.put_local_cache(_df, local_cache_file)
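get_local_cache_file and put_local_cache are referenced above but not shown. A minimal sketch of what they might look like, assuming the parquet cache layout <local_cache>/<operator_name>/<name>.<file_extension> that the tests above join by hand:

    def get_local_cache_file(self, identifier):
        # hypothetical sketch: mirrors the path the tests build manually
        return os.path.join(
            self.local_cache,
            self.operator_name,
            identifier + "." + self.file_extension,
        )

    def put_local_cache(self, _df, local_cache_file):
        # hypothetical sketch: assumes a parquet-backed local cache
        os.makedirs(os.path.dirname(local_cache_file), exist_ok=True)
        _df.to_parquet(local_cache_file, index=False)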
    def get_data(self, sim_config):
        """Read the local cache and convert it to the Internal spec."""
        local_cache_file = self.get_local_cache_file(
            identifier=sim_config["identifier"]
        )
        _data = self.get_local_cache(local_cache_file)
        _data = self.drop_unused_columns(_data=_data)
        _data = convert_spec(
            df=_data, src_spec=self.data_spec, dest_spec=Internal(), copy=False
        )
        return _data
    def get_data(self, sim_config):
        """Read the local cache, falling back to GCS on a cache miss."""
        # first check if the file is in the local cache
        local_cache_file = self.get_local_cache_file(
            identifier=sim_config["identifier"])
        _data = self.get_local_cache(local_cache_file)
        if _data.empty:
            _data = self.get_gcs_cache(sim_config, local_cache_file)
        _data = self.drop_unused_columns(_data=_data)
        _data = convert_spec(df=_data,
                             src_spec=self.data_spec,
                             dest_spec=Internal())
        return _data
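get_local_cache is likewise not shown; the get_data variants above rely on it returning an empty DataFrame on a cache miss, so a sketch under that assumption:

    def get_local_cache(self, local_cache_file):
        # hypothetical sketch: an empty DataFrame signals a cache miss, which
        # triggers the get_gcs_cache fallback in get_data above
        if local_cache_file and os.path.exists(local_cache_file):
            return pd.read_parquet(local_cache_file)
        return pd.DataFrame()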
Example #7
    def test_put_data(self):
        sim_name = self.get_sim_name()
        _df = self.data_client.get_full_input()
        self.data_client.destination.put_data(
            _df, sim_name, src_spec=Internal()
        )
        _gcs_uri = self.data_client.destination.get_gcs_uri(sim_name)

        r_df = pd.read_parquet(_gcs_uri)
        cr_df = convert_spec(
            r_df,
            src_spec=self.data_client.destination.data_spec,
            dest_spec=Internal(),
        )
        assert _df.equals(cr_df)
Example #8
    def test_put_data(self):
        sim_name = self.get_sim_name()
        _df = self.data_client.get_full_input()
        self.data_client.destination.put_data(_df,
                                              sim_name,
                                              src_spec=Internal())
        _fpath = os.path.join(
            self.data_client.destination.local_cache,
            self.data_client.destination.operator_name,
            sim_name + "." + self.data_client.destination.file_extension,
        )
        r_df = pd.read_parquet(_fpath)
        cr_df = convert_spec(
            r_df,
            src_spec=self.data_client.destination.data_spec,
            dest_spec=Internal(),
        )
        assert _df.equals(cr_df)
Example #9
    def get_data(self):
        """Load, clean, and fill source data, then build the channel objects."""
        # check for an invalid start/end combination
        if self.sim_config["end_utc"] <= self.sim_config["start_utc"]:
            raise ValueError(
                "sim_config contains invalid start_utc >= end_utc.")
        # load from cache or download data from source
        _data = self.source.get_data(self.sim_config)
        if _data.empty:
            logger.error(
                "EMPTY DATA SOURCE: \nsim_config={} \nsource={}\n".format(
                    self.sim_config, self.source))
            _data = self.internal_spec.get_empty_df()

        # remove any fully duplicated records
        _data = _data.drop_duplicates(ignore_index=True)

        # remove multiple records for the same datetime
        # ISM may contain several entries for the exact same datetime;
        # in this case keep the record with the most combined runtime,
        # because in observed cases the extra record has 0 runtime.
        _runtime_sum_column = "sum_runtime"
        _data[_runtime_sum_column] = _data[list(
            set(self.internal_spec.equipment.spec.keys())
            & set(_data.columns))].sum(axis=1)
        # last duplicate datetime value will have maximum sum_runtime
        _data = _data.sort_values(
            [self.internal_spec.datetime_column, _runtime_sum_column],
            ascending=True,
        )
        _data = _data.drop_duplicates(subset=[STATES.DATE_TIME],
                                      keep="last",
                                      ignore_index=True)
        _data = _data.drop(columns=[_runtime_sum_column])
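        # e.g. given two rows at the same datetime with sum_runtime of 0 and
        # 300, the ascending sort places the 300 row last, so keep="last"
        # retains the row with runtime and discards the empty duplicate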

        # truncate the data to desired simulation start and end time
        _data = _data[(_data[self.internal_spec.datetime_column] >=
                       self.sim_config["start_utc"])
                      & (_data[self.internal_spec.datetime_column] <=
                         self.sim_config["end_utc"])].reset_index(drop=True)

        # remove unused categories from categorical columns after the date
        # range for the simulation is selected
        for _cat_col in [
                _col for _col in _data.columns
                if isinstance(_data[_col].dtype, pd.api.types.CategoricalDtype)
        ]:
            _data[_cat_col] = _data[_cat_col].cat.remove_unused_categories()

        # run settings change point detection before filling missing data
        # the fill data would create false positive change points
        # the change points can also be used to correctly fill the schedule
        # and comfort preferences
        (
            _change_points_schedule,
            _change_points_comfort_prefs,
            _change_points_hvac_mode,
        ) = ThermostatChannel.get_settings_change_points(
            _data, self.internal_spec.data_period_seconds)

        _expected_period = f"{self.internal_spec.data_period_seconds}S"
        # ffill first 15 minutes of missing data periods
        _data = DataClient.fill_missing_data(
            full_data=_data,
            expected_period=_expected_period,
            data_spec=self.internal_spec,
        )
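        # e.g. assuming a 300 second data period, the 15 minute ffill above
        # covers gaps of up to three consecutive missing records; longer gaps
        # are left unfilled for the full_data_periods logic below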
        # compute full_data_periods with only first 15 minutes ffilled
        self.full_data_periods = DataClient.get_full_data_periods(
            full_data=_data,
            data_spec=self.internal_spec,
            expected_period=_expected_period,
            min_sim_period=self.sim_config["min_sim_period"],
        )
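        # full_data_periods is assumed here to be a list of contiguous
        # (start, end) windows at least min_sim_period long; when the list is
        # empty the else branch below raises, since no usable window exists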

        # need time zone before init of DateTimeChannel
        internal_timezone = DateTimeChannel.get_timezone(
            self.sim_config["latitude"], self.sim_config["longitude"])

        # there will be filled data even if there are no full_data_periods
        # the fill data is present to run continuous simulations smoothly
        # in the presence of potentially many missing data periods
        if self.full_data_periods:
            # the simulation period must be full days starting at 0 hour to use
            # SimulationControl: Run Simulation for Weather File Run Periods
            _start_utc, _end_utc = self.get_simulation_period(
                expected_period=_expected_period,
                internal_timezone=internal_timezone,
            )
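            # e.g. a start_utc falling mid-day in local time is assumed to be
            # aligned to a local midnight boundary here so the EnergyPlus
            # weather-file run periods cover only full days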

            # add records for warmup period
            _data = DataClient.add_fill_records(
                df=_data,
                data_spec=self.internal_spec,
                start_utc=_start_utc,
                end_utc=_end_utc,
                expected_period=_expected_period,
            )

            # drop records before and after the full simulation time
            # (the end bound is inclusive)
            _data = _data[
                (_data[self.internal_spec.datetime_column] >= _start_utc)
                & (_data[self.internal_spec.datetime_column] <= _end_utc
                   )].reset_index(drop=True)

            # bfill to fill remaining missing data
            # first and last records must be full because we used full data periods
            # need to add an NA_code category to stop fillna from clobbering
            # columns where NA itself means something
            na_code_name = "NA_code"
            _data[STATES.CALENDAR_EVENT] = _data[
                STATES.CALENDAR_EVENT].cat.add_categories(na_code_name)
            _data[STATES.CALENDAR_EVENT] = _data[STATES.CALENDAR_EVENT].fillna(
                na_code_name)
            # bfill then ffill to handle nulls with no data after them
            _data = _data.bfill(limit=None)
            _data = _data.ffill(limit=None)
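            # e.g. a null CALENDAR_EVENT legitimately means "no event";
            # tagging those rows with NA_code first keeps the bfill/ffill
            # above from copying a neighboring event name into them
            # (NA_code is converted back to pd.NA after resampling below)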

            _data = DataClient.resample_to_step_size(
                df=_data,
                step_size_seconds=self.sim_config["sim_step_size_seconds"],
                data_spec=self.internal_spec,
            )

            # we can replace na_code_name now that filling is complete
            _data.loc[_data[STATES.CALENDAR_EVENT] == na_code_name,
                      [STATES.CALENDAR_EVENT]] = pd.NA

            # finally convert dtypes to final types now that nulls in
            # non-nullable columns have been properly filled or removed
            _data = convert_spec(_data,
                                 src_spec=self.internal_spec,
                                 dest_spec=self.internal_spec,
                                 src_nullable=True,
                                 dest_nullable=False)

        else:
            raise ValueError(
                f"ID={self.sim_config['identifier']} has no full_data_periods "
                f"for requested duration: "
                f"start_utc={self.sim_config['start_utc']}, "
                f"end_utc={self.sim_config['end_utc']} "
                f"with min_sim_period={self.sim_config['min_sim_period']}")

        self.datetime = DateTimeChannel(
            data=_data[self.internal_spec.intersect_columns(
                _data.columns, self.internal_spec.datetime.spec)],
            spec=self.internal_spec.datetime,
            latitude=self.sim_config["latitude"],
            longitude=self.sim_config["longitude"],
            internal_timezone=internal_timezone,
        )

        # finally create the data channel objs for usage during simulation
        self.thermostat = ThermostatChannel(
            data=_data[self.internal_spec.intersect_columns(
                _data.columns, self.internal_spec.thermostat.spec)],
            spec=self.internal_spec.thermostat,
            change_points_schedule=_change_points_schedule,
            change_points_comfort_prefs=_change_points_comfort_prefs,
            change_points_hvac_mode=_change_points_hvac_mode,
        )

        self.equipment = EquipmentChannel(
            data=_data[self.internal_spec.intersect_columns(
                _data.columns, self.internal_spec.equipment.spec)],
            spec=self.internal_spec.equipment,
        )

        self.sensors = SensorsChannel(
            data=_data[self.internal_spec.intersect_columns(
                _data.columns, self.internal_spec.sensors.spec)],
            spec=self.internal_spec.sensors,
        )
        self.sensors.drop_unused_room_sensors()
        self.weather = WeatherChannel(
            data=_data[self.internal_spec.intersect_columns(
                _data.columns, self.internal_spec.weather.spec)],
            spec=self.internal_spec.weather,
            archive_tmy3_dir=self.archive_tmy3_dir,
            archive_tmy3_data_dir=self.archive_tmy3_data_dir,
            ep_tmy3_cache_dir=self.ep_tmy3_cache_dir,
            simulation_epw_dir=self.simulation_epw_dir,
        )
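A hedged end-to-end sketch of exercising get_data from a test; the sim_config contents assumed here are inferred from the keys referenced above and are not shown in the source:

    def test_get_data(self):
        # hypothetical sketch: assumes self.data_client.sim_config already
        # contains start_utc, end_utc, identifier, min_sim_period, latitude,
        # longitude, and sim_step_size_seconds
        self.data_client.get_data()
        # the channel objects built at the end of get_data should now exist
        assert self.data_client.thermostat is not None
        assert self.data_client.weather is not None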