def tsload(data_list):
    """Deserialize timeseries values from a dict to a structured DataFrame.

    :param list data_list: Timeseries values (as a list of dicts).
    :return Timeseries: Timeseries values loaded in structured DataFrame.
    """
    if not data_list:
        return Timeseries()

    # Create dataframe and rename columns to the Timeseries schema
    dataframe = pd.DataFrame(data_list)
    dataframe = dataframe.rename(
        columns={
            'timestamp': Timeseries.TIMESTAMPS_COL,
            'value': Timeseries.DATA_COL,
            'update_ts': Timeseries.UPDATE_TIMESTAMP_COL,
            'quality': Timeseries.QUALITY_COL,
        })

    # Reject if duplicate indexes.
    # Use the vectorized Series.any() rather than the Python-level any()
    # iteration over the boolean Series.
    dups = dataframe[Timeseries.TIMESTAMPS_COL].duplicated()
    if dups.any():
        abort(422, errors={
            # "index(es)" fixes the original "indexe(s)" typo in the message
            'data': '{} duplicate index(es)'.format(np.count_nonzero(dups))
        })

    # Create and return timeseries
    return Timeseries.from_dataframe(dataframe)
def test_timeseries_resample(self):
    """Resampling aggregates data and quality per target period."""
    first = dt.datetime(2017, 1, 1)
    last = dt.datetime(2018, 1, 1)
    idx = pd.date_range(first, last, freq='T', closed='left')
    values = range(len(idx))
    flags = [pos % 2 for pos in range(len(idx))]
    base = Timeseries(index=idx, data=values, quality=flags)
    base.set_update_timestamp(idx)

    # Monthly resampling, keeping the first value of each month
    monthly = deepcopy(base)
    monthly.resample('month', 'first')
    assert len(monthly.dataframe) == 12
    assert isclose(monthly.dataframe.data[0], 0)
    assert isclose(monthly.dataframe.data[1], 44640)
    assert isclose(monthly.dataframe.data[-1], 480960)
    # Quality averages to 0.5 since flags alternate 0/1
    assert np.all(np.isclose(monthly.dataframe.quality, 0.5))

    # Daily resampling, keeping the max of each day
    daily = deepcopy(base)
    daily.resample('day', 'max')
    assert len(daily.dataframe) == 365
    assert isclose(daily.dataframe.data[0], 1439)
    assert isclose(daily.dataframe.data[1], 2879)
    assert isclose(daily.dataframe.data[-1], 525599)

    # 30-minute resampling, summing each slot
    half_hourly = deepcopy(base)
    half_hourly.resample('30min', 'sum')
    assert len(half_hourly.dataframe) == len(idx) / 30
    for slot, total in enumerate(half_hourly.dataframe.data):
        assert isclose(total, sum(range(30 * slot, 30 * (slot + 1))))
def test_hdfstore_timeseries_manager_quality_defaults_to_1(self, tmpdir):
    """Check quality read as NaN in DB is changed into 1

    This is a patch for data registered before the default was introduced.
    """
    t_start = dt.datetime(2017, 1, 1)
    t_end = dt.datetime(2017, 1, 2)
    index = pd.date_range(t_start, t_end, freq='H', closed='left')
    data = np.random.rand(len(index))
    quality = np.random.rand(len(index))
    mgr = HDFStoreTimeseriesMgr(str(tmpdir))

    # Test with real values and NaN.
    # np.nan is the canonical spelling; the np.NaN alias was removed in
    # NumPy 2.0.
    quality[::2] = [np.nan] * len(quality[::2])
    ts = Timeseries(index=index, data=data, quality=quality, update_ts=index)
    mgr.set('test', '/df', ts)
    df = mgr.get('test', '/df').dataframe
    assert (df.quality[::2] == 1).all()

    # Test with only NaN
    quality = [np.nan] * len(quality)
    ts = Timeseries(index=index, data=data, quality=quality, update_ts=index)
    mgr.set('test', '/df', ts)
    df = mgr.get('test', '/df').dataframe
    assert (df.quality == 1).all()
def test_timeseries_aggregate(self):
    """Check aggregate function"""

    def generate_ts(number, random=False, nan=False):
        # Helper: yield `number` one-year daily series; `random` randomizes
        # data/quality, `nan` drops a random subset of the index entries.
        for _ in range(number):
            # Generate index
            t_start = dt.datetime(2017, 1, 1)
            t_end = dt.datetime(2018, 1, 1)
            index = pd.date_range(t_start, t_end, freq='D', closed='left')
            if nan:
                mask = np.random.choice([True, False], len(index))
                index = index[mask]
            # Generate data
            if random:
                data = 100 * np.random.rand(len(index))
                quality = np.random.randint(2, size=len(index))
            else:
                data = range(len(index))
                quality = [i % 2 for i in range(len(index))]
            yield Timeseries(index=index, data=data, quality=quality)

    number = 5

    # Test with fixed data, with same length and same values
    list1 = list(generate_ts(number))
    ts_len = len(list1[0].dataframe)
    aggregation = Timeseries.aggregate(list1, 'sum')
    assert isinstance(aggregation, Timeseries)
    assert len(aggregation.dataframe) == ts_len
    # Spot-check 100 random rows rather than the whole year
    sample = aggregation.dataframe.sample(n=100)
    for index in sample.index.values:
        data_sum = sum([ts.dataframe.data[index] for ts in list1])
        quality_mean = sum([ts.dataframe.quality[index]
                            for ts in list1]) / number
        assert isclose(sample.data[index], data_sum)
        assert isclose(sample.quality[index], quality_mean)

    # Test with random data
    list2 = list(generate_ts(number, random=True, nan=True))
    aggregation = Timeseries.aggregate(list2, 'max')
    assert isinstance(aggregation, Timeseries)
    sample = aggregation.dataframe.sample(n=100)
    for index in sample.index.values:
        # With nan=True some series miss rows, so only the series that
        # actually contain the timestamp are considered for the max
        data_max = max([
            ts.dataframe.data[index]
            for ts in list2
            if index in ts.dataframe.data.index
        ])
        # NOTE(review): mean divides by `number`, not by the count of
        # contributing series — presumably missing rows count as quality 0;
        # confirm against Timeseries.aggregate's semantics.
        quality_mean = sum([
            ts.dataframe.quality[index]
            for ts in list2
            if index in ts.dataframe.quality.index
        ]) / number
        assert isclose(sample.data[index], data_max)
        assert isclose(sample.quality[index], quality_mean)
        assert 0 <= sample.data[index] <= 100
        assert 0 <= sample.quality[index] <= 1
def test_hdfstore_timeseries_manager_datetime_awareness(self, tmpdir):
    """Check behavior with regard to datetime awareness"""
    # To test for awareness, see https://stackoverflow.com/a/27596917
    def isaware(stamp):
        return (stamp.tzinfo is not None
                and stamp.tzinfo.utcoffset(stamp) is not None)

    def isnaive(stamp):
        # De Morgan equivalent of the aware check
        return not isaware(stamp)

    day1 = dt.datetime(2017, 1, 1)
    day2 = dt.datetime(2017, 1, 2)
    day2_utc = dt.datetime(2017, 1, 2).replace(tzinfo=pytz.UTC)
    day3_utc = dt.datetime(2017, 1, 3).replace(tzinfo=pytz.UTC)
    naive_idx = pd.date_range(day1, day2, freq='min', closed='left')
    aware_idx = pd.date_range(day2_utc, day3_utc, freq='min', closed='left')
    naive_ts = Timeseries(index=naive_idx,
                          data=np.random.rand(len(naive_idx)),
                          update_ts=naive_idx)
    aware_ts = Timeseries(index=aware_idx,
                          data=np.random.rand(len(aware_idx)),
                          update_ts=aware_idx)
    mgr = HDFStoreTimeseriesMgr(str(tmpdir))

    # Naive index dataframe
    mgr.set('test', '/df_n', naive_ts)
    frame = mgr.get('test', '/df_n', t_start=day1, t_end=day2).dataframe
    assert isnaive(frame.index[0])

    # Aware index dataframe (TZ = UTC)
    mgr.set('test', '/df_a', aware_ts)
    frame = mgr.get(
        'test', '/df_a', t_start=day2_utc, t_end=day3_utc).dataframe
    assert isaware(frame.index[0])

    # Mix both -> every index is considered UTC
    mgr.set('test', '/df_m', naive_ts)
    mgr.set('test', '/df_m', aware_ts)
    frame = mgr.get('test', '/df_m', t_start=day1, t_end=day3_utc).dataframe
    assert isaware(frame.index[0])
    assert isaware(frame.index[-1])
def test_hdfstore_timeseries_manager_sorted(self, tmpdir):
    """Check hdfstore returns a sorted index dataframe"""
    # Full range Jan 1-5, then an overlapping sub-range Jan 3-4 written after
    begin_full = dt.datetime(2017, 1, 1)
    end_full = dt.datetime(2017, 1, 6)
    full_idx = pd.date_range(begin_full, end_full, freq='D', closed='left')
    begin_sub = dt.datetime(2017, 1, 3)
    end_sub = dt.datetime(2017, 1, 5)
    sub_idx = pd.date_range(begin_sub, end_sub, freq='D', closed='left')
    frame_full = pd.DataFrame({'data': range(len(full_idx))}, index=full_idx)
    frame_sub = pd.DataFrame({'data': range(len(sub_idx))}, index=sub_idx)
    mgr = HDFStoreTimeseriesMgr(str(tmpdir))
    mgr.set('test', 'df', Timeseries.from_dataframe(frame_full))
    mgr.set('test', 'df', Timeseries.from_dataframe(frame_sub))
    # Reading back must yield the index in sorted order, i.e. the full index
    result = mgr.get('test', 'df', t_start=begin_full, t_end=end_full)
    assert result.dataframe.index.equals(full_idx)
def test_hdfstore_timeseries_manager_get_bounds(self, tmpdir):
    """Test get optional time bounds"""
    t_start = dt.datetime(2017, 1, 1)
    t_end = dt.datetime(2017, 1, 2)
    index = pd.date_range(t_start, t_end, freq='min', closed='left')
    mgr = HDFStoreTimeseriesMgr(str(tmpdir))
    ts = Timeseries(index=index, data=np.random.rand(len(index)))
    mgr.set('test', '/df', ts)
    df = ts.dataframe
    # Every combination of optional bounds encloses the stored range,
    # so each query must return the full dataframe.
    bound_combos = (
        {'t_start': t_start, 't_end': t_end},
        {},
        {'t_start': t_start},
        {'t_end': t_end},
    )
    for kwargs in bound_combos:
        new_df = mgr.get('test', '/df', **kwargs).dataframe
        assert new_df['data'].equals(df['data'])
        assert new_df.index.equals(df.index)
def test_timeseries_resample_no_upsample(self, operation):
    """Check resample does not upsample"""
    # Ten one-minute samples, resampled down to seconds: resample must
    # not invent rows (it simply has to run without upsampling).
    begin = dt.datetime(2017, 1, 1)
    end = dt.datetime(2017, 1, 1, 0, 10)
    minutes = pd.date_range(begin, end, freq='T', closed='left')
    source = Timeseries(index=minutes, data=range(len(minutes)))
    resampled = deepcopy(source)
    resampled.resample('sec', operation)
def test_timeseries_tsdump(self):
    """Serialize a timeseries to a list of per-sample dicts."""
    stamps = [dt.datetime(2017, 1, 1) + dt.timedelta(n) for n in range(5)]
    values = [0, 1, 2, 3, 4]
    qualities = [1, 1, 0.5, None, 0.69]
    updates = [dt.datetime.now()] * 5
    ts = Timeseries(index=stamps, data=values, quality=qualities)
    ts.set_update_timestamp(updates)
    # A None quality is stored as NaN in the dataframe
    assert np.isnan(ts.dataframe['quality'][3])
    dumped = tsdump(ts)
    assert len(dumped) == 5
    # Each record carries exactly four keys
    assert all(len(record) == 4 for record in dumped)
    assert [record['timestamp'] for record in dumped] == stamps
    assert [record['value'] for record in dumped] == values
    assert [record['update_ts'] for record in dumped] == updates
    assert [record['quality'] for record in dumped] == qualities
def test_timeseries_init_empty_dataframe(self):
    """An argument-less Timeseries wraps an empty, well-typed dataframe."""
    empty = Timeseries()
    frame = empty.dataframe
    assert isinstance(frame, pd.DataFrame)
    assert not len(frame)
    assert frame.index.name == 'index'
    assert set(frame.columns.tolist()) == {'data', 'quality', 'update_ts'}
    expected_dtypes = [
        np.dtype('float64'),
        np.dtype('float64'),
        np.dtype('datetime64[ns]'),
    ]
    assert frame.dtypes.tolist() == expected_dtypes
    assert repr(empty) == (
        '<Timeseries>(count=0, start=NaT, end=NaT, min=nan, max=nan)')
def test_timeseries_stats(self):
    """stats() reports count and time bounds of an unsorted index."""
    # Deliberately unsorted index: stats must still find the bounds
    stamps = [
        dt.datetime(2017, 1, 5),
        dt.datetime(2017, 1, 1),
        dt.datetime(2017, 1, 8),
        dt.datetime(2017, 1, 3),
    ]
    frame = pd.DataFrame({'data': [1, 2, 3, 4]}, index=stamps)
    ts = Timeseries.from_dataframe(frame)
    expected = {
        'count': 4,
        'start': dt.datetime(2017, 1, 1),
        'end': dt.datetime(2017, 1, 8),
        'update_ts': ts.dataframe[Timeseries.UPDATE_TIMESTAMP_COL].max(),
    }
    assert ts.stats() == expected
def generate_ts(number, random=False, nan=False):
    """Yield `number` one-year daily Timeseries.

    With random=True data and quality are randomized; with nan=True a
    random subset of the index is dropped so the series have holes.
    """
    # The one-year span is constant across iterations, so hoist it
    span_start = dt.datetime(2017, 1, 1)
    span_end = dt.datetime(2018, 1, 1)
    for _ in range(number):
        # Generate index
        idx = pd.date_range(span_start, span_end, freq='D', closed='left')
        if nan:
            keep = np.random.choice([True, False], len(idx))
            idx = idx[keep]
        # Generate data
        if random:
            values = 100 * np.random.rand(len(idx))
            flags = np.random.randint(2, size=len(idx))
        else:
            values = range(len(idx))
            flags = [pos % 2 for pos in range(len(idx))]
        yield Timeseries(index=idx, data=values, quality=flags)
def test_hdfstore_timeseries_manager_persistance(self, tmpdir):
    """Ensure data persists across manager instances"""
    day_start = dt.datetime(2017, 1, 1)
    day_end = dt.datetime(2017, 1, 2)
    minutes = pd.date_range(day_start, day_end, freq='min', closed='left')
    ts = Timeseries(index=minutes,
                    data=np.random.rand(len(minutes)),
                    quality=np.random.rand(len(minutes)),
                    update_ts=minutes)
    store = HDFStoreTimeseriesMgr(str(tmpdir))
    # Store new dataframe and read it back through the same manager
    store.set('test', '/df', ts)
    frame = store.get(
        'test', '/df', t_start=day_start, t_end=day_end).dataframe
    assert len(frame) == 60 * 24
    # Drop the manager and read again through a brand new instance
    del store
    store = HDFStoreTimeseriesMgr(str(tmpdir))
    frame = store.get(
        'test', '/df', t_start=day_start, t_end=day_end).dataframe
    assert len(frame) == 60 * 24
def test_timeseries_init_dataframe(self):
    """Exercise Timeseries construction, validation and update_ts setting."""
    t_start = dt.datetime(2017, 1, 1)
    t_end = dt.datetime(2017, 1, 6)
    index = pd.date_range(t_start, t_end, freq='D', closed='left')
    # index column must be a pandas.DatetimeIndex
    with pytest.raises(TimeseriesInvalidIndexTypeError):
        Timeseries(index=range(5), data=range(5))
    # quality can not be set without data
    with pytest.raises(TimeseriesMissingColumnError):
        Timeseries(index=index, quality=np.random.rand(len(index)))
    # data can not be set without index
    with pytest.raises(TimeseriesMissingColumnError):
        Timeseries(data=range(len(index)))
    # validate dataframe
    ts = Timeseries(index=index, data=range(len(index)))
    # no custom column allowed
    ts.dataframe['dummy'] = range(len(index))
    with pytest.raises(TimeseriesInvalidColumnsError):
        ts.validate()
    # droping a column unvalidates timeseries
    ts.dataframe.drop(columns=['dummy', 'data'], inplace=True)
    with pytest.raises(TimeseriesMissingColumnError):
        ts.validate()
    # repr timeseries
    ts_data = range(len(index))
    ts = Timeseries(index=index, data=ts_data)
    assert repr(ts) == (
        '<Timeseries>(count={}, start={}, end={}, min={}, max={})'.format(
            len(index), min(index), max(index),
            float(min(ts_data)), float(max(ts_data))))
    # quality defaults to 1
    ts = Timeseries(index=index, data=range(len(index)))
    assert (ts.dataframe['quality'] == 1).all()
    # update_ts can only be set using set_update_timestamp method
    ts = Timeseries(
        index=index, data=range(len(index)),
        quality=np.random.rand(len(index)))
    assert pd.isnull(ts.dataframe['update_ts']).all()
    # update with a pandas.Series
    ts.set_update_timestamp(index)
    assert ts.dataframe.at[index[0], 'update_ts'] == index.values[0]
    # update with a datetime (scalar broadcast to every row)
    t_update = dt.datetime.now()
    ts.set_update_timestamp(t_update)
    assert ts.dataframe.at[index[0], 'update_ts'] == t_update
    ts.validate()
    assert ts.dataframe.at[index[0], 'update_ts'] == t_update
    # update with a list of datetimes
    t_update = [dt.datetime.now()] * len(index)
    ts.set_update_timestamp(t_update)
    assert ts.dataframe.at[index[0], 'update_ts'] == t_update[0]
    ts.validate()
    assert ts.dataframe.at[index[0], 'update_ts'] == t_update[0]
    # data length must be equal to index length
    with pytest.raises(ValueError):
        Timeseries(index=index, data=range(len(index)+1))
def test_timeseries_convert(self):
    """Exercise unit conversion, rounding and conversion error cases."""
    t_start = dt.datetime(2017, 1, 1)
    t_end = dt.datetime(2017, 1, 6)
    index = pd.date_range(t_start, t_end, freq='D', closed='left')
    data = range(len(index))
    ts = Timeseries(index=index, data=data)
    # convert from celsius to fahreinheit
    ts.convert_unit('DegreeCelsius', 'DegreeFahrenheit')
    for idx, cur_val in enumerate(ts.dataframe[ts.DATA_COL].values):
        # as values are rounded, check strict equality
        assert cur_val == celsius_to_fahrenheit(data[idx])
    # convert from celsius to fahreinheit, rounding to 4 decimals
    ts = Timeseries(index=index, data=data)
    ts.convert_unit('DegreeCelsius', 'DegreeFahrenheit', decimals=4)
    for idx, cur_val in enumerate(ts.dataframe[ts.DATA_COL].values):
        # as values are rounded, check strict equality
        assert cur_val == celsius_to_fahrenheit(data[idx], decimals=4)
    # convert from MWh to kWh (only checks the call succeeds)
    ts.convert_unit('Megawatthour', 'Kilowatthour')
    # convert using square meters
    ts.convert_unit('WattSquareMeter', 'KilowattSquareMeter')
    # source and target units are not compatible
    with pytest.raises(TimeseriesUnitConversionError):
        ts.convert_unit('DegreeCelsius', 'Meter')
    # unknown source or target unit
    with pytest.raises(TimeseriesUnitConversionError):
        ts.convert_unit('inexistant', 'DegreeFahrenheit')
    with pytest.raises(TimeseriesUnitConversionError):
        ts.convert_unit('DegreeCelsius', 'inexistant')
    with pytest.raises(TimeseriesUnitConversionError):
        ts.convert_unit(None, 'DegreeFahrenheit')
    with pytest.raises(TimeseriesUnitConversionError):
        ts.convert_unit('DegreeCelsius', None)
    # Test custom units; conversions chain in place, so the expected
    # factor below is always relative to the original `data`
    ts = Timeseries(index=index, data=data)
    # Percent -> Permille: x10
    ts.convert_unit('Percent', 'Permille')
    assert all(
        new == round(10 * old, 2)
        for (old, new) in zip(data, ts.dataframe[ts.DATA_COL].values))
    # Permille -> PartsPerMillion: x1000, i.e. x10000 overall
    ts.convert_unit('Permille', 'PartsPerMillion')
    assert all(
        new == round(10000 * old, 2)
        for (old, new) in zip(data, ts.dataframe[ts.DATA_COL].values))
    # PartsPerMillion -> Unitless: /1e6, i.e. x0.01 overall
    ts.convert_unit('PartsPerMillion', 'Unitless')
    assert all(
        new == round(0.01 * old, 2)
        for (old, new) in zip(data, ts.dataframe[ts.DATA_COL].values))
def test_timeseries_tsload(self):
    """Deserialize lists of value dicts into Timeseries dataframes."""
    timestamp_l = [
        dt.datetime(2017, 1, 1) + dt.timedelta(n) for n in range(5)
    ]
    value_l = [0, 1, 2, 3, 4]
    quality_l = [1, 1, 0.5, None, 0.69]
    update_ts_l = [dt.datetime.now()] * 5

    # An empty payload loads as an empty, well-formed dataframe
    data_list = []
    ts_df = tsload(data_list).dataframe
    assert isinstance(ts_df, pd.DataFrame)
    assert len(ts_df) == 0
    assert ts_df.index.name == 'index'
    assert set(ts_df.columns.tolist()) == {'data', 'quality', 'update_ts'}
    assert ts_df.equals(Timeseries.empty_dataframe())

    # timestamp/value only
    data_list = [{
        'timestamp': t,
        'value': v
    } for t, v in zip(timestamp_l, value_l)]
    ts_df = tsload(data_list).dataframe
    assert isinstance(ts_df, pd.DataFrame)
    assert ts_df.index.tolist() == timestamp_l
    assert ts_df[Timeseries.DATA_COL].tolist() == value_l
    assert set(ts_df.columns.tolist()) == {
        Timeseries.DATA_COL, Timeseries.QUALITY_COL,
        Timeseries.UPDATE_TIMESTAMP_COL
    }
    assert ts_df[Timeseries.DATA_COL].dtype == np.dtype('float')

    # timestamp/value/quality
    data_list = [{
        'timestamp': t,
        'value': v,
        'quality': q
    } for t, v, q in zip(timestamp_l, value_l, quality_l)]
    ts_df = tsload(data_list).dataframe
    assert isinstance(ts_df, pd.DataFrame)
    assert ts_df.index.tolist() == timestamp_l
    assert ts_df[Timeseries.DATA_COL].tolist() == value_l
    # None quality is stored as NaN; map NaN back to None for comparison
    qual_s = ts_df[Timeseries.QUALITY_COL]
    assert qual_s.where(pd.notnull(qual_s), None).tolist() == quality_l
    assert set(ts_df.columns.tolist()) == {
        Timeseries.DATA_COL, Timeseries.QUALITY_COL,
        Timeseries.UPDATE_TIMESTAMP_COL
    }
    assert ts_df[Timeseries.DATA_COL].dtype == np.dtype('float')
    assert ts_df[Timeseries.QUALITY_COL].dtype == np.dtype('float')

    # full record: timestamp/value/quality/update_ts
    data_list = [{
        'timestamp': t,
        'value': v,
        'quality': q,
        'update_ts': u
    } for t, v, q, u in zip(timestamp_l, value_l, quality_l, update_ts_l)]
    ts_df = tsload(data_list).dataframe
    assert isinstance(ts_df, pd.DataFrame)
    assert ts_df.index.tolist() == timestamp_l
    assert ts_df[Timeseries.DATA_COL].tolist() == value_l
    qual_s = ts_df[Timeseries.QUALITY_COL]
    assert qual_s.where(pd.notnull(qual_s), None).tolist() == quality_l
    assert (ts_df[Timeseries.UPDATE_TIMESTAMP_COL].tolist() == update_ts_l)
    assert set(ts_df.columns.tolist()) == {
        Timeseries.DATA_COL, Timeseries.QUALITY_COL,
        Timeseries.UPDATE_TIMESTAMP_COL
    }
    assert ts_df[Timeseries.DATA_COL].dtype == np.dtype('float')
    assert ts_df[Timeseries.QUALITY_COL].dtype == np.dtype('float')
    assert (ts_df[Timeseries.UPDATE_TIMESTAMP_COL].dtype == np.dtype(
        'datetime64[ns]'))
def test_hdfstore_timeseries_manager(self, tmpdir):
    """End-to-end set/get/override/delete scenario on the HDF store."""
    t_before_start_1 = dt.datetime(2016, 12, 1)
    t_before_start_2 = dt.datetime(2016, 12, 2)
    t_start = dt.datetime(2017, 1, 1)
    t_inside_1 = dt.datetime(2017, 1, 1, 8, 12, 42)
    t_inside_2 = dt.datetime(2017, 1, 1, 16, 42, 12)
    t_end = dt.datetime(2017, 1, 2)
    t_after_end_1 = dt.datetime(2017, 2, 1)
    t_after_end_2 = dt.datetime(2017, 2, 2)
    index = pd.date_range(t_start, t_end, freq='min', closed='left')
    mgr = HDFStoreTimeseriesMgr(str(tmpdir))
    # Get unknown timestore ID
    df = mgr.get('test', 'dummy', t_start=t_start, t_end=t_end).dataframe
    assert df.empty
    # Set unexisting ID: no problem
    ts = Timeseries(index=index,
                    data=np.random.rand(len(index)),
                    quality=np.random.rand(len(index)),
                    update_ts=index)
    mgr.set('test', '/df', ts)
    df = ts.dataframe
    # Query new dataframe
    new_df = mgr.get('test', '/df', t_start=t_start, t_end=t_end).dataframe
    assert new_df['data'].equals(df['data'])
    assert new_df.index.equals(df.index)
    # Query new dataframe out of bounds
    new_df = mgr.get('test', '/df',
                     t_start=t_before_start_1,
                     t_end=t_before_start_2).dataframe
    assert new_df.empty
    new_df = mgr.get('test', '/df',
                     t_start=t_after_end_1,
                     t_end=t_after_end_2).dataframe
    assert new_df.empty
    # Query sub-timerange (08:12:42 -> 16:42:12 covers 510 whole minutes)
    new_df = mgr.get('test', '/df',
                     t_start=t_inside_1,
                     t_end=t_inside_2).dataframe
    assert len(new_df) == 510
    # Query straddling a bound (only minutes from t_start onwards exist)
    new_df = mgr.get('test', '/df',
                     t_start=t_before_start_2,
                     t_end=t_inside_1).dataframe
    assert len(new_df) == 493
    # Override some values
    ts = Timeseries(index=index[:5],
                    data=np.arange(5, dtype='float'),
                    quality=np.random.rand(5),
                    update_ts=index[:5])
    mgr.set('test', '/df', ts)
    # Overriding must not duplicate rows: still one day of minutes
    new_df = mgr.get('test', '/df', t_start=t_start, t_end=t_end).dataframe
    assert len(new_df) == 1440
    new_df = mgr.get('test', '/df',
                     t_start=index[0], t_end=index[5]).dataframe
    assert new_df['data'].tolist() == [float(x) for x in range(5)]
    # Delete values (5 rows removed from the stored day)
    mgr.delete('test', '/df', index[5], index[10])
    new_df = mgr.get('test', '/df',
                     t_start=t_start, t_end=t_end).dataframe
    assert len(new_df) == 1435
    new_df = mgr.get('test', '/df',
                     t_start=index[0], t_end=index[10]).dataframe
    assert new_df['data'].tolist() == [float(x) for x in range(5)]