def dfdict_combined_temporal_collocation(dfs, refname, k, window=None, n=None, **kwargs):
    """
    Applies :py:func:`combined_temporal_collocation` on a dictionary of
    dataframes.

    Parameters
    ----------
    dfs : dict
        Dictionary of pd.DataFrames containing the dataframes to be
        collocated.
    refname : str
        Name of the reference frame in `dfs`.
    k : int
        Accepted for interface compatibility; this implementation never reads
        it and always returns a single entry containing all frames.
    window : pd.Timedelta or float, optional
        Window around reference timestamps in which to look for data. Floats
        are interpreted as number of days. If it is not given, defaults to 1
        hour to mimic the behaviour of
        ``BasicTemporalMatching.combinatory_matcher``.
    n : int, optional
        Expected number of entries in `dfs`. If given and ``len(dfs)`` is
        different, an empty dictionary is returned and no matching is done.
    **kwargs :
        Keyword arguments passed to :py:func:`combined_temporal_collocation`.

    Returns
    -------
    matched_dict : dict
        Dictionary with a single entry. Its key is the tuple
        ``(refname, othernames...)`` (other names in the iteration order of
        `dfs`), its value is the collocated dataframe.
    """
    # Nothing to match if the caller expected a different number of frames.
    if n is not None and len(dfs) != n:
        return {}

    if window is None:
        window = pd.Timedelta(hours=1)

    # The same name order is used for the frames handed to the matcher and
    # for the key of the result.
    other_names = [name for name in dfs if name != refname]
    other_frames = [df_name_multiindex(dfs[name], name) for name in other_names]
    reference_frame = df_name_multiindex(dfs[refname], refname)

    collocated = temporal_matching.combined_temporal_collocation(
        reference_frame, other_frames, window, add_ref_data=True, **kwargs
    )

    # unpack again to dictionary
    return {(refname, *other_names): collocated}
def match(self, reference, *args):
    """
    Collocate the other datasets to the reference and return the joined
    DataFrame.

    In this case the reference dataset for the grid is also the temporal
    reference dataset.

    Parameters
    ----------
    reference : object
        Reference data; it is wrapped in a ``pd.DataFrame`` before matching.
    *args :
        The other datasets, forwarded as one tuple to
        ``temporal_matching.combined_temporal_collocation``.

    Returns
    -------
    Whatever ``temporal_matching.combined_temporal_collocation`` returns for
    the window stored in ``self.window``.
    """
    reference_frame = pd.DataFrame(reference)
    collocation_options = {
        "dropna": True,
        "dropduplicates": True,
        "add_ref_data": True,
        "combined_dropna": "all",
    }
    return temporal_matching.combined_temporal_collocation(
        reference_frame, args, self.window, **collocation_options
    )
def temporal_match(to_match, hours=6, drop_missing=False, **kwargs) -> pd.DataFrame:
    """
    Temporal match to the longest timeseries

    The entry of `to_match` with the most non-missing values is used as the
    temporal reference; on ties the earliest entry wins.

    Parameters
    ----------
    to_match : list
        list of dataframes to match. ``None`` entries are ignored when the
        reference is chosen, but the list itself is handed to
        ``combined_temporal_collocation`` unchanged.
    hours : int
        window to perform the temporal matching
    drop_missing : bool, optional. Default is False.
        If true, only time steps when all points have measurements are kept
    **kwargs :
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    matched: pd.DataFrame
        dataframe with temporally matched timeseries; columns that contain
        only missing values are dropped.
    """
    # Fallback for degenerate input (only ``None`` entries); an empty list
    # still raises IndexError here, as before.
    ref = to_match[0]
    ref_points = -1
    for df in to_match:
        if df is None:
            continue
        # Number of non-missing cells. For a Series or a one-column frame this
        # equals ``count()``; unlike ``int(df.count())`` it avoids the
        # int-of-Series conversion and also works for multi-column frames.
        points = int(df.notna().values.sum())
        # Strict comparison: the first frame with the maximum stays reference.
        if points > ref_points:
            ref, ref_points = df, points

    combined_dropna = "any" if drop_missing else False

    matched = combined_temporal_collocation(
        ref,
        to_match,
        # Lowercase unit: the uppercase "H" alias is deprecated in pandas.
        pd.Timedelta(hours, "h"),
        combined_dropna=combined_dropna,
        checkna=True,
    )
    matched.dropna(axis="columns", how="all", inplace=True)
    return matched
def test_combined_timezones():
    """
    Check the timezone of the collocated index for timezone-naive, equal and
    mixed timezone inputs, and the warnings raised for missing/mixed timezones.
    """
    dr = pd.date_range("2000-01-01", "2000-01-31", freq="D")
    dr_utc = pd.date_range("2000-01-01", "2000-01-31", freq="D", tz="UTC")
    dr_berlin = pd.date_range("2000-01-01", "2000-01-31", freq="D", tz="Europe/Berlin")
    n = len(dr)
    # Built once and outside of the ``pytest.warns`` blocks, with the
    # lowercase unit: the uppercase "H" alias is deprecated in pandas and a
    # deprecation warning raised inside a recording block would end up in
    # ``warn_record`` and break the exact warning count asserted below.
    window = pd.Timedelta(6, "h")

    def collocate(ref_index, *other_indices):
        """Collocate random single-column frames defined on the given indices."""
        reference = pd.DataFrame(np.random.randn(n), index=ref_index)
        others = tuple(
            pd.DataFrame(np.random.randn(n), index=index) for index in other_indices
        )
        return tmatching.combined_temporal_collocation(
            reference, others, window, add_ref_data=True
        )

    # test timezone naive
    merged = collocate(dr, dr, dr)
    assert merged.index.tz is None

    # test with same timezone
    merged = collocate(dr_berlin, dr_berlin, dr_berlin)
    assert str(merged.index.tz) == "Europe/Berlin"

    # test with missing timezone
    with pytest.warns(UserWarning, match="No timezone given"):
        merged = collocate(dr, dr_berlin, dr)
    assert str(merged.index.tz) == "Europe/Berlin"

    # test with different timezones and no ref timezone
    with pytest.warns(UserWarning) as warn_record:
        merged = collocate(dr, dr_berlin, dr_utc)
    assert str(merged.index.tz) == "UTC"
    assert len(warn_record) == 3
    assert "No timezone given" in warn_record[0].message.args[0]
    assert "Europe/Berlin" in warn_record[0].message.args[0]
    assert "No timezone given" in warn_record[1].message.args[0]
    assert "UTC" in warn_record[1].message.args[0]
    assert "mixed timezones" in warn_record[2].message.args[0]

    # test with different timezones and ref timezone
    merged = collocate(dr_berlin, dr_berlin, dr_utc)
    assert str(merged.index.tz) == "Europe/Berlin"
def test_combined_matching():
    """
    Check ``combined_temporal_collocation`` with several frames at once:
    plain merging, ``dropna`` with and without ``combined_dropna``, a
    two-column frame, and a window that is too small to find any match.
    """
    index = pd.DatetimeIndex([datetime(2007, 1, i + 1, 0) for i in range(10)])
    # Reference timestamps are 5 hours after the data timestamps.
    ref = pd.DatetimeIndex([datetime(2007, 1, i + 1, 5) for i in range(10)])
    # Lowercase unit: the uppercase "H" alias is deprecated in pandas.
    window = pd.Timedelta(6, "h")

    data = {
        "data1": np.random.randn(10),
        "data2": np.random.randn(10),
        "missing": np.random.randn(10),
    }
    data["missing"][2] = np.nan
    frames = {
        name: pd.DataFrame({name: data[name]}, index=index) for name in data
    }

    # everything together
    merged = tmatching.combined_temporal_collocation(
        ref,
        (frames[name] for name in frames),
        window,
        combined_dropna=False,
    )
    assert len(merged) == 10
    for name in frames:
        assert name in merged.columns
        nptest.assert_equal(merged[name].values.ravel(),
                            frames[name].values.ravel())

    # test with dropna but not combined_dropna
    merged = tmatching.combined_temporal_collocation(
        ref,
        (frames[name] for name in frames),
        window,
        combined_dropna=False,
        dropna=True,
    )
    assert len(merged) == 10
    for name in frames:
        assert name in merged.columns
        nptest.assert_equal(merged[name].values.ravel(),
                            frames[name].values.ravel())

    # test with combined_dropna
    merged = tmatching.combined_temporal_collocation(
        ref,
        (frames[name] for name in frames),
        window,
        combined_dropna="any",
        dropna=True,
    )
    # the row with the NaN in "missing" (position 2) is dropped everywhere
    assert len(merged) == 9
    for name in frames:
        assert name in merged.columns
        nptest.assert_equal(merged[name].values.ravel()[2:],
                            frames[name].values.ravel()[3:])

    # test with 2d-dataframe
    df2d = pd.DataFrame(
        {
            "2d1": np.random.randn(10),
            "2d2": np.random.randn(10)
        }, index=index)
    merged = tmatching.combined_temporal_collocation(
        ref,
        (frames["missing"], df2d),
        window,
        combined_dropna=False,
    )
    assert len(merged) == 10

    # test without match: a 1 hour window cannot bridge the 5 hour offset
    for comb_drop in [True, False]:
        merged = tmatching.combined_temporal_collocation(
            ref,
            (frames["missing"], df2d),
            pd.Timedelta(1, "h"),
            combined_dropna=comb_drop,
            dropna=True,
        )
        assert len(merged) == 0