示例#1
0
def dfdict_combined_temporal_collocation(dfs,
                                         refname,
                                         k,
                                         window=None,
                                         n=None,
                                         **kwargs):
    """
    Run :py:func:`combined_temporal_collocation` over a dictionary of
    dataframes and hand the result back wrapped in a dictionary.

    Parameters
    ----------
    dfs : dict
        Maps dataset names to the pd.DataFrames that should be collocated.
    refname : str
        Key in `dfs` of the frame that serves as temporal reference.
    k : int
        Part of the signature, but not read anywhere in this function.
    window : pd.Timedelta or float, optional
        Window around the reference timestamps in which to look for data.
        Floats are interpreted as number of days. When omitted, a window of
        1 hour is used to mimic the behaviour of
        ``BasicTemporalMatching.combinatory_matcher``.
    n : int, optional
        Expected number of entries in `dfs`. If it is given and `dfs` has a
        different length, an empty dictionary is returned without matching.
    **kwargs :
        Keyword arguments passed to :py:func:`combined_temporal_collocation`.

    Returns
    -------
    matched_dict : dict
        Empty if the `n` check fails. Otherwise a dictionary with a single
        entry: the key is the tuple ``(refname, othernames...)`` and the
        value is the collocated result.
    """
    if n is not None and len(dfs) != n:
        return {}

    tolerance = pd.Timedelta(hours=1) if window is None else window

    # every frame gets its dataset name attached before matching
    other_names = [name for name in dfs if name != refname]
    other_frames = [
        df_name_multiindex(dfs[name], name) for name in other_names
    ]
    ref_frame = df_name_multiindex(dfs[refname], refname)

    collocated = temporal_matching.combined_temporal_collocation(
        ref_frame, other_frames, tolerance, add_ref_data=True, **kwargs)

    # the reference name always leads the key, the others keep dict order
    return {(refname, *other_names): collocated}
示例#2
0
 def match(self, reference, *args):
     """
     Collocate the other datasets with the reference and return the
     joined DataFrame.

     The reference dataset for the grid is also used as the temporal
     reference dataset here.

     Parameters
     ----------
     reference :
         Reference data; converted with ``pd.DataFrame(reference)`` before
         matching.
     *args :
         The other datasets, forwarded together as one tuple.

     Returns
     -------
     The result of ``temporal_matching.combined_temporal_collocation``
     called with ``self.window`` as matching window.
     """
     ref_frame = pd.DataFrame(reference)
     # fixed matching options used by this matcher
     options = {
         "dropna": True,
         "dropduplicates": True,
         "add_ref_data": True,
         "combined_dropna": "all",
     }
     return temporal_matching.combined_temporal_collocation(
         ref_frame, args, self.window, **options)
示例#3
0
    def temporal_match(to_match,
                       hours=6,
                       drop_missing=False,
                       **kwargs) -> pd.DataFrame:
        """
        Temporal match to the longest timeseries

        Parameters
        ----------
        to_match : list
            list of dataframes to match. ``None`` entries are ignored when
            the reference is chosen; the list itself is handed to the
            collocation unchanged.
        hours : int
            window to perform the temporal matching
        drop_missing : bool, optional. Default is False.
            If true, only time steps when all points have measurements
            are kept
        **kwargs :
            Accepted for call compatibility; not used by this function.

        Returns
        -------
        matched: pd.DataFrame
            dataframe with temporally matched timeseries; columns that
            contain only NaN are removed.

        Raises
        ------
        ValueError
            If `to_match` contains no entry other than ``None``.
        """
        # Pick the reference among the available series only, so that a
        # leading ``None`` entry cannot end up as the reference.
        candidates = [df for df in to_match if df is not None]
        if not candidates:
            raise ValueError(
                "to_match must contain at least one time series "
                "that is not None")

        # max() keeps the first of several equally long series, i.e. ties
        # go to the earliest entry in the list.
        ref = max(candidates, key=lambda df: int(df.count()))

        combined_dropna = "any" if drop_missing else False

        matched = combined_temporal_collocation(
            ref,
            to_match,
            # lowercase unit: the uppercase "H" alias is deprecated in
            # recent pandas versions
            pd.Timedelta(hours, unit="h"),
            combined_dropna=combined_dropna,
            checkna=True,
        )
        matched.dropna(axis="columns", how="all", inplace=True)

        return matched
def test_combined_timezones():
    """
    Check the timezone of the collocated index for timezone-naive, matching
    and mixed timezone inputs, together with the warnings that are expected
    when timezones are missing or mixed.
    """
    dr = pd.date_range("2000-01-01", "2000-01-31", freq="D")
    dr_utc = pd.date_range("2000-01-01", "2000-01-31", freq="D", tz="UTC")
    dr_berlin = pd.date_range("2000-01-01",
                              "2000-01-31",
                              freq="D",
                              tz="Europe/Berlin")
    n = len(dr)

    # Built once, outside of any ``pytest.warns`` block and with the
    # lowercase unit: the uppercase "H" alias is deprecated in recent pandas
    # and its FutureWarning would otherwise be recorded alongside the
    # warnings that are counted below.
    window = pd.Timedelta(6, "h")

    def random_frame(index):
        # single-column frame of random data on the given index
        return pd.DataFrame(np.random.randn(n), index=index)

    def collocate(ref_index, first_index, second_index):
        # collocate two random frames against a random reference frame
        return tmatching.combined_temporal_collocation(
            random_frame(ref_index),
            (
                random_frame(first_index),
                random_frame(second_index),
            ),
            window,
            add_ref_data=True,
        )

    # test timezone naive
    merged = collocate(dr, dr, dr)
    assert merged.index.tz is None

    # test with same timezone
    merged = collocate(dr_berlin, dr_berlin, dr_berlin)
    assert str(merged.index.tz) == "Europe/Berlin"

    # test with missing timezone
    with pytest.warns(UserWarning, match="No timezone given"):
        merged = collocate(dr, dr_berlin, dr)
    assert str(merged.index.tz) == "Europe/Berlin"

    # test with different timezones and no ref timezone
    with pytest.warns(UserWarning) as warn_record:
        merged = collocate(dr, dr_berlin, dr_utc)
    assert str(merged.index.tz) == "UTC"
    assert len(warn_record) == 3
    assert "No timezone given" in warn_record[0].message.args[0]
    assert "Europe/Berlin" in warn_record[0].message.args[0]
    assert "No timezone given" in warn_record[1].message.args[0]
    assert "UTC" in warn_record[1].message.args[0]
    assert "mixed timezones" in warn_record[2].message.args[0]

    # test with different timezones and ref timezone
    merged = collocate(dr_berlin, dr_berlin, dr_utc)
    assert str(merged.index.tz) == "Europe/Berlin"
def test_combined_matching():
    """
    Collocate daily frames (midnight timestamps) against a reference index
    that is shifted by five hours and check the result for the different
    NaN-dropping options, for a two-column frame, and for a window that is
    too small to produce any match.
    """
    index = pd.DatetimeIndex([datetime(2007, 1, i + 1, 0) for i in range(10)])
    ref = pd.DatetimeIndex([datetime(2007, 1, i + 1, 5) for i in range(10)])

    # Lowercase units: the uppercase "H" alias is deprecated in recent
    # pandas versions.
    wide_window = pd.Timedelta(6, "h")  # covers the five hour offset
    narrow_window = pd.Timedelta(1, "h")  # too small for the offset

    data = {
        "data1": np.random.randn(10),
        "data2": np.random.randn(10),
        "missing": np.random.randn(10),
    }
    data["missing"][2] = np.nan
    frames = {
        name: pd.DataFrame({name: data[name]}, index=index)
        for name in data
    }

    def check_columns(merged, merged_start=0, frames_start=0):
        # every input column must be present and carry the input values,
        # optionally compared from the given start positions onwards
        for name in frames:
            assert name in merged.columns
            nptest.assert_equal(
                merged[name].values.ravel()[merged_start:],
                frames[name].values.ravel()[frames_start:])

    # everything together
    merged = tmatching.combined_temporal_collocation(
        ref,
        (frames[name] for name in frames),
        wide_window,
        combined_dropna=False,
    )

    assert len(merged) == 10
    check_columns(merged)

    # test with dropna but not combined_dropna
    merged = tmatching.combined_temporal_collocation(
        ref,
        (frames[name] for name in frames),
        wide_window,
        combined_dropna=False,
        dropna=True,
    )

    assert len(merged) == 10
    check_columns(merged)

    # test with combined_dropna: the row holding the NaN (position 2) is
    # expected to be gone, so the tails are compared with shifted offsets
    merged = tmatching.combined_temporal_collocation(
        ref,
        (frames[name] for name in frames),
        wide_window,
        combined_dropna="any",
        dropna=True,
    )

    assert len(merged) == 9
    check_columns(merged, merged_start=2, frames_start=3)

    # test with 2d-dataframe
    df2d = pd.DataFrame(
        {
            "2d1": np.random.randn(10),
            "2d2": np.random.randn(10)
        }, index=index)
    merged = tmatching.combined_temporal_collocation(
        ref,
        (frames["missing"], df2d),
        wide_window,
        combined_dropna=False,
    )
    assert len(merged) == 10

    # test without match
    for comb_drop in [True, False]:
        merged = tmatching.combined_temporal_collocation(
            ref,
            (frames["missing"], df2d),
            narrow_window,
            combined_dropna=comb_drop,
            dropna=True,
        )
        assert len(merged) == 0