    def test_nested_postprocessors(self):
        df = pd.DataFrame({
            "a": np.arange(20),
            "b": np.arange(20),
            "c": np.arange(20)
        })

        def outer_postprocessor(df):
            return df[["b"]]

        fl: FeaturesWithLabels = df._.extract(
            PostProcessedFeaturesAndLabels.from_features_and_labels(
                PostProcessedFeaturesAndLabels(
                    features=["a"],
                    feature_post_processor=[
                        lambda df: lag_columns(df, [1, 2])
                    ],
                    labels=["b", "c"],
                    labels_post_processor=[
                        lambda df: df + 0.5, lambda df: df + 0.001
                    ],
                ),
                labels_post_processor=[outer_postprocessor],
            ))

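        # lagging the features by [1, 2] drops the first two rows, leaving 18
        # samples; the labels pass through both inner post-processors
        # (+0.5, then +0.001) before the outer one selects column "b",
        # so the last label is 19 + 0.5 + 0.001 = 19.501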
        self.assertEqual((18, 1), fl.labels.shape)
        self.assertEqual(19.501, fl.labels.iloc[-1, -1])
Example #2
    def test_make_model(self):
        notebooks_path = os.path.join(PWD, '..', 'examples')
        df = pd.read_csv(os.path.join(notebooks_path, 'SPY.csv'))

        with df.model("/tmp/pijsfnwuacpa.model") as m:
            from torch import nn
            from torch.optim import SGD
            from pandas_ml_common.utils.column_lagging_utils import lag_columns

            from pandas_ml_utils import FeaturesAndLabels, RegressionSummary, FittingParameter
            from pandas_ml_utils_torch import PytorchModel
            from pandas_ml_utils_torch.merging_cross_folds import take_the_best

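            # the network is supplied via a provider function so that fresh
            # instances can be created, e.g. one per cross-validation fold
            # (the folds are merged via `merge_cross_folds` below)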
            def net_provider():
                from pandas_ml_utils_torch import PytorchNN

                class Net(PytorchNN):

                    def __init__(self):
                        super().__init__()
                        self.net = nn.Sequential(
                            nn.Linear(10, 4),
                            nn.Tanh(),
                            nn.Linear(4, 4),
                            nn.Tanh(),
                            nn.Linear(4, 1),
                            nn.Tanh(),
                        )

                    def L1(self):
                        # paths to the parameters that should be regularized;
                        # a path is constructed from self.named_parameters()
                        # and allows the use of wildcards
                        return {'net/0/**/weight': 0.02}

                    def L2(self):
                        return {
                            'net/0/**/weight': 0.02,
                            'net/2/**/weight': 0.05
                        }

                    def forward_training(self, x):
                        return self.net(x)

                return Net()

            fit = m.fit(
                PytorchModel(
                    net_provider,
                    FeaturesAndLabels(
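                        # ten lagged daily returns feed the Linear(10, 4) input layer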
                        [lambda df: lag_columns(df["Close"].pct_change(), range(10))],
                        [lambda df: df["Close"].pct_change().shift(-1)]),
                    nn.MSELoss,
                    lambda params: SGD(params, lr=0.01, momentum=0.0),
                    merge_cross_folds=take_the_best,
                    summary_provider=RegressionSummary
                ),
                FittingParameter(epochs=2),
                verbose=1
            )
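
            # the returned fit object carries the configured RegressionSummary;
            # e.g. fit.test_summary.df exposes the test-fold results as a
            # DataFrame (see the auto-encoder example below)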
Example #3
    def test_auto_regression_simple(self):
        df = TEST_DF.copy()
        rnn = lag_columns(df[["Close", "Volume"]], [1, 2])
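        # lagging by [1, 2] appends, per column, the values shifted by one and
        # two rows, so the last row holds iloc[-2] and iloc[-3] of the source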

        np.testing.assert_array_almost_equal(
            np.array([[
                df["Close"].iloc[-2], df["Close"].iloc[-3],
                df["Volume"].iloc[-2], df["Volume"].iloc[-3]
            ]]), rnn[-1:].values, decimal=4)
Example #4
    def test_post_processing_multiindex_row(self):
        df = pd.DataFrame({"a": np.arange(20), "b": np.arange(20)})
        df.index = pd.MultiIndex.from_product([["A", "B"], range(10)])

        fl: FeaturesWithLabels = df._.extract(
            PostProcessedFeaturesAndLabels(
                features=["a"],
                feature_post_processor=lambda df: lag_columns(df, [1, 2]),
                labels=["b"],
                labels_post_processor=lambda df: df * 2,
            ))

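        # the extraction must preserve the two-level row index on both frames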
        self.assertIsInstance(fl.features.index, pd.MultiIndex)
        self.assertIsInstance(fl.labels.index, pd.MultiIndex)
Example #5
from typing import Callable, Dict, Iterable, Optional, Tuple, Union

import pandas as pd

from pandas_ml_common.utils.column_lagging_utils import lag_columns


def ta_rnn(
    df: Union[pd.Series, pd.DataFrame],
    feature_lags: Iterable[int],
    lag_smoothing: Optional[Dict[int, Callable[[pd.Series], pd.Series]]] = None,
    return_min_required_samples: bool = False
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, int]]:
    """Lag each column of `df` by every lag in `feature_lags` (optionally
    smoothing individual lags) and drop the rows that became NaN."""

    # lag columns
    dff = lag_columns(df, feature_lags, lag_smoothing)

    # drop all rows which got nan now
    dff = dff.dropna()

    if return_min_required_samples:
        return dff, len(df) - len(dff)
    else:
        return dff
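
# A minimal usage sketch (an illustration, not from the original source):
#
#   df = pd.DataFrame({"Close": np.arange(10.0)})
#   lagged, min_samples = ta_rnn(df, feature_lags=[1, 2], return_min_required_samples=True)
#   assert min_samples == 2  # the two leading rows became NaN and were dropped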
Example #6
 def test_lag_smoothing_nan(self):
     """given"""
     df = pd.DataFrame({"featureA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
     #                               1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # original
     #                                  1, 2, 3, 4, 5, 6, 7, 8, 9]   # lag 1
     #                                        1, 2, 3, 4, 5, 6, 7]   # lag 1 + shift 2
     #                                        ^                      # this is where the df starts
     """when lag smoothing is enabled using shift (which is introducing nan into the data frame)"""
     rnn = lag_columns(df[["featureA"]],
                       feature_lags=[0, 1],
                       lag_smoothing={
                           1: lambda df: df["featureA"].shift(2)
                       }).dropna()
     """then"""
     self.assertAlmostEqual(rnn[0, "featureA"].iloc[0], 4)
     self.assertAlmostEqual(rnn[1, "featureA"].iloc[0], 1.0)
     self.assertAlmostEqual(rnn[0, "featureA"].iloc[-1], 10)
     self.assertAlmostEqual(rnn[1, "featureA"].iloc[-1], 7.0)
Example #7
 def test_auto_regression(self):
     df = TEST_DF.copy()
     rnn = lag_columns(df[["Close", "Volume"]], [1, 2]).dropna()
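     # lags [1, 2] turn the first two rows into NaNs, which dropna removes,
     # leaving two rows fewer than TEST_DF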
     self.assertEqual(6823, len(rnn))
Example #8
    def test_soft_dtw_loss(self):
        df = TEST_DF[["Close"]][-21:].copy()

        class LstmAutoEncoder(PytorchNN):
            def __init__(self):
                super().__init__()
                self.input_size = 1
                self.seq_size = 10
                self.hidden_size = 2
                self.num_layers = 1
                self.num_directions = 1

                self._encoder = nn.RNN(
                    input_size=self.input_size, hidden_size=self.hidden_size,
                    num_layers=self.num_layers, batch_first=True)

                self._decoder = nn.RNN(
                    input_size=self.hidden_size, hidden_size=self.input_size,
                    num_layers=self.num_layers, batch_first=True)

            def forward_training(self, x):
                # make sure to treat single elements as batches
                x = x.view(-1, self.seq_size, self.input_size)
                batch_size = len(x)

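                # fresh zero hidden states for every call; wrapping them in
                # nn.Parameter keeps them as tensors the RNN accepts, though a
                # plain zeros tensor would work as well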
                hidden_encoder = nn.Parameter(
                    t.zeros(self.num_layers * self.num_directions, batch_size,
                            self.hidden_size))
                hidden_decoder = nn.Parameter(
                    t.zeros(self.num_layers * self.num_directions, batch_size,
                            self.input_size))

                x, _ = self._encoder(x, hidden_encoder)
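                # repeat the second-to-last encoder output across the full
                # sequence length as the decoder input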
                x = t.repeat_interleave(x[:, -2:-1], x.shape[1], dim=1)
                x, hidden = self._decoder(x, hidden_decoder)
                return x.squeeze()

            def encode(self, x):
                x = x.reshape(-1, self.seq_size, self.input_size)
                batch_size = len(x)

                with t.no_grad():
                    hidden = nn.Parameter(
                        t.zeros(self.num_layers * self.num_directions,
                                batch_size, self.hidden_size))

                    # return last element of sequence
                    return self._encoder(x, hidden)[0][:, -1]

            def decode(self, x):
                x = x.reshape(-1, self.seq_size, self.hidden_size)
                batch_size = len(x)

                with t.no_grad():
                    hidden = nn.Parameter(
                        t.zeros(self.num_layers * self.num_directions,
                                batch_size, self.input_size))
                    return self._decoder(x.float(), hidden)[0]

        model = PytorchAutoEncoderModel(
            LstmAutoEncoder,
            PostProcessedFeaturesAndLabels(
                df.columns.to_list(),
                [lambda df: lag_columns(df, 10).dropna()],
                df.columns.to_list(),
                [lambda df: lag_columns(df, 10).dropna()],
                ["condensed-a", "condensed-b"]
            ),
            SoftDTW,
            Adam
        )

        with df.model() as m:
            fit = m.fit(model, FittingParameter(epochs=100))
            print(fit.test_summary.df)

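            # as_encoder() exposes just the encoding half, so predict yields
            # the two condensed latent features per sequence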
            encoded = df.model.predict(fit.model.as_encoder())
            print(encoded)