def test_nested_postprocessors(self):
    df = pd.DataFrame({
        "a": np.arange(20),
        "b": np.arange(20),
        "c": np.arange(20)
    })

    def outer_postprocessor(df):
        return df[["b"]]

    fl: FeaturesWithLabels = df._.extract(
        PostProcessedFeaturesAndLabels.from_features_and_labels(
            PostProcessedFeaturesAndLabels(
                features=["a"],
                feature_post_processor=[
                    lambda df: lag_columns(df, [1, 2])
                ],
                labels=["b", "c"],
                labels_post_processor=[
                    lambda df: df + 0.5,
                    lambda df: df + 0.001
                ],
            ),
            labels_post_processor=[outer_postprocessor],
        ))

    self.assertEqual((18, 1), fl.labels.shape)
    self.assertEqual(19.501, fl.labels.iloc[-1, -1])
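# Note on the expected values above (illustrative, not part of the original test):
# the inner label post-processors run in sequence (df + 0.5, then df + 0.001) and the
# outer post-processor then keeps only column "b", so the last label value is
# 19 + 0.5 + 0.001 = 19.501. Lagging the feature by [1, 2] drops the first two rows,
# which leaves 18 aligned label rows and hence the (18, 1) shape.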
def test_make_model(self):
    notebooks_path = os.path.join(PWD, '..', 'examples')
    df = pd.read_csv(os.path.join(notebooks_path, 'SPY.csv'))

    with df.model("/tmp/pijsfnwuacpa.model") as m:
        from torch import nn
        from torch.optim import SGD
        from pandas_ml_common.utils.column_lagging_utils import lag_columns
        from pandas_ml_utils import FeaturesAndLabels, RegressionSummary, FittingParameter
        from pandas_ml_utils_torch import PytorchModel
        from pandas_ml_utils_torch.merging_cross_folds import take_the_best

        def net_provider():
            from pandas_ml_utils_torch import PytorchNN

            class Net(PytorchNN):

                def __init__(self):
                    super().__init__()
                    self.net = nn.Sequential(
                        nn.Linear(10, 4),
                        nn.Tanh(),
                        nn.Linear(4, 4),
                        nn.Tanh(),
                        nn.Linear(4, 1),
                        nn.Tanh(),
                    )

                def L1(self):
                    # path to the parameters which should be regularized
                    # the path is constructed from self.named_parameters() and allows the use of wildcards
                    return {'net/0/**/weight': 0.02}

                def L2(self):
                    return {
                        'net/0/**/weight': 0.02,
                        'net/2/**/weight': 0.05
                    }

                def forward_training(self, x):
                    return self.net(x)

            return Net()

        fit = m.fit(
            PytorchModel(
                net_provider,
                FeaturesAndLabels(
                    [lambda df: lag_columns(df["Close"].pct_change(), range(10))],
                    [lambda df: df["Close"].pct_change().shift(-1)]),
                nn.MSELoss,
                lambda params: SGD(params, lr=0.01, momentum=0.0),
                merge_cross_folds=take_the_best,
                summary_provider=RegressionSummary
            ),
            FittingParameter(epochs=2),
            verbose=1
        )
def test_auto_regression_simple(self):
    df = TEST_DF.copy()
    rnn = lag_columns(df[["Close", "Volume"]], [1, 2])

    np.testing.assert_array_almost_equal(
        np.array([[
            df["Close"].iloc[-2], df["Close"].iloc[-3],
            df["Volume"].iloc[-2], df["Volume"].iloc[-3]
        ]]),
        rnn[-1:].values,
        decimal=4)  # compare to 4 decimal places (the third argument is the decimal precision)
def test_post_processing_multiindex_row(self):
    df = pd.DataFrame({"a": np.arange(20), "b": np.arange(20)})
    df.index = pd.MultiIndex.from_product([["A", "B"], range(10)])

    fl: FeaturesWithLabels = df._.extract(
        PostProcessedFeaturesAndLabels(
            features=["a"],
            feature_post_processor=lambda df: lag_columns(df, [1, 2]),
            labels=["b"],
            labels_post_processor=lambda df: df * 2,
        ))

    self.assertIsInstance(fl.features.index, pd.MultiIndex)
    self.assertIsInstance(fl.labels.index, pd.MultiIndex)
def ta_rnn(
        df: Union[pd.Series, pd.DataFrame],
        feature_lags: Iterable[int],
        lag_smoothing: Dict[int, Callable[[pd.Series], pd.Series]] = None,
        return_min_required_samples=False
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, int]]:
    # lag columns
    dff = lag_columns(df, feature_lags, lag_smoothing)

    # drop all rows which got nan now
    dff = dff.dropna()

    if return_min_required_samples:
        return dff, len(df) - len(dff)
    else:
        return dff
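# Hedged usage sketch (illustration only, `_ta_rnn_usage_sketch` is not part of the
# library): how ta_rnn might be called on a small synthetic price frame. With
# return_min_required_samples=True the second return value is the number of leading
# rows dropped because lagging (and smoothing) introduced NaNs.
def _ta_rnn_usage_sketch():
    prices = pd.DataFrame({"Close": np.arange(10.0)})

    lagged, min_samples = ta_rnn(
        prices[["Close"]],
        feature_lags=[0, 1, 2],
        lag_smoothing={2: lambda df: df["Close"].rolling(2).mean()},
        return_min_required_samples=True,
    )

    # by construction the dropped rows equal the length difference between input and output
    assert min_samples == len(prices) - len(lagged)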
def test_lag_smoothing_nan(self):
    """given"""
    df = pd.DataFrame({"featureA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
    #                               1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # original
    #                                  1, 2, 3, 4, 5, 6, 7, 8, 9]   # lag 1
    #                                        1, 2, 3, 4, 5, 6, 7]   # lag 1 + shift 2
    #                                        ^
    #                                        this is where the df starts

    """when lag smoothing is enabled using shift (which is introducing nan into the data frame)"""
    rnn = lag_columns(df[["featureA"]],
                      feature_lags=[0, 1],
                      lag_smoothing={1: lambda df: df["featureA"].shift(2)}).dropna()

    """then"""
    self.assertAlmostEqual(rnn[0, "featureA"].iloc[0], 4)
    self.assertAlmostEqual(rnn[1, "featureA"].iloc[0], 1.0)
    self.assertAlmostEqual(rnn[0, "featureA"].iloc[-1], 10)
    self.assertAlmostEqual(rnn[1, "featureA"].iloc[-1], 7.0)
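# Hedged sketch (an assumption inferred from the indexing above, `_lag_columns_layout_sketch`
# is not part of the original tests): lag_columns appears to return a frame whose columns
# are keyed by (lag, original column name), which is why the assertions index with
# rnn[<lag>, "featureA"]. A minimal illustration without smoothing:
def _lag_columns_layout_sketch():
    df = pd.DataFrame({"featureA": [1, 2, 3, 4, 5]})
    lagged = lag_columns(df[["featureA"]], feature_lags=[0, 1]).dropna()

    # after dropping the NaN row introduced by lag 1, the first surviving row holds
    # the current value under lag 0 and the previous value under lag 1
    assert lagged[0, "featureA"].iloc[0] == 2
    assert lagged[1, "featureA"].iloc[0] == 1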
def test_auto_regression(self):
    df = TEST_DF.copy()
    rnn = lag_columns(df[["Close", "Volume"]], [1, 2]).dropna()

    self.assertEqual(6823, len(rnn))
def test_soft_dtw_loss(self):
    df = TEST_DF[["Close"]][-21:].copy()

    class LstmAutoEncoder(PytorchNN):

        def __init__(self):
            super().__init__()
            self.input_size = 1
            self.seq_size = 10
            self.hidden_size = 2
            self.num_layers = 1
            self.num_directions = 1

            self._encoder = \
                nn.RNN(input_size=self.input_size, hidden_size=self.hidden_size,
                       num_layers=self.num_layers, batch_first=True)

            self._decoder = \
                nn.RNN(input_size=self.hidden_size, hidden_size=self.input_size,
                       num_layers=self.num_layers, batch_first=True)

        def forward_training(self, x):
            # make sure to treat single elements as batches
            x = x.view(-1, self.seq_size, self.input_size)
            batch_size = len(x)

            hidden_encoder = nn.Parameter(
                t.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size))
            hidden_decoder = nn.Parameter(
                t.zeros(self.num_layers * self.num_directions, batch_size, self.input_size))

            x, _ = self._encoder(x, hidden_encoder)
            x = t.repeat_interleave(x[:, -2:-1], x.shape[1], dim=1)
            x, hidden = self._decoder(x, hidden_decoder)
            return x.squeeze()

        def encode(self, x):
            x = x.reshape(-1, self.seq_size, self.input_size)
            batch_size = len(x)

            with t.no_grad():
                hidden = nn.Parameter(
                    t.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size))

                # return last element of sequence
                return self._encoder(x, hidden)[0][:, -1]

        def decode(self, x):
            x = x.reshape(-1, self.seq_size, self.hidden_size)
            batch_size = len(x)

            with t.no_grad():
                hidden = nn.Parameter(
                    t.zeros(self.num_layers * self.num_directions, batch_size, self.input_size))

                return self._decoder(x.float(), hidden)[0]

    model = PytorchAutoEncoderModel(
        LstmAutoEncoder,
        PostProcessedFeaturesAndLabels(
            df.columns.to_list(), [lambda df: lag_columns(df, 10).dropna()],
            df.columns.to_list(), [lambda df: lag_columns(df, 10).dropna()],
            ["condensed-a", "condensed-b"]),
        SoftDTW,
        Adam)

    with df.model() as m:
        fit = m.fit(model, FittingParameter(epochs=100))
        print(fit.test_summary.df)

        encoded = df.model.predict(fit.model.as_encoder())
        print(encoded)