def test_forecast_tcmf_distributed(self):
    """Exercise fit/save/load/predict/fit_incremental/evaluate with 4 workers.

    An Orca context (with Ray on Spark) is started for the duration of the
    test and is always stopped again, even when an assertion fails, so a
    failure here cannot leak a running context into later tests.
    """
    from zoo.orca import init_orca_context, stop_orca_context

    train_data = {'id': self.id, 'y': self.data}

    init_orca_context(cores=4, spark_log_level="INFO", init_ray_on_spark=True,
                      object_store_memory="1g")
    try:
        self.model.fit(train_data, num_workers=4, **self.fit_params)

        with tempfile.TemporaryDirectory() as tempdirname:
            self.model.save(tempdirname)
            loaded_model = TCMFForecaster.load(tempdirname,
                                               is_xshards_distributed=False)
            yhat = self.model.predict(horizon=self.horizon, num_workers=4)
            yhat_loaded = loaded_model.predict(horizon=self.horizon, num_workers=4)

        # The reloaded model must report the ids it was trained with.
        yhat_id = yhat_loaded["id"]
        np.testing.assert_equal(yhat_id, self.id)

        yhat = yhat["prediction"]
        yhat_loaded = yhat_loaded["prediction"]
        assert yhat.shape == (self.num_samples, self.horizon)
        np.testing.assert_equal(yhat, yhat_loaded)

        # Incremental fitting is expected to change the predictions, i.e.
        # assert_array_equal(yhat, yhat_incr) has to raise AssertionError.
        self.model.fit_incremental({'y': self.data_new})
        yhat_incr = self.model.predict(horizon=self.horizon)
        yhat_incr = yhat_incr["prediction"]
        assert yhat_incr.shape == (self.num_samples, self.horizon)
        np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                                 yhat, yhat_incr)

        target_value = {"y": self.data_new}
        assert self.model.evaluate(target_value=target_value, metric=['mse'])
    finally:
        # Release the Spark/Ray resources regardless of the test outcome.
        stop_orca_context()
def test_forecast_tcmf_ndarray(self):
    """Run the ndarray workflow: fit, predict, save/load, evaluate, fit_incremental."""
    self.model.fit({'id': self.id, 'y': self.data}, **self.fit_params)
    assert not self.model.is_xshards_distributed()

    # Predict with the freshly fitted model.
    result = self.model.predict(horizon=self.horizon)

    # Round-trip the model through disk and predict with the restored copy.
    with tempfile.TemporaryDirectory() as model_dir:
        self.model.save(model_dir)
        restored = TCMFForecaster.load(model_dir, is_xshards_distributed=False)
        restored_result = restored.predict(horizon=self.horizon)

    np.testing.assert_equal(restored_result["id"], self.id)

    expected_shape = (self.num_samples, self.horizon)
    pred = result["prediction"]
    restored_pred = restored_result["prediction"]
    assert pred.shape == expected_shape
    np.testing.assert_array_almost_equal(pred, restored_pred, decimal=4)

    # Evaluate against the held-out values.
    assert self.model.evaluate(target_value={"y": self.data_new}, metric=['mse'])

    # Fit incrementally twice in a row, then check the predictions moved:
    # assert_array_equal(pred, incr_pred) is required to raise AssertionError.
    for _ in range(2):
        self.model.fit_incremental({'y': self.data_new})
    incr_pred = self.model.predict(horizon=self.horizon)["prediction"]
    assert incr_pred.shape == expected_shape
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                             pred, incr_pred)
def setUp(self):
    """Build a fresh forecaster and random fixtures before every test.

    Sets ``model``, ``num_samples``, ``horizon`` (random integer drawn from
    ``np.random.randint(1, 50)``), ``seq_len``, ``data``, ``id``,
    ``data_new`` and ``fit_params`` on the test instance.
    """
    n_series, n_steps = 300, 480

    self.model = TCMFForecaster()
    self.num_samples = n_series
    self.horizon = np.random.randint(1, 50)
    self.seq_len = n_steps
    # Training matrix: one row per series, one column per time step.
    self.data = np.random.rand(n_series, n_steps)
    self.id = np.arange(n_series)
    # Follow-up values covering exactly `horizon` steps for every series.
    self.data_new = np.random.rand(n_series, self.horizon)
    # Single-epoch settings passed to every fit() call in the tests.
    self.fit_params = {
        "val_len": 12,
        "start_date": "2020-1-1",
        "freq": "5min",
        "y_iters": 1,
        "init_FX_epoch": 1,
        "max_FX_epoch": 1,
        "max_TCN_epoch": 1,
        "alt_iters": 2,
    }
def test_forecast_tcmf_without_id(self):
    """Run the ndarray workflow when the training input carries no ``id`` key.

    Checks that predictions then contain no ``id`` entry, that a saved and
    reloaded model predicts (almost) the same values, and that passing ids
    to ``fit_incremental`` afterwards raises ``ValueError``.
    """
    # construct data
    train_data = {'y': self.data}
    self.model.fit(train_data, **self.fit_params)
    assert not self.model.is_xshards_distributed()

    with tempfile.TemporaryDirectory() as tempdirname:
        self.model.save(tempdirname)
        loaded_model = TCMFForecaster.load(tempdirname, is_xshards_distributed=False)
        yhat = self.model.predict(horizon=self.horizon)
        yhat_loaded = loaded_model.predict(horizon=self.horizon)

    assert "id" not in yhat_loaded
    yhat = yhat["prediction"]
    yhat_loaded = yhat_loaded["prediction"]
    assert yhat.shape == (self.num_samples, self.horizon)
    np.testing.assert_array_almost_equal(yhat, yhat_loaded, decimal=4)

    # Check the evaluation result the same way the other ndarray tests do,
    # instead of silently discarding it.
    target_value = {"y": self.data_new}
    assert self.model.evaluate(target_value=target_value, metric=['mse'])

    # Incremental fitting is expected to change the predictions, i.e.
    # assert_array_equal(yhat, yhat_incr) has to raise AssertionError.
    self.model.fit_incremental({'y': self.data_new})  # 1st time
    yhat_incr = self.model.predict(horizon=self.horizon)
    yhat_incr = yhat_incr["prediction"]
    assert yhat_incr.shape == (self.num_samples, self.horizon)
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                             yhat, yhat_incr)

    # Supplying ids now is an error because fit() was called without them.
    data_new_id = {'id': self.id, 'y': self.data_new}
    with self.assertRaises(ValueError) as context:
        self.model.fit_incremental(data_new_id)
    self.assertIn('Got valid id in fit_incremental and invalid id in fit.',
                  str(context.exception))
def test_tcmf_ndarray_covariates_dti(self):
    """Run the ndarray workflow with covariates and a datetime index supplied."""
    train_covariates = np.random.rand(3, self.seq_len)
    train_dti = pd.date_range('20130101', periods=self.seq_len)
    self.model.fit({'id': self.id, 'y': self.data},
                   covariates=train_covariates,
                   dti=train_dti,
                   **self.fit_params)

    future_covariates = np.random.randn(3, self.horizon)
    future_dti = pd.date_range('20130101', periods=self.horizon)
    # Keyword arguments shared by every predict() call below.
    predict_kwargs = {
        "horizon": self.horizon,
        "future_covariates": future_covariates,
        "future_dti": future_dti,
    }

    # Predict with the freshly fitted model.
    result = self.model.predict(**predict_kwargs)

    # Round-trip the model through disk and predict with the restored copy.
    with tempfile.TemporaryDirectory() as model_dir:
        self.model.save(model_dir)
        restored = TCMFForecaster.load(model_dir, is_xshards_distributed=False)
        restored_result = restored.predict(**predict_kwargs)

    np.testing.assert_equal(restored_result["id"], self.id)

    expected_shape = (self.num_samples, self.horizon)
    pred = result["prediction"]
    restored_pred = restored_result["prediction"]
    assert pred.shape == expected_shape
    np.testing.assert_array_almost_equal(pred, restored_pred, decimal=4)

    # Evaluate against the held-out values.
    assert self.model.evaluate(target_value={"y": self.data_new},
                               target_covariates=future_covariates,
                               target_dti=future_dti,
                               metric=['mse'])

    # Fit incrementally, then check the predictions moved:
    # assert_array_equal(pred, incr_pred) is required to raise AssertionError.
    self.model.fit_incremental({'y': self.data_new},
                               covariates_incr=future_covariates,
                               dti_incr=future_dti)
    incr_pred = self.model.predict(**predict_kwargs)["prediction"]
    assert incr_pred.shape == expected_shape
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                             pred, incr_pred)
class TestChronosModelTCMFForecaster(TestCase):
    """Tests for TCMFForecaster with ndarray input, XShards input and 4 workers."""

    def setUp(self):
        """Create a fresh forecaster and random fixtures before every test."""
        self.model = TCMFForecaster()
        self.num_samples = 300
        self.horizon = np.random.randint(1, 50)
        self.seq_len = 480
        self.data = np.random.rand(self.num_samples, self.seq_len)
        self.id = np.arange(self.num_samples)
        self.data_new = np.random.rand(self.num_samples, self.horizon)
        # Single-epoch settings passed to every fit() call in the tests.
        self.fit_params = dict(val_len=12,
                               start_date="2020-1-1",
                               freq="5min",
                               y_iters=1,
                               init_FX_epoch=1,
                               max_FX_epoch=1,
                               max_TCN_epoch=1,
                               alt_iters=2)

    def test_forecast_tcmf_ndarray(self):
        """Run the ndarray workflow: fit, predict, save/load, evaluate, fit_incremental."""
        ndarray_input = {'id': self.id, 'y': self.data}
        self.model.fit(ndarray_input, **self.fit_params)
        assert not self.model.is_xshards_distributed()

        # test predict
        yhat = self.model.predict(horizon=self.horizon)

        # test save load
        with tempfile.TemporaryDirectory() as tempdirname:
            self.model.save(tempdirname)
            loaded_model = TCMFForecaster.load(tempdirname, is_xshards_distributed=False)
            yhat_loaded = loaded_model.predict(horizon=self.horizon)

        yhat_id = yhat_loaded["id"]
        np.testing.assert_equal(yhat_id, self.id)
        yhat = yhat["prediction"]
        yhat_loaded = yhat_loaded["prediction"]
        assert yhat.shape == (self.num_samples, self.horizon)
        np.testing.assert_array_almost_equal(yhat, yhat_loaded, decimal=4)

        # test evaluate
        target_value = {"y": self.data_new}
        assert self.model.evaluate(target_value=target_value, metric=['mse'])

        # test fit_incremental
        self.model.fit_incremental({'y': self.data_new})  # 1st time
        self.model.fit_incremental({'y': self.data_new})  # 2nd time
        yhat_incr = self.model.predict(horizon=self.horizon)
        yhat_incr = yhat_incr["prediction"]
        assert yhat_incr.shape == (self.num_samples, self.horizon)
        # Predictions must differ after incremental fitting.
        np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                                 yhat, yhat_incr)

    def test_tcmf_ndarray_covariates_dti(self):
        """Run the ndarray workflow with covariates and a datetime index supplied."""
        ndarray_input = {'id': self.id, 'y': self.data}
        self.model.fit(ndarray_input,
                       covariates=np.random.rand(3, self.seq_len),
                       dti=pd.date_range('20130101', periods=self.seq_len),
                       **self.fit_params)
        future_covariates = np.random.randn(3, self.horizon)
        future_dti = pd.date_range('20130101', periods=self.horizon)

        # test predict
        yhat = self.model.predict(horizon=self.horizon,
                                  future_covariates=future_covariates,
                                  future_dti=future_dti)

        # test save load
        with tempfile.TemporaryDirectory() as tempdirname:
            self.model.save(tempdirname)
            loaded_model = TCMFForecaster.load(tempdirname, is_xshards_distributed=False)
            yhat_loaded = loaded_model.predict(horizon=self.horizon,
                                               future_covariates=future_covariates,
                                               future_dti=future_dti)

        yhat_id = yhat_loaded["id"]
        np.testing.assert_equal(yhat_id, self.id)
        yhat = yhat["prediction"]
        yhat_loaded = yhat_loaded["prediction"]
        assert yhat.shape == (self.num_samples, self.horizon)
        np.testing.assert_array_almost_equal(yhat, yhat_loaded, decimal=4)

        # test evaluate
        target_value = {"y": self.data_new}
        assert self.model.evaluate(target_value=target_value,
                                   target_covariates=future_covariates,
                                   target_dti=future_dti,
                                   metric=['mse'])

        # test fit_incremental
        self.model.fit_incremental({'y': self.data_new},
                                   covariates_incr=future_covariates,
                                   dti_incr=future_dti)
        yhat_incr = self.model.predict(horizon=self.horizon,
                                       future_covariates=future_covariates,
                                       future_dti=future_dti)
        yhat_incr = yhat_incr["prediction"]
        assert yhat_incr.shape == (self.num_samples, self.horizon)
        # Predictions must differ after incremental fitting.
        np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                                 yhat, yhat_incr)

    def test_forecast_ndarray_error(self):
        """Check the error messages raised for invalid ndarray usage."""
        # is_xshards_distributed
        with self.assertRaises(Exception) as context:
            self.model.is_xshards_distributed()
        self.assertIn('You should run fit before calling is_xshards_distributed()',
                      str(context.exception))

        # fit
        bad_input = {'data': self.data}
        with self.assertRaises(Exception) as context:
            self.model.fit(bad_input)
        self.assertIn("key `y` doesn't exist in x", str(context.exception))

        bad_input = {'y': "abc"}
        with self.assertRaises(Exception) as context:
            self.model.fit(bad_input)
        self.assertIn("the value of y should be an ndarray", str(context.exception))

        # 200 ids for 300 series: the lengths do not match.
        id_diff = np.arange(200)
        bad_input = {'id': id_diff, 'y': self.data}
        with self.assertRaises(Exception) as context:
            self.model.fit(bad_input)
        self.assertIn("the length of the id array should be equal to the number of",
                      str(context.exception))

        input_right = {'id': self.id, 'y': self.data}
        self.model.fit(input_right, **self.fit_params)
        with self.assertRaises(Exception) as context:
            self.model.fit(input_right)
        self.assertIn('This model has already been fully trained',
                      str(context.exception))

        # fit_incremental
        data_id_diff = {'id': self.id - 1, 'y': self.data_new}
        with self.assertRaises(ValueError) as context:
            self.model.fit_incremental(data_id_diff)
        self.assertIn('The input ids in fit_incremental differs from input ids in fit',
                      str(context.exception))

        # evaluate
        target_value_fake = {"data": self.data_new}
        with self.assertRaises(Exception) as context:
            self.model.evaluate(target_value=target_value_fake, metric=['mse'])
        self.assertIn("key `y` doesn't exist in x", str(context.exception))

    def test_forecast_tcmf_without_id(self):
        """Run the ndarray workflow when the training input carries no ``id`` key."""
        # construct data
        train_data = {'y': self.data}
        self.model.fit(train_data, **self.fit_params)
        assert not self.model.is_xshards_distributed()

        with tempfile.TemporaryDirectory() as tempdirname:
            self.model.save(tempdirname)
            loaded_model = TCMFForecaster.load(tempdirname, is_xshards_distributed=False)
            yhat = self.model.predict(horizon=self.horizon)
            yhat_loaded = loaded_model.predict(horizon=self.horizon)

        assert "id" not in yhat_loaded
        yhat = yhat["prediction"]
        yhat_loaded = yhat_loaded["prediction"]
        assert yhat.shape == (self.num_samples, self.horizon)
        np.testing.assert_array_almost_equal(yhat, yhat_loaded, decimal=4)

        # Check the evaluation result the same way the other ndarray tests do.
        target_value = {"y": self.data_new}
        assert self.model.evaluate(target_value=target_value, metric=['mse'])

        self.model.fit_incremental({'y': self.data_new})  # 1st time
        yhat_incr = self.model.predict(horizon=self.horizon)
        yhat_incr = yhat_incr["prediction"]
        assert yhat_incr.shape == (self.num_samples, self.horizon)
        # Predictions must differ after incremental fitting.
        np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                                 yhat, yhat_incr)

        # Supplying ids now is an error because fit() was called without them.
        data_new_id = {'id': self.id, 'y': self.data_new}
        with self.assertRaises(ValueError) as context:
            self.model.fit_incremental(data_new_id)
        self.assertIn('Got valid id in fit_incremental and invalid id in fit.',
                      str(context.exception))

    def test_forecast_tcmf_xshards(self):
        """Run fit/save/load/predict on XShards input and check the error paths.

        ``OrcaContext.pandas_read_backend`` is switched to "pandas" for the
        test and set back to "spark" in a ``finally`` block, so a failing
        assertion cannot leave the global setting changed for other tests.
        """
        from zoo.orca import OrcaContext
        import zoo.orca.data.pandas
        import pandas as pd

        def preprocessing(df, id_name, y_name):
            # Turn one pandas shard into the dict layout passed to fit().
            ids = df.index
            data = df.to_numpy()
            return {id_name: ids, y_name: data}

        def postprocessing(pred_results, output_dt_col_name):
            # Reshape one prediction shard into long format with the columns
            # (datetime, id, prediction).
            id_arr = pred_results["id"]
            pred_results = pred_results["prediction"]
            pred_results = np.concatenate((np.expand_dims(id_arr, axis=1), pred_results),
                                          axis=1)
            final_df = pd.DataFrame(pred_results, columns=["id"] + output_dt_col_name)
            final_df.id = final_df.id.astype("int")
            final_df = final_df.set_index("id")
            final_df.columns.name = "datetime"
            final_df = final_df.unstack().reset_index().rename({0: "prediction"}, axis=1)
            return final_df

        def get_pred(d):
            return d["prediction"]

        OrcaContext.pandas_read_backend = "pandas"
        try:
            # The CSV is kept alive for as long as the shards may be read.
            with tempfile.NamedTemporaryFile() as temp:
                data = np.random.rand(300, 480)
                df = pd.DataFrame(data)
                df.to_csv(temp.name)
                shard = zoo.orca.data.pandas.read_csv(temp.name)
                shard.cache()

                # Missing `y` key.
                shard_train = shard.transform_shard(preprocessing, 'id', 'data')
                with self.assertRaises(Exception) as context:
                    self.model.fit(shard_train)
                self.assertIn("key `y` doesn't exist in x", str(context.exception))

                # Missing `id` key.
                shard_train = shard.transform_shard(preprocessing, 'cid', 'y')
                with self.assertRaises(Exception) as context:
                    self.model.fit(shard_train)
                self.assertIn("key `id` doesn't exist in x", str(context.exception))

                with self.assertRaises(Exception) as context:
                    self.model.is_xshards_distributed()
                self.assertIn('You should run fit before calling is_xshards_distributed()',
                              str(context.exception))

                shard_train = shard.transform_shard(preprocessing, 'id', 'y')
                self.model.fit(shard_train, **self.fit_params)
                assert self.model.is_xshards_distributed()

                with self.assertRaises(Exception) as context:
                    self.model.fit(shard_train)
                self.assertIn('This model has already been fully trained',
                              str(context.exception))

                with self.assertRaises(Exception) as context:
                    self.model.fit_incremental(shard_train)
                self.assertIn('NotImplementedError', context.exception.__class__.__name__)

                with tempfile.TemporaryDirectory() as tempdirname:
                    self.model.save(tempdirname + "/model")
                    loaded_model = TCMFForecaster.load(tempdirname + "/model",
                                                       is_xshards_distributed=True)
                    horizon = np.random.randint(1, 50)

                    yhat_shard_origin = self.model.predict(horizon=horizon)
                    yhat_list_origin = yhat_shard_origin.collect()
                    yhat_list_origin = list(map(get_pred, yhat_list_origin))

                    yhat_shard = loaded_model.predict(horizon=horizon)
                    yhat_list = yhat_shard.collect()
                    yhat_list = list(map(get_pred, yhat_list))

                    yhat_origin = np.concatenate(yhat_list_origin)
                    yhat = np.concatenate(yhat_list)
                    assert yhat.shape == (300, horizon)
                    np.testing.assert_equal(yhat, yhat_origin)

                    output_dt_col_name = pd.date_range(start='2020-05-01',
                                                       periods=horizon,
                                                       freq='H').to_list()
                    yhat_df_shards = yhat_shard.transform_shard(postprocessing,
                                                                output_dt_col_name)
                    final_df_list = yhat_df_shards.collect()
                    final_df = pd.concat(final_df_list)
                    final_df.sort_values("datetime", inplace=True)
                    assert final_df.shape == (300 * horizon, 3)
        finally:
            OrcaContext.pandas_read_backend = "spark"

    def test_forecast_tcmf_distributed(self):
        """Exercise fit/save/load/predict/fit_incremental/evaluate with 4 workers.

        The Orca context started here is always stopped again, even when an
        assertion fails.
        """
        from zoo.orca import init_orca_context, stop_orca_context

        train_data = {'id': self.id, 'y': self.data}

        init_orca_context(cores=4, spark_log_level="INFO", init_ray_on_spark=True,
                          object_store_memory="1g")
        try:
            self.model.fit(train_data, num_workers=4, **self.fit_params)

            with tempfile.TemporaryDirectory() as tempdirname:
                self.model.save(tempdirname)
                loaded_model = TCMFForecaster.load(tempdirname,
                                                   is_xshards_distributed=False)
                yhat = self.model.predict(horizon=self.horizon, num_workers=4)
                yhat_loaded = loaded_model.predict(horizon=self.horizon, num_workers=4)

            yhat_id = yhat_loaded["id"]
            np.testing.assert_equal(yhat_id, self.id)
            yhat = yhat["prediction"]
            yhat_loaded = yhat_loaded["prediction"]
            assert yhat.shape == (self.num_samples, self.horizon)
            np.testing.assert_equal(yhat, yhat_loaded)

            self.model.fit_incremental({'y': self.data_new})
            yhat_incr = self.model.predict(horizon=self.horizon)
            yhat_incr = yhat_incr["prediction"]
            assert yhat_incr.shape == (self.num_samples, self.horizon)
            # Predictions must differ after incremental fitting.
            np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                                     yhat, yhat_incr)

            target_value = {"y": self.data_new}
            assert self.model.evaluate(target_value=target_value, metric=['mse'])
        finally:
            # Release the Spark/Ray resources regardless of the test outcome.
            stop_orca_context()
def test_forecast_tcmf_xshards(self):
    """Run fit/save/load/predict on XShards input and check the error paths.

    ``OrcaContext.pandas_read_backend`` is switched to "pandas" for the test
    and set back to "spark" in a ``finally`` block, so a failing assertion
    cannot leave the global setting changed for other tests.
    """
    from zoo.orca import OrcaContext
    import zoo.orca.data.pandas
    import pandas as pd

    def preprocessing(df, id_name, y_name):
        # Turn one pandas shard into the dict layout passed to fit().
        ids = df.index
        data = df.to_numpy()
        return {id_name: ids, y_name: data}

    def postprocessing(pred_results, output_dt_col_name):
        # Reshape one prediction shard into long format with the columns
        # (datetime, id, prediction).
        id_arr = pred_results["id"]
        pred_results = pred_results["prediction"]
        pred_results = np.concatenate((np.expand_dims(id_arr, axis=1), pred_results),
                                      axis=1)
        final_df = pd.DataFrame(pred_results, columns=["id"] + output_dt_col_name)
        final_df.id = final_df.id.astype("int")
        final_df = final_df.set_index("id")
        final_df.columns.name = "datetime"
        final_df = final_df.unstack().reset_index().rename({0: "prediction"}, axis=1)
        return final_df

    def get_pred(d):
        return d["prediction"]

    OrcaContext.pandas_read_backend = "pandas"
    try:
        # The CSV is kept alive for as long as the shards may be read.
        with tempfile.NamedTemporaryFile() as temp:
            data = np.random.rand(300, 480)
            df = pd.DataFrame(data)
            df.to_csv(temp.name)
            shard = zoo.orca.data.pandas.read_csv(temp.name)
            shard.cache()

            # Missing `y` key.
            shard_train = shard.transform_shard(preprocessing, 'id', 'data')
            with self.assertRaises(Exception) as context:
                self.model.fit(shard_train)
            self.assertIn("key `y` doesn't exist in x", str(context.exception))

            # Missing `id` key.
            shard_train = shard.transform_shard(preprocessing, 'cid', 'y')
            with self.assertRaises(Exception) as context:
                self.model.fit(shard_train)
            self.assertIn("key `id` doesn't exist in x", str(context.exception))

            with self.assertRaises(Exception) as context:
                self.model.is_xshards_distributed()
            self.assertIn('You should run fit before calling is_xshards_distributed()',
                          str(context.exception))

            shard_train = shard.transform_shard(preprocessing, 'id', 'y')
            self.model.fit(shard_train, **self.fit_params)
            assert self.model.is_xshards_distributed()

            with self.assertRaises(Exception) as context:
                self.model.fit(shard_train)
            self.assertIn('This model has already been fully trained',
                          str(context.exception))

            with self.assertRaises(Exception) as context:
                self.model.fit_incremental(shard_train)
            self.assertIn('NotImplementedError', context.exception.__class__.__name__)

            with tempfile.TemporaryDirectory() as tempdirname:
                self.model.save(tempdirname + "/model")
                loaded_model = TCMFForecaster.load(tempdirname + "/model",
                                                   is_xshards_distributed=True)
                horizon = np.random.randint(1, 50)

                yhat_shard_origin = self.model.predict(horizon=horizon)
                yhat_list_origin = yhat_shard_origin.collect()
                yhat_list_origin = list(map(get_pred, yhat_list_origin))

                yhat_shard = loaded_model.predict(horizon=horizon)
                yhat_list = yhat_shard.collect()
                yhat_list = list(map(get_pred, yhat_list))

                # Original and reloaded models must predict identical values.
                yhat_origin = np.concatenate(yhat_list_origin)
                yhat = np.concatenate(yhat_list)
                assert yhat.shape == (300, horizon)
                np.testing.assert_equal(yhat, yhat_origin)

                output_dt_col_name = pd.date_range(start='2020-05-01',
                                                   periods=horizon,
                                                   freq='H').to_list()
                yhat_df_shards = yhat_shard.transform_shard(postprocessing,
                                                            output_dt_col_name)
                final_df_list = yhat_df_shards.collect()
                final_df = pd.concat(final_df_list)
                final_df.sort_values("datetime", inplace=True)
                assert final_df.shape == (300 * horizon, 3)
    finally:
        OrcaContext.pandas_read_backend = "spark"
args = parser.parse_args() num_nodes = 1 if args.cluster_mode == "local" else args.num_workers init_orca_context(cluster_mode=args.cluster_mode, cores=args.cores, num_nodes=num_nodes, memory=args.memory, init_ray_on_spark=True) if not args.use_dummy_data: assert args.data_dir is not None, "--data_dir must be provided if not using dummy data" logger.info('Initalizing TCMFForecaster.') model = TCMFForecaster( vbsize=128, hbsize=256, num_channels_X=[32, 32, 32, 32, 32, 1], num_channels_Y=[32, 32, 32, 32, 32, 1], kernel_size=7, dropout=0.2, rank=64, kernel_size_Y=7, learning_rate=0.0005, normalize=False, use_time=True, svd=True, ) ymat = np.load(args.data_dir) if not args.use_dummy_data else get_dummy_data() horizon = 24 train_data = ymat[:, : -2 * horizon] target_data = ymat[:, -2 * horizon: -horizon] incr_target_data = ymat[:, -horizon:] logger.info('Start fitting.') model.fit({'y': train_data}, val_len=24,