def test_ndarray_local(self):
    """Local-mode TCMF on an ndarray dict: fit, predict, save/load
    round trip, evaluate, and two rounds of fit_incremental."""
    train_dict = {'id': self.id, 'y': self.data}
    self.model.fit(train_dict)
    assert not self.model.is_xshards_distributed()

    # forecast with the freshly fitted model
    pred = self.model.predict(horizon=self.horizon)

    # round-trip the model through save/load
    with tempfile.TemporaryDirectory() as ckpt_dir:
        self.model.save(ckpt_dir)
        loaded_model = TCMFForecaster.load(ckpt_dir,
                                           is_xshards_distributed=False)
    pred_loaded = loaded_model.predict(horizon=self.horizon)
    np.testing.assert_equal(pred_loaded["id"], self.id)
    pred = pred["prediction"]
    pred_loaded = pred_loaded["prediction"]
    assert pred.shape == (self.num_samples, self.horizon)
    np.testing.assert_array_almost_equal(pred, pred_loaded, decimal=4)

    # evaluate returns a truthy metric result
    assert self.model.evaluate(target_value={"y": self.data_new},
                               metric=['mse'])

    # two incremental rounds must change the forecast
    for _ in range(2):
        self.model.fit_incremental({'y': self.data_new})
    pred_incr = self.model.predict(horizon=self.horizon)["prediction"]
    assert pred_incr.shape == (self.num_samples, self.horizon)
    np.testing.assert_raises(AssertionError,
                             np.testing.assert_array_equal,
                             pred, pred_incr)
def test_forecast_tcmf_without_id(self):
    """Local-mode TCMF fitted without an 'id' key: predictions must carry
    no id, and supplying an id to fit_incremental must raise."""
    self.model.fit({'y': self.data})
    assert not self.model.is_xshards_distributed()

    # save/load round trip
    with tempfile.TemporaryDirectory() as ckpt_dir:
        self.model.save(ckpt_dir)
        loaded_model = TCMFForecaster.load(ckpt_dir,
                                           is_xshards_distributed=False)
    pred = self.model.predict(horizon=self.horizon)
    pred_loaded = loaded_model.predict(horizon=self.horizon)
    assert "id" not in pred_loaded
    pred = pred["prediction"]
    pred_loaded = pred_loaded["prediction"]
    assert pred.shape == (self.num_samples, self.horizon)
    np.testing.assert_array_almost_equal(pred, pred_loaded, decimal=4)

    self.model.evaluate(target_value={"y": self.data_new}, metric=['mse'])

    # first incremental round alters the forecast
    self.model.fit_incremental({'y': self.data_new})
    pred_incr = self.model.predict(horizon=self.horizon)["prediction"]
    assert pred_incr.shape == (self.num_samples, self.horizon)
    np.testing.assert_raises(AssertionError,
                             np.testing.assert_array_equal,
                             pred, pred_incr)

    # an id supplied now, after an id-less fit, must be rejected
    with self.assertRaises(ValueError) as context:
        self.model.fit_incremental({'id': self.id, 'y': self.data_new})
    self.assertTrue(
        'Got valid id in fit_incremental and invalid id in fit.' in str(
            context.exception))
def test_forecast_tcmf_distributed(self):
    """Distributed TCMF fit/predict via ray-on-spark (num_workers=4),
    plus a save/load round trip, fit_incremental, and evaluate.

    NOTE(review): a test with this exact name appears again later in this
    file; if both definitions end up in the same class only the last one
    is collected — confirm and rename one of them.
    NOTE(review): stop_orca_context() is skipped if any assertion above it
    fails, which may leak the ray/spark context into later tests.
    """
    input = dict({'id': self.id, 'y': self.data})
    from zoo.orca import init_orca_context, stop_orca_context
    # small local cluster with ray on spark for the distributed fit
    init_orca_context(cores=4, spark_log_level="INFO",
                      init_ray_on_spark=True, object_store_memory="1g")
    self.model.fit(input, num_workers=4)
    # save, then reload as a local (non-xshards) model
    with tempfile.TemporaryDirectory() as tempdirname:
        self.model.save(tempdirname)
        loaded_model = TCMFForecaster.load(tempdirname,
                                           is_xshards_distributed=False)
    yhat = self.model.predict(horizon=self.horizon, num_workers=4)
    yhat_loaded = loaded_model.predict(horizon=self.horizon, num_workers=4)
    # the reloaded model must reproduce ids and predictions exactly
    yhat_id = yhat_loaded["id"]
    np.testing.assert_equal(yhat_id, self.id)
    yhat = yhat["prediction"]
    yhat_loaded = yhat_loaded["prediction"]
    assert yhat.shape == (self.num_samples, self.horizon)
    np.testing.assert_equal(yhat, yhat_loaded)
    # incremental fit must change the forecast
    self.model.fit_incremental({'y': self.data_new})
    yhat_incr = self.model.predict(horizon=self.horizon)
    yhat_incr = yhat_incr["prediction"]
    assert yhat_incr.shape == (self.num_samples, self.horizon)
    np.testing.assert_raises(AssertionError,
                             np.testing.assert_array_equal,
                             yhat, yhat_incr)
    target_value = dict({"y": self.data_new})
    assert self.model.evaluate(target_value=target_value, metric=['mse'])
    stop_orca_context()
def test_forecast_tcmf_distributed(self):
    """Distributed TCMF workflow on synthetic data: fit with 4 workers,
    save/load round trip, predict, and evaluate.

    NOTE(review): a test with this exact name also appears earlier in this
    file; if both definitions land in the same class only the last one is
    collected — confirm and rename one of them.
    """
    model = TCMFForecaster(y_iters=1, init_FX_epoch=1, max_FX_epoch=1,
                           max_TCN_epoch=1, alt_iters=2)
    horizon = np.random.randint(1, 50)
    # construct data: 300 series of length 480
    # (locals renamed from `id`/`input` to stop shadowing builtins)
    series_id = np.arange(300)
    data = np.random.rand(300, 480)
    train_input = {'id': series_id, 'y': data}
    from zoo.orca import init_orca_context, stop_orca_context
    init_orca_context(cores=4, spark_log_level="INFO",
                      init_ray_on_spark=True, object_store_memory="1g")
    try:
        model.fit(train_input, num_workers=4)
        # save, then reload as a local model
        with tempfile.TemporaryDirectory() as tempdirname:
            model.save(tempdirname)
            loaded_model = TCMFForecaster.load(tempdirname,
                                               distributed=False)
        yhat = model.predict(x=None, horizon=horizon, num_workers=4)
        yhat_loaded = loaded_model.predict(x=None, horizon=horizon,
                                           num_workers=4)
        # reloaded model must reproduce ids and predictions exactly
        yhat_id = yhat_loaded["id"]
        assert (yhat_id == series_id).all()
        yhat = yhat["prediction"]
        yhat_loaded = yhat_loaded["prediction"]
        assert yhat.shape == (300, horizon)
        np.testing.assert_equal(yhat, yhat_loaded)
        target_value = {"y": np.random.rand(300, horizon)}
        assert model.evaluate(x=None, target_value=target_value,
                              metric=['mse'])
    finally:
        # always tear down the orca context so a failure above cannot
        # leak the ray/spark context into subsequent tests
        stop_orca_context()
def test_tcmf_ndarray_covariates_dti(self):
    """Local-mode TCMF with extra covariates and a datetime index (dti)
    threaded through fit, predict, evaluate, and fit_incremental.

    Covariates are 3 channels wide here (rand(3, seq_len) for fit,
    randn(3, horizon) for the future window) — presumably the channel
    count must match between fit and predict; confirm against the
    TCMFForecaster API.
    """
    ndarray_input = {'id': self.id, 'y': self.data}
    self.model.fit(ndarray_input,
                   covariates=np.random.rand(3, self.seq_len),
                   dti=pd.date_range('20130101', periods=self.seq_len),
                   **self.fit_params)
    # future covariates/dti cover exactly the forecast horizon
    future_covariates = np.random.randn(3, self.horizon)
    future_dti = pd.date_range('20130101', periods=self.horizon)
    # test predict
    yhat = self.model.predict(horizon=self.horizon,
                              future_covariates=future_covariates,
                              future_dti=future_dti,
                              )
    # test save load
    with tempfile.TemporaryDirectory() as tempdirname:
        self.model.save(tempdirname)
        loaded_model = TCMFForecaster.load(tempdirname,
                                           is_xshards_distributed=False)
    yhat_loaded = loaded_model.predict(horizon=self.horizon,
                                       future_covariates=future_covariates,
                                       future_dti=future_dti,
                                       )
    # reloaded model must reproduce ids exactly and predictions closely
    yhat_id = yhat_loaded["id"]
    np.testing.assert_equal(yhat_id, self.id)
    yhat = yhat["prediction"]
    yhat_loaded = yhat_loaded["prediction"]
    assert yhat.shape == (self.num_samples, self.horizon)
    np.testing.assert_array_almost_equal(yhat, yhat_loaded, decimal=4)
    # test evaluate
    target_value = dict({"y": self.data_new})
    assert self.model.evaluate(target_value=target_value,
                               target_covariates=future_covariates,
                               target_dti=future_dti,
                               metric=['mse'])
    # test fit_incremental: new data plus its covariates/dti must
    # change the forecast
    self.model.fit_incremental({'y': self.data_new},
                               covariates_incr=future_covariates,
                               dti_incr=future_dti,)
    yhat_incr = self.model.predict(horizon=self.horizon,
                                   future_covariates=future_covariates,
                                   future_dti=future_dti,
                                   )
    yhat_incr = yhat_incr["prediction"]
    assert yhat_incr.shape == (self.num_samples, self.horizon)
    np.testing.assert_raises(AssertionError,
                             np.testing.assert_array_equal,
                             yhat, yhat_incr)
def test_forecast_tcmf_without_id(self):
    """Legacy-API TCMF without an 'id' key: input validation errors,
    fit-once semantics, save/load round trip, and evaluate.

    NOTE(review): a test with this exact name also appears earlier in this
    file; if both definitions land in the same class only the last one is
    collected — confirm and rename one of them.
    """
    model = TCMFForecaster(y_iters=1, init_FX_epoch=1, max_FX_epoch=1,
                           max_TCN_epoch=1, alt_iters=2)
    horizon = np.random.randint(1, 50)
    # construct data (locals renamed from `id`/`input` to stop
    # shadowing builtins)
    data = np.random.rand(300, 480)
    # a non-ndarray 'y' must be rejected
    bad_y_input = {'y': "abc"}
    with self.assertRaises(Exception) as context:
        model.fit(bad_y_input)
    self.assertTrue("the value of y should be an ndarray"
                    in str(context.exception))
    # deliberately mismatched: 200 ids for 300 series
    mismatched_id = np.arange(200)
    mismatched_input = {'id': mismatched_id, 'y': data}
    with self.assertRaises(Exception) as context:
        model.fit(mismatched_input)
    self.assertTrue("the length of the id array should be equal to the "
                    "number of" in str(context.exception))
    # valid id-less input fits locally
    train_input = {'y': data}
    model.fit(train_input)
    assert not model.is_distributed()
    # a second non-incremental fit is refused
    with self.assertRaises(Exception) as context:
        model.fit(train_input)
    self.assertTrue('This model has already been fully trained'
                    in str(context.exception))
    # save/load round trip
    with tempfile.TemporaryDirectory() as tempdirname:
        model.save(tempdirname)
        loaded_model = TCMFForecaster.load(tempdirname, distributed=False)
    yhat = model.predict(x=None, horizon=horizon)
    yhat_loaded = loaded_model.predict(x=None, horizon=horizon)
    # no id was fitted, so none comes back
    assert "id" not in yhat_loaded
    yhat = yhat["prediction"]
    yhat_loaded = yhat_loaded["prediction"]
    assert yhat.shape == (300, horizon)
    assert (yhat == yhat_loaded).all()
    # evaluate rejects a target dict without the 'y' key
    target_value = np.random.rand(300, horizon)
    target_value_fake = {"data": target_value}
    with self.assertRaises(Exception) as context:
        model.evaluate(x=None, target_value=target_value_fake,
                       metric=['mse'])
    self.assertTrue("key y doesn't exist in y" in str(context.exception))
    target_value = {"y": target_value}
    model.evaluate(x=None, target_value=target_value, metric=['mse'])
def test_forecast_tcmf(self):
    """Legacy-API local TCMF workflow: input validation, pre-fit state
    checks, fit-once semantics, save/load round trip, predict, evaluate."""
    model = TCMFForecaster(y_iters=1, init_FX_epoch=1, max_FX_epoch=1,
                           max_TCN_epoch=1, alt_iters=2)
    horizon = np.random.randint(1, 50)
    # construct data: 300 series of length 480 (locals renamed from
    # `id`/`input` to stop shadowing builtins)
    series_id = np.arange(300)
    data = np.random.rand(300, 480)
    # fit must reject input that lacks the mandatory 'y' key
    bad_input = {'data': data}
    with self.assertRaises(Exception) as context:
        model.fit(bad_input)
    self.assertTrue("key `y` doesn't exist in x" in str(context.exception))
    train_input = {'id': series_id, 'y': data}
    # is_distributed is undefined before fit
    with self.assertRaises(Exception) as context:
        model.is_distributed()
    self.assertTrue('You should run fit before calling is_distributed()'
                    in str(context.exception))
    model.fit(train_input)
    assert not model.is_distributed()
    # a second non-incremental fit is refused
    with self.assertRaises(Exception) as context:
        model.fit(train_input)
    self.assertTrue('This model has already been fully trained'
                    in str(context.exception))
    # incremental fit through fit() is not implemented in this API
    with self.assertRaises(Exception) as context:
        model.fit(train_input, incremental=True)
    self.assertTrue('NotImplementedError'
                    in context.exception.__class__.__name__)
    # save/load round trip
    with tempfile.TemporaryDirectory() as tempdirname:
        model.save(tempdirname)
        loaded_model = TCMFForecaster.load(tempdirname, distributed=False)
    yhat = model.predict(x=None, horizon=horizon)
    yhat_loaded = loaded_model.predict(x=None, horizon=horizon)
    # reloaded model must reproduce ids and predictions exactly
    yhat_id = yhat_loaded["id"]
    assert (yhat_id == series_id).all()
    yhat = yhat["prediction"]
    yhat_loaded = yhat_loaded["prediction"]
    assert yhat.shape == (300, horizon)
    assert (yhat == yhat_loaded).all()
    target_value = {"y": np.random.rand(300, horizon)}
    assert model.evaluate(x=None, target_value=target_value,
                          metric=['mse'])
max_TCN_epoch=1 if args.smoke else 300, alt_iters=2 if args.smoke else 10, ) ymat = np.load( args.data_dir) if not args.use_dummy_data else get_dummy_data() horizon = 24 train_data = ymat[:, :-horizon] target_data = ymat[:, -horizon:] logger.info('Start fitting.') model.fit({'y': train_data}, num_workers=args.num_workers) logger.info('Fitting ends.') # you can save and load model as you want with tempfile.TemporaryDirectory() as tempdirname: model.save(tempdirname) loaded_model = TCMFForecaster.load(tempdirname, distributed=False) if args.predict_local: logger.info( 'Stopping context for yarn cluster and init context on local.') stop_orca_context() import ray ray.init(num_cpus=args.num_predict_cores) logger.info('Start prediction.') yhat = model.predict(x=None, horizon=24, num_workers=args.num_predict_workers if args.predict_local else args.num_workers) logger.info("Prediction ends") yhat = yhat["prediction"]
freq="H", covariates=None, dti=None, period=24, y_iters=1 if args.smoke else 10, init_FX_epoch=1 if args.smoke else 100, max_FX_epoch=1 if args.smoke else 300, max_TCN_epoch=1 if args.smoke else 300, alt_iters=2 if args.smoke else 10, num_workers=args.num_workers) logger.info('Fitting ends.') # you can save and load model as you want with tempfile.TemporaryDirectory() as tempdirname: model.save(tempdirname) loaded_model = TCMFForecaster.load(tempdirname, is_xshards_distributed=False) if args.predict_local: logger.info( 'Stopping context for yarn cluster and init context on local.') stop_orca_context() import ray ray.init(num_cpus=args.num_predict_cores) logger.info('Start prediction.') yhat = model.predict(horizon=horizon, num_workers=args.num_predict_workers if args.predict_local else args.num_workers) logger.info("Prediction ends") yhat = yhat["prediction"] target_value = dict({"y": target_data})
def test_forecast_tcmf_xshards(self):
    """Distributed TCMF over orca XShards built from a CSV: validates
    shard-dict schema errors, pre-fit state, fit-once semantics,
    save/load, sharded predict, and DataFrame postprocessing."""
    from zoo.orca import OrcaContext
    import zoo.orca.data.pandas
    import pandas as pd
    # read the CSV with the pandas backend; restored to "spark" at the end
    OrcaContext.pandas_read_backend = "pandas"

    def preprocessing(df, id_name, y_name):
        """Turn a pandas shard into the {id_name: index, y_name: values}
        dict TCMF expects.  NOTE(review): local `id` shadows the builtin."""
        id = df.index
        data = df.to_numpy()
        result = dict({id_name: id, y_name: data})
        return result

    def postprocessing(pred_results, output_dt_col_name):
        """Reshape one shard's {'id', 'prediction'} dict into a long-form
        DataFrame with columns (id, datetime, prediction)."""
        id_arr = pred_results["id"]
        pred_results = pred_results["prediction"]
        # prepend the id column so every row carries its series id
        pred_results = np.concatenate(
            (np.expand_dims(id_arr, axis=1), pred_results), axis=1)
        final_df = pd.DataFrame(pred_results,
                                columns=["id"] + output_dt_col_name)
        final_df.id = final_df.id.astype("int")
        final_df = final_df.set_index("id")
        final_df.columns.name = "datetime"
        # wide -> long: one row per (id, datetime)
        final_df = final_df.unstack().reset_index().rename(
            {0: "prediction"}, axis=1)
        return final_df

    def get_pred(d):
        # extract just the prediction array from a shard result dict
        return d["prediction"]

    with tempfile.NamedTemporaryFile() as temp:
        # 300 series of length 480 written to a temp CSV, then read
        # back as XShards
        data = np.random.rand(300, 480)
        df = pd.DataFrame(data)
        df.to_csv(temp.name)
        shard = zoo.orca.data.pandas.read_csv(temp.name)
        shard.cache()
        # shard dict missing 'y' must be rejected
        shard_train = shard.transform_shard(preprocessing, 'id', 'data')
        with self.assertRaises(Exception) as context:
            self.model.fit(shard_train)
        self.assertTrue("key `y` doesn't exist in x"
                        in str(context.exception))
        # shard dict missing 'id' must be rejected
        shard_train = shard.transform_shard(preprocessing, 'cid', 'y')
        with self.assertRaises(Exception) as context:
            self.model.fit(shard_train)
        self.assertTrue(
            "key `id` doesn't exist in x" in str(context.exception))
        # is_xshards_distributed is undefined before fit
        with self.assertRaises(Exception) as context:
            self.model.is_xshards_distributed()
        self.assertTrue(
            'You should run fit before calling is_xshards_distributed()'
            in str(context.exception))
        # valid shard dict fits in xshards-distributed mode
        shard_train = shard.transform_shard(preprocessing, 'id', 'y')
        self.model.fit(shard_train)
        assert self.model.is_xshards_distributed()
        # a second non-incremental fit is refused
        with self.assertRaises(Exception) as context:
            self.model.fit(shard_train)
        self.assertTrue('This model has already been fully trained' in str(
            context.exception))
        # fit_incremental is not implemented for xshards input
        with self.assertRaises(Exception) as context:
            self.model.fit_incremental(shard_train)
        self.assertTrue(
            'NotImplementedError' in context.exception.__class__.__name__)
        # save/load round trip, keeping xshards mode
        with tempfile.TemporaryDirectory() as tempdirname:
            self.model.save(tempdirname + "/model")
            loaded_model = TCMFForecaster.load(tempdirname + "/model",
                                               is_xshards_distributed=True)
        horizon = np.random.randint(1, 50)
        # both models predict shard-wise; concatenated results must match
        yhat_shard_origin = self.model.predict(horizon=horizon)
        yhat_list_origin = yhat_shard_origin.collect()
        yhat_list_origin = list(map(get_pred, yhat_list_origin))
        yhat_shard = loaded_model.predict(horizon=horizon)
        yhat_list = yhat_shard.collect()
        yhat_list = list(map(get_pred, yhat_list))
        yhat_origin = np.concatenate(yhat_list_origin)
        yhat = np.concatenate(yhat_list)
        assert yhat.shape == (300, horizon)
        np.testing.assert_equal(yhat, yhat_origin)
        # long-form postprocessing: 300 series x horizon rows, 3 columns
        output_dt_col_name = pd.date_range(start='2020-05-01',
                                           periods=horizon,
                                           freq='H').to_list()
        yhat_df_shards = yhat_shard.transform_shard(postprocessing,
                                                    output_dt_col_name)
        final_df_list = yhat_df_shards.collect()
        final_df = pd.concat(final_df_list)
        final_df.sort_values("datetime", inplace=True)
        assert final_df.shape == (300 * horizon, 3)
    # restore the default backend for subsequent tests
    OrcaContext.pandas_read_backend = "spark"