def test_dataset_twins_load(train_ratio: float, downsample: Optional[int]) -> None:
    """Verify the shape of every array returned by ``ds.load("twins", ...)``.

    Args:
        train_ratio: Fraction of the dataset requested for training; forwarded
            to ``ds.load``.
        downsample: Optional cap forwarded to ``ds.load``. When truthy, both
            the expected train and test row counts are limited to this value.
    """
    # Twins dataset: 11400 patients, 30 features, 2 potential outcomes.
    n_patients, n_features, n_outcomes = 11400, 30, 2

    loaded = ds.load("twins", train_ratio, downsample=downsample)
    Train_X, Train_T, Train_Y, Opt_Train_Y, Test_X, Test_Y = loaded

    expected_train = int(n_patients * train_ratio)
    expected_test = n_patients - expected_train
    if downsample:
        expected_train = min(downsample, expected_train)
        expected_test = min(downsample, expected_test)

    # Checked in this order so the first failing assertion names the same
    # array it always did.
    expectations = [
        (Train_X, (expected_train, n_features)),
        (Train_T, (expected_train, )),
        (Train_Y, (expected_train, )),
        (Opt_Train_Y, (expected_train, n_outcomes)),
        (Test_X, (expected_test, n_features)),
        (Test_Y, (expected_test, n_outcomes)),
    ]
    for array, expected_shape in expectations:
        assert array.shape == expected_shape
def test_unified_api_cmgp() -> None:
    """Smoke-test the unified ``Model`` wrapper with the ``"CMGP"`` algorithm.

    Loads the Twins dataset downsampled to 1000, trains, predicts on the test
    features and checks that the prediction has one row per test sample and
    two columns.
    """
    splits = ds.load("twins", 0.8, downsample=1000)
    Train_X, _, _, _, Test_X, Test_Y = splits

    n_features = len(Train_X[0])
    n_outcomes = Test_Y.shape[1]

    model = Model(
        "CMGP",
        dim=n_features,
        dim_outcome=n_outcomes,
        max_gp_iterations=50,
    )
    assert model is not None

    train_report = model.train(*splits)
    train_report.print()

    predicted = model.predict(Test_X)
    expected_shape = (Test_X.shape[0], 2)
    assert predicted.shape == expected_shape

    test_report = model.test(Test_X, Test_Y)
    test_report.print()
def search(algorithm: str, iterations: int = 2000) -> List[Any]:
    """Search GANITE hyperparameters on the Twins dataset with ``gp_minimize``.

    Each candidate configuration is used to build a fresh model, which is
    trained on the loaded dataset and scored by ``sqrt_PEHE()`` on the test
    split; the optimiser minimises that score.

    Args:
        algorithm: Either ``"GANITE"`` or ``"GANITE_TORCH"``.
        iterations: Passed to the model constructor as ``num_iterations``.

    Returns:
        ``result.x`` from ``gp_minimize``: the best values found, one per
        dimension of the search space, in declaration order.

    Raises:
        AssertionError: If ``algorithm`` is not one of the supported names.
        ValueError: Same condition, when assertions are disabled (``-O``).
    """
    assert algorithm in ["GANITE", "GANITE_TORCH"]

    # Resolve the implementation once instead of on every objective call.
    model_class: Any
    if algorithm == "GANITE":
        model_class = Ganite
    elif algorithm == "GANITE_TORCH":
        model_class = GaniteTorch
    else:
        # Only reachable with assertions stripped. The message must use
        # `algorithm`: `model_class` is still unbound on this branch, so
        # formatting it would raise UnboundLocalError instead.
        raise ValueError(f"model not supported {algorithm}")

    # load dataset
    dataset = ds.load("twins", 0.8)
    [Train_X, Train_T, Train_Y, Opt_Train_Y, Test_X, Test_Y] = dataset

    dim = len(Train_X[0])
    dim_outcome = Test_Y.shape[1]

    # define the space of hyperparameters to search
    search_space = [
        Integer(3, 10, name="num_discr_iterations"),
        Categorical([32, 64, 128, 256], name="minibatch_size"),
        # Hidden width candidates: dim, dim/2, ..., dim/5 (truncated to int).
        Categorical(
            [dim, int(dim / 2), int(dim / 3), int(dim / 4), int(dim / 5)],
            name="dim_hidden",
        ),
        Categorical([0, 0.1, 0.5, 1, 2, 5, 10], name="alpha"),
        Categorical([0, 0.1, 0.5, 1, 2, 5, 10], name="beta"),
        Integer(1, 9, name="depth"),
    ]

    # define the function used to evaluate a given configuration
    @use_named_args(search_space)
    def evaluate_model(**params: Any) -> float:
        # configure the model with specific hyperparameters
        model = model_class(
            dim,
            dim_outcome,
            num_iterations=iterations,
            **params,
        )
        model.train(*dataset)
        test_metrics = model.test(Test_X, Test_Y)
        return test_metrics.sqrt_PEHE()

    # perform optimization
    result = gp_minimize(evaluate_model, search_space)
    return result.x
def test_ganite_torch_short_training(
    plt: Any,
    iterations: int,
    num_discr_iterations: int,
    alpha: float,
    beta: float,
    batch_size: int,
    depth: int,
    dim_hidden: int,
) -> None:
    """Train ``alg.GaniteTorch`` on Twins and check sqrt-PEHE is in (0.2, 0.4).

    Args:
        plt: Plotting handle forwarded to ``metrics.plot``.
        iterations: Passed to the model as ``num_iterations``.
        num_discr_iterations: Passed through to the model unchanged.
        alpha: Passed through to the model unchanged.
        beta: Passed through to the model unchanged.
        batch_size: Passed to the model as ``minibatch_size``.
        depth: Passed through to the model unchanged.
        dim_hidden: Hidden size; ``0`` means "use the number of features".
    """
    splits = ds.load("twins", 0.8)
    Train_X, _, _, _, Test_X, Test_Y = splits

    n_features = len(Train_X[0])
    if dim_hidden == 0:
        dim_hidden = n_features
    n_outcomes = Test_Y.shape[1]

    model = alg.GaniteTorch(
        n_features,
        n_outcomes,
        dim_hidden=dim_hidden,
        num_iterations=iterations,
        alpha=alpha,
        beta=beta,
        minibatch_size=batch_size,
        depth=depth,
        num_discr_iterations=num_discr_iterations,
    )
    assert model is not None

    train_report = model.train(*splits)
    train_report.print()

    # Plotting is best-effort: a failure is reported but never fails the test.
    # NOTE(review): BaseException also swallows KeyboardInterrupt - kept as-is.
    try:
        train_report.plot(plt, thresholds=[0.2, 0.25, 0.3, 0.35])
    except BaseException as err:
        print("failed to plot(maybe rerun with --plots):", err)

    predicted = model.predict(Test_X)
    expected_shape = (Test_X.shape[0], 2)
    assert predicted.shape == expected_shape

    test_report = model.test(Test_X, Test_Y)
    test_report.print()

    print("Top 5 worst errors ", Test_X[test_report.worst_mistakes()])

    # Lower bound first, then upper bound; the metric is queried per check.
    assert 0.2 < test_report.sqrt_PEHE()
    assert test_report.sqrt_PEHE() < 0.4
def test_cmgp_short_training(plt: Any, ) -> None:
    """Train ``alg.CMGP`` four times on Twins and bound sqrt-PEHE each time.

    The evaluation split (``Test_X`` / ``Test_Y``) comes from the first load;
    every experiment reloads the downsampled dataset, retrains the same model
    instance and requires sqrt-PEHE to lie in (0.2, 0.4).

    NOTE(review): the source this was restored from had no indentation. The
    loop is assumed to end after the sqrt-PEHE check, with print/plot running
    once afterwards - confirm.

    Args:
        plt: Plotting handle forwarded to ``metrics.plot``.
    """
    train_ratio = 0.8
    load_options = {"downsample": 1000}

    _first = ds.load("twins", train_ratio, **load_options)
    Train_X, _, _, _, Test_X, Test_Y = _first

    model = alg.CMGP(
        dim=len(Train_X[0]),
        dim_outcome=Test_Y.shape[1],
        max_gp_iterations=50,
    )
    assert model is not None

    for _ in range(4):
        splits = ds.load("twins", train_ratio, **load_options)
        metrics = model.train(*splits)

        test_report = model.test(Test_X, Test_Y)
        # Lower bound first, then upper bound; the metric is queried per check.
        assert 0.2 < test_report.sqrt_PEHE()
        assert test_report.sqrt_PEHE() < 0.4

    metrics.print()

    # Plotting is best-effort: a failure is reported but never fails the test.
    try:
        metrics.plot(plt, with_ci=True, thresholds=[0.2, 0.25, 0.3, 0.35])
    except BaseException as err:
        print("failed to plot(maybe rerun with --plots):", err)
def test_unified_api_ganite(ganite_ver: str) -> None:
    """Check that ``Model`` forwards GANITE hyperparameters to ``model.core``.

    Builds the unified model for ``ganite_ver``, asserts the core object
    exposes the values it was constructed with, then runs a short
    train / predict / test cycle on the Twins dataset.

    Args:
        ganite_ver: Algorithm name passed as the first argument of ``Model``.
    """
    splits = ds.load("twins", 0.8)
    Train_X, _, _, _, Test_X, Test_Y = splits

    n_features = len(Train_X[0])
    n_outcomes = Test_Y.shape[1]

    # Declared once so construction and verification cannot drift apart.
    hyperparams = {
        "num_iterations": 10,
        "alpha": 2,
        "beta": 2,
        "minibatch_size": 128,
        "depth": 2,
        "num_discr_iterations": 4,
    }

    model = Model(
        ganite_ver,
        n_features,
        n_outcomes,
        dim_hidden=n_features,
        **hyperparams,
    )

    verification_order = (
        "minibatch_size",
        "alpha",
        "beta",
        "depth",
        "num_iterations",
        "num_discr_iterations",
    )
    for attribute in verification_order:
        assert getattr(model.core, attribute) == hyperparams[attribute]

    train_report = model.train(*splits)
    train_report.print()

    predicted = model.predict(Test_X)
    expected_shape = (Test_X.shape[0], 2)
    assert predicted.shape == expected_shape

    test_report = model.test(Test_X, Test_Y)
    test_report.print()
# Import depends import ite.algs.ganite_torch.model as alg import ite.datasets as ds import ite.utils.numpy as utils # Double check that we are using the correct interpreter. print(sys.executable) # ## Load the Dataset # # Next, we load the Twins dataset, process the data, and sample a training set and a test set. # train_ratio = 0.8 dataset = ds.load("twins", train_ratio) [Train_X, Train_T, Train_Y, Opt_Train_Y, Test_X, Test_Y] = dataset pd.DataFrame(data=Train_X[:5]) # ## Load the model # # Next, we define the model. # # # The constructor supports the following parameters: # - `dim`: The number of features in X. # - `dim_outcome`: The number of potential outcomes. # - `dim_hidden`: hyperparameter for tuning the size of the hidden layer. # - `depth`: hyperparameter for the number of hidden layers in the generator and inference blocks. # - `num_iterations`: hyperparameter for the number of training epochs.
def test_sanity() -> None:
    """Expect ``ds.load`` to raise when asked for the dataset name ``"test"``."""
    requested_name = "test"
    with pytest.raises(BaseException):
        ds.load(requested_name)
import ite.datasets as ds # Double check that we are using the correct interpreter. print(sys.executable) # ## Load the Dataset # # The example is done using the Twins dataset. # # Next, we load the dataset, process the data, and sample a training set and a test set. # # For CGMP, we have to downsample to 1000 training items. For the rest, we load without downsampling. train_ratio = 0.8 full_dataloader = ds.load("twins", train_ratio) cmgp_dataloader = ds.load("twins", train_ratio, downsample=1000) # ## Load and train GANITE(Tensorflow version) # # The constructor requires the name of the chosen algorithm for the first parameter - `GANITE`. # # The constructor supports the same parameters as the "native" version: # - `dim`: The number of features in X. # - `dim_outcome`: The number of potential outcomes. # - `dim_hidden`: hyperparameter for tuning the size of the hidden layer. # - `depth`: hyperparameter for the number of hidden layers in the generator and inference blocks. # - `num_iterations`: hyperparameter for the number of training epochs. # - `alpha`: hyperparameter used for the Generator block loss. # - `beta`: hyperparameter used for the ITE block loss.
# Double check that we are using the correct interpreter. print(sys.executable) # Disable TF logging os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # ## Load the Dataset # # The example is done using the Twins dataset. # # Next, we load the dataset, process the data, and sample a training set and a test set. train_ratio = 0.8 dataloader = ds.load("twins", train_ratio) [Train_X, Train_T, Train_Y, Opt_Train_Y, Test_X, Test_Y] = dataloader pd.DataFrame(data=Train_X[:5]) # ## Load the model # # Next, we define the model. # # # The constructor supports the following parameters: # - `dim`: The number of features in X. # - `dim_outcome`: The number of potential outcomes. # - `dim_hidden`: hyperparameter for tuning the size of the hidden layer.
import ite.algs.causal_multitask_gaussian_processes.model as alg
import ite.datasets as ds
import ite.utils.numpy as utils

# Double check that we are using the correct interpreter.
print(sys.executable)

# ## Load the Dataset
#
# The example is done using the Twins dataset.
#
# __Important__: For CMGP, we have to downsample the dataset to 1000 training items.

# Train ratio handed to the loader (0.8 here).
train_ratio = 0.8

# `ds.load` returns six arrays; `dataset` keeps them together, while the
# unpacked names give direct access to each one.
dataset = ds.load("twins", train_ratio, downsample=1000)
[Train_X, Train_T, Train_Y, Opt_Train_Y, Test_X, Test_Y] = dataset

# ## Load the model
#
# Next, we define the model.
#
#
# The constructor supports the following parameters:
#  - `dim`: The number of features in X.
#  - `dim_outcome`: The number of potential outcomes.
#  - `max_gp_iterations`: Maximum number of GP iterations before stopping the training.

# Feature count is read from the first training row; outcome count from the
# second axis of the test outcomes.
dim = len(Train_X[0])
dim_outcome = Test_Y.shape[1]