示例#1
0
 def test_survival_outcome(self):
     """Smoke-test data generation for a survival outcome with a hidden node.

     No assertions are made: the test passes when building the CS3 instance
     and generating ``self.NUM_SAMPLES`` samples complete without raising.
     """
     # (row, column) entries switched on in the 5x5 boolean topology matrix.
     edges = [(3, 0), (4, 0), (3, 1), (3, 2), (4, 2), (4, 3)]
     adjacency = np.zeros((5, 5), dtype=bool)
     for row, col in edges:
         adjacency[row, col] = True

     # Only the treatment (node 3) is given category probabilities.
     category_probs = [None, None, None, [0.2, 0.8], None]

     simulator = CS3(
         topology=adjacency,
         var_types=[
             "covariate", "covariate", "hidden", "treatment", "outcome"
         ],
         prob_categories=category_probs,
         link_types=["linear"] * 5,
         snr=0.95,
         treatment_importances=0.5,
         outcome_types="survival",
         effect_sizes=self.no_X.effect_sizes,
         treatment_methods="logistic",
         survival_distribution="expon",
         survival_baseline=0.8)
     X, prop, cf = simulator.generate_data(num_samples=self.NUM_SAMPLES)
示例#2
0
 def test_multi_categorical_treatment(self):
     """Check four-category treatment assignment for two treatment methods.

     For each method the propensity rows must sum to 1, and the empirical
     proportions of column 2 must match the requested probabilities up to
     the per-method number of decimals.
     """
     t_probs = pd.Series([0.2, 0.2, 0.1, 0.5])
     prob_cat = [None, None, t_probs, None]
     # (treatment method, decimals used when comparing proportions)
     method_decimals = (("quantile_gauss_fit", 1), ("odds_ratio", 1))
     for treatment_method, decimal in method_decimals:
         fixture = self.no_X
         sim = CS3(topology=fixture.topology,
                   var_types=fixture.var_types,
                   prob_categories=prob_cat,
                   link_types=fixture.link_types,
                   treatment_importances=fixture.treatment_importance,
                   outcome_types=fixture.outcome_types,
                   snr=fixture.snr,
                   effect_sizes=fixture.effect_sizes,
                   treatment_methods=treatment_method)
         n = self.NUM_SAMPLES * 10
         X, prop, cf = sim.generate_data(num_samples=n)

         # Propensities over all treatment categories must sum to one per row.
         row_totals = prop.sum(axis="columns")
         sum_msg = (
             "multi-categorical preopensities of treatment method {method} "
             "does not sum to 1").format(method=treatment_method)
         np.testing.assert_array_almost_equal(row_totals,
                                              np.ones(n),
                                              err_msg=sum_msg)

         # Empirical category frequencies minus the requested ones.
         observed = X[2].value_counts(normalize=True)
         deviation = np.array(observed - t_probs)
         prop_msg = (
             "treatment method {method} does not produce proportions as "
             "required").format(method=treatment_method)
         np.testing.assert_array_almost_equal(deviation,
                                              np.zeros(4),
                                              decimal=decimal,
                                              err_msg=prop_msg)
示例#3
0
 def test_treatment_random(self):
     """Check the "random" treatment method with two treatments.

     The propensity column for category 1 of each treatment must equal the
     requested probability exactly, and the empirical treatment frequencies
     must match the requested ones to two decimals.
     """
     n_vars = 6
     adjacency = np.zeros((n_vars, n_vars), dtype=bool)
     for row, col in [(2, 0), (3, 0), (2, 1), (3, 1), (4, 2), (5, 3)]:
         adjacency[row, col] = True

     # Category probabilities for the two treatments (nodes 2 and 3).
     category_probs = [None, None, [0.5, 0.5], [0.2, 0.8], None, None]

     simulator = CS3(
         topology=adjacency,
         var_types=[
             "covariate", "covariate", "treatment", "treatment", "outcome",
             "outcome"
         ],
         prob_categories=category_probs,
         link_types=["linear"] * n_vars,
         snr=self.no_X.snr,
         treatment_importances=self.no_X.treatment_importance,
         outcome_types=["continuous", "continuous"],
         effect_sizes=self.no_X.effect_sizes,
         treatment_methods=["random", "random"])
     sample_count = self.NUM_SAMPLES * 10
     X, prop, cf = simulator.generate_data(num_samples=sample_count)

     # Propensities are expected to be constant across samples.
     np.testing.assert_array_equal(prop[2][1], [0.5] * sample_count)
     np.testing.assert_array_equal(prop[3][1], [0.8] * sample_count)

     frequencies = X[2].value_counts(normalize=True)
     np.testing.assert_almost_equal(frequencies, [0.5, 0.5], decimal=2)
     frequencies = X[3].value_counts(normalize=True)
     # Sort by category so the order matches the expected [0.2, 0.8].
     np.testing.assert_almost_equal(frequencies.sort_index(), [0.2, 0.8],
                                    decimal=2)
示例#4
0
    def test_linear_linking(self):
        """Check that linear linking yields a rank-2 generated data matrix.

        Three variables are generated, with the last one depending on the
        other two; the number of singular values above 1e-10 must be 2.
        """
        adjacency = np.zeros((3, 3), dtype=bool)
        adjacency[2, :2] = True  # node 2 depends on nodes 0 and 1

        simulator = CS3(topology=adjacency,
                        var_types=["covariate", "treatment", "outcome"],
                        prob_categories=[None, [0.5, 0.5], None],
                        link_types="linear",
                        treatment_importances=0.5,
                        outcome_types=self.no_X.outcome_types,
                        snr=1,
                        effect_sizes=self.no_X.effect_sizes)
        X, prop, cf = simulator.generate_data(num_samples=self.NUM_SAMPLES)

        # Numerical rank: count singular values above a small tolerance.
        tolerance = 1e-10
        spectrum = np.linalg.svd(X.values, compute_uv=False)
        rank = (spectrum > tolerance).sum()
        failure_msg = ("discovered rank of matrix is {emp} instead of {des}."
                       "so the linear linking does not work properly").format(
                           emp=rank, des=2)
        self.assertEqual(rank, 2, msg=failure_msg)
示例#5
0
 def test_multi_treatment_outcome(self):
     """Check output shapes with two treatments and two outcomes.

     Both the propensity table and the counterfactual table are expected to
     have ``self.NUM_SAMPLES`` rows and 4 columns.
     """
     n_vars = 6
     adjacency = np.zeros((n_vars, n_vars), dtype=bool)
     for row, col in [(2, 0), (3, 0), (2, 1), (3, 1), (4, 2), (5, 3)]:
         adjacency[row, col] = True

     # Both treatments reference the very same probability list object.
     even_split = [0.5, 0.5]
     category_probs = [None, None, even_split, even_split, None, None]

     simulator = CS3(
         topology=adjacency,
         var_types=[
             "covariate", "covariate", "treatment", "treatment", "outcome",
             "outcome"
         ],
         prob_categories=category_probs,
         link_types=["linear"] * n_vars,
         snr=self.no_X.snr,
         treatment_importances=self.no_X.treatment_importance,
         outcome_types=["continuous", "continuous"],
         effect_sizes=self.no_X.effect_sizes)
     X, prop, cf = simulator.generate_data(num_samples=self.NUM_SAMPLES)

     prop_msg = ("Generated propensity shape is {X} "
                 "but supposed to be {supp}").format(
                     X=prop.shape, supp=(self.NUM_SAMPLES, 4))
     self.assertEqual(prop.shape, (self.NUM_SAMPLES, 4), msg=prop_msg)

     cf_msg = ("Number of generated counterfactuals is {X} "
               "but supposed to be {supp}").format(
                   X=cf.shape, supp=(self.NUM_SAMPLES, 4))
     self.assertEqual(cf.shape, (self.NUM_SAMPLES, 4), msg=cf_msg)
示例#6
0
 def test_different_types_of_paramaters(self):
     """
     Tests to see what happens when supplying parameters of different types (lists, dicts, arrays, etc.)

     The constructor receives ``None``, a ``pd.Series``, a dict, a scalar and
     lists; the assertions then check the attributes stored on the instance:
     ``link_types`` is all "linear" with length 5, ``treatment_importances``
     is all 0.7, ``outcome_types`` and ``effect_sizes`` equal Series keyed by
     the outcome index 3, and ``snr`` is all 0.5 with length 5.
     """
     # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
     # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
     topology = np.zeros(
         (5, 5),
         dtype=bool)  # topology[i,j] if node j is a parent of node i
     topology[1, 0] = topology[2, 0] = topology[2, 1] = topology[
         3, 1] = topology[3, 2] = topology[3, 4] = True
     var_types = [
         "hidden", "covariate", "treatment", "outcome", "covariate"
     ]
     sim = CS3(
         topology=topology,
         var_types=var_types,
         # The fixture list is extended by one entry for the fifth variable.
         prob_categories=self.no_X.prob_cat + [None],
         link_types=None,
         treatment_importances=pd.Series(data=0.7, index=[2]),
         outcome_types={3: "continuous"},
         snr=0.5,
         # effect_sizes={2: 0.8},
         effect_sizes=[0.8],
         treatment_methods=["gaussian"])
     self.assertTrue(all(sim.link_types == "linear"))
     self.assertEqual(len(sim.link_types), 5)
     self.assertTrue(all(x == 0.7 for x in sim.treatment_importances))
     self.assertTrue(sim.outcome_types.equals(pd.Series({3: "continuous"})))
     self.assertTrue(all(sim.snr == 0.5))
     self.assertEqual(len(sim.snr), 5)
     self.assertTrue(sim.effect_sizes.equals(pd.Series({3: 0.8})))
示例#7
0
 def test_dependency_from_topology(self):
     """
     Checks that the topology matrix is converted into graph dependencies,
     once for the fixture without a given dataset and once for the one with.
     """
     for fixture in (self.no_X, self.with_X):
         simulator = CS3(topology=fixture.topology,
                         var_types=fixture.var_types,
                         prob_categories=fixture.prob_cat,
                         link_types=fixture.link_types,
                         treatment_importances=fixture.treatment_importance,
                         outcome_types=fixture.outcome_types,
                         snr=fixture.snr,
                         effect_sizes=fixture.effect_sizes)
         # The actual checks live in the shared helper on this test case.
         self.dependency_from_topology(simulator)
示例#8
0
 def test_treatment_logistic(self):
     """Smoke-test the "logistic" treatment method with two treatments.

     No assertions are made: the test passes when construction and data
     generation complete without raising.
     """
     n_vars = 6
     adjacency = np.zeros((n_vars, n_vars), dtype=bool)
     for row, col in [(2, 0), (3, 0), (2, 1), (3, 1), (4, 2), (5, 3)]:
         adjacency[row, col] = True

     # Both treatments reference the very same probability list object.
     even_split = [0.5, 0.5]
     category_probs = [None, None, even_split, even_split, None, None]

     simulator = CS3(
         topology=adjacency,
         var_types=[
             "covariate", "covariate", "treatment", "treatment", "outcome",
             "outcome"
         ],
         prob_categories=category_probs,
         link_types=["linear"] * n_vars,
         snr=self.no_X.snr,
         treatment_importances=self.no_X.treatment_importance,
         outcome_types=["continuous", "continuous"],
         effect_sizes=self.no_X.effect_sizes,
         treatment_methods=["logistic", "logistic"])
     X, prop, cf = simulator.generate_data(num_samples=self.NUM_SAMPLES)
示例#9
0
 def test_categorical_proportions(self):
     """Check that a categorical variable follows the requested proportions.

     Variable 1 is given three categories with probabilities
     (0.25, 0.25, 0.5); the empirical frequencies of column 1 in the
     generated data must match them to two decimals.
     """
     probs = np.array([0.25, 0.25, 0.5])
     # Work on a copy: assigning into ``self.no_X.prob_cat`` directly would
     # mutate the shared fixture and could leak into other tests.
     prob_cat = list(self.no_X.prob_cat)
     prob_cat[1] = probs
     sim = CS3(topology=self.no_X.topology,
               var_types=self.no_X.var_types,
               prob_categories=prob_cat,
               link_types=self.no_X.link_types,
               treatment_importances=self.no_X.treatment_importance,
               outcome_types=self.no_X.outcome_types,
               snr=self.no_X.snr,
               effect_sizes=self.no_X.effect_sizes)
     X, prop, cf = sim.generate_data(num_samples=self.NUM_SAMPLES * 10,
                                     random_seed=0)
     hist = X.loc[:, 1].value_counts(normalize=True)
     # As Series, the subtraction below aligns on the index labels rather
     # than on the frequency-sorted order returned by value_counts.
     probs = pd.Series(probs)
     np.testing.assert_array_almost_equal(
         probs - hist,
         pd.Series(data=0, index=probs.index),
         decimal=2,
         err_msg="Empirical distribution {emp} of categories "
         "is too far from desired distribution {des}".format(emp=hist,
                                                             des=probs))
示例#10
0
    def test_bad_input(self):
        """Check that invalid configurations raise ``ValueError``.

        The first group of cases must fail when the CS3 instance is
        constructed; the remaining cases construct successfully and must fail
        when generating data or a treatment column.
        """
        fixture = self.no_X
        # A valid keyword set; each case below overrides a subset of it.
        valid = dict(topology=fixture.topology,
                     var_types=fixture.var_types,
                     prob_categories=fixture.prob_cat,
                     link_types=fixture.link_types,
                     treatment_importances=fixture.treatment_importance,
                     snr=fixture.snr,
                     outcome_types=fixture.outcome_types,
                     effect_sizes=fixture.effect_sizes)

        def assert_rejected(**overrides):
            # Construction with the overridden arguments must raise.
            self.assertRaises(ValueError, CS3, **dict(valid, **overrides))

        # lengths: only three variable types are supplied.
        assert_rejected(var_types=["covariate", "treatment", "outcome"])

        # outcome has more than one treatment predecessor:
        assert_rejected(
            var_types=["covariate", "treatment", "treatment", "outcome"])

        # No valid link type:
        assert_rejected(link_types=["linear", "linear", "linear", "leniar"])

        # No valid treatment method:
        assert_rejected(treatment_methods="rndom")

        # lengths:
        assert_rejected(snr=[0, 1], treatment_methods="gaussian")
        assert_rejected(treatment_importances=[0.5, 0.5],
                        treatment_methods="gaussian")

        # no generation input:
        sim = CS3(**valid)
        self.assertRaises(ValueError, sim.generate_data)

        # categorical treatment:
        self.assertRaises(ValueError,
                          sim.generate_treatment_col,
                          X_parents=pd.DataFrame([None]),
                          link_type=None,
                          snr=1,
                          prob_category=None)

        # wrong probabilities: first a negative entry, then a pair that
        # sums to 1.1. Both are expected to fail at generation time.
        bad_probabilities = (
            [[0.5, -0.5, 1], None, [0.5, 0.5], None],
            [None, None, [0.5, 0.6], None],
        )
        for prob_cat in bad_probabilities:
            sim = CS3(**dict(valid, prob_categories=prob_cat))
            self.assertRaises(ValueError, sim.generate_data, num_samples=100)
示例#11
0
    def test_censoring(self):
        """Exercise censor variables under four configurations.

        1. Survival outcome with a dependent censor: the fraction of samples
           whose column 4 is <= column 3 must match the first censor
           probability to one decimal place.
        2. Continuous outcome with a binary censor: the fraction of censored
           samples must match the second censor probability, and the
           training-formatted data must have that many null ``y_4`` values.
        3. Continuous outcome with a censor independent of covariate 1: same
           fraction check as in (2).
        4. Survival outcome with the topology of (3): generation only, no
           assertions.
        """

        def build_topology(edges):
            # Fresh 5x5 boolean matrix with the given (row, col) cells set.
            matrix = np.zeros((5, 5), dtype=bool)
            for row, col in edges:
                matrix[row, col] = True
            return matrix

        # survival censor
        # make censor be dependent like the outcome
        dependent_edges = [(2, 0), (3, 0), (4, 0), (2, 1), (3, 1), (4, 1),
                           (3, 2), (4, 2), (4, 3)]
        prob_cat = [None, None, [0.2, 0.8], [0.85, 0.15], None]
        sim = CS3(
            topology=build_topology(dependent_edges),
            var_types=[
                "covariate", "covariate", "treatment", "censor", "outcome"
            ],
            prob_categories=prob_cat,
            link_types=["linear"] * 5,
            snr=0.95,
            treatment_importances=0.5,
            outcome_types="survival",
            effect_sizes=self.no_X.effect_sizes,
            treatment_methods="logistic",
            survival_distribution="expon",
            survival_baseline=0.8)
        num_samples = self.NUM_SAMPLES
        X, prop, cf = sim.generate_data(num_samples=num_samples,
                                        random_seed=783454)
        observed_fraction = np.abs(X[4].le(X[3]).sum() / num_samples)
        self.assertAlmostEqual(observed_fraction, prob_cat[3][0], places=1)
        # df_obs, df_cf = sim.format_for_training(X, prop, cf)

        # binary censor
        binary_edges = [(2, 0), (4, 0), (2, 1), (3, 1), (4, 2), (4, 3)]
        prob_cat = [None, None, [0.2, 0.8], [0.85, 0.15], None]
        sim = CS3(
            topology=build_topology(binary_edges),
            var_types=[
                "covariate", "covariate", "treatment", "censor", "outcome"
            ],
            prob_categories=prob_cat,
            link_types=["linear"] * 5,
            snr=0.95,
            treatment_importances=0.5,
            outcome_types="continuous",
            effect_sizes=self.no_X.effect_sizes,
            treatment_methods="logistic")
        num_samples = self.NUM_SAMPLES
        X, prop, cf = sim.generate_data(num_samples=num_samples)
        censored_fraction = X[3].astype(int).sum() / num_samples
        self.assertAlmostEqual(censored_fraction, prob_cat[3][1])
        df_obs, df_cf = sim.format_for_training(X, prop, cf)
        # Every censored sample should show up as a missing outcome value.
        self.assertEqual(X[3].astype(int).sum(), df_obs["y_4"].isnull().sum())

        # independent categorical censor
        independent_edges = [(3, 0), (4, 0), (3, 2), (4, 2), (4, 3)]
        prob_cat = [None, None, [0.2, 0.8], [0.85, 0.15], None]
        sim = CS3(
            topology=build_topology(independent_edges),
            var_types=[
                "covariate", "covariate", "treatment", "censor", "outcome"
            ],
            prob_categories=prob_cat,
            link_types=["linear"] * 5,
            snr=0.95,
            treatment_importances=0.5,
            outcome_types="continuous",
            effect_sizes=self.no_X.effect_sizes,
            treatment_methods="logistic",
            survival_distribution="expon",
            survival_baseline=0.8)
        num_samples = self.NUM_SAMPLES
        X, prop, cf = sim.generate_data(num_samples=num_samples)
        censored_fraction = X[3].astype(int).sum() / num_samples
        self.assertAlmostEqual(censored_fraction, prob_cat[3][1])
        # df_obs, df_cf = sim.format_for_training(X, prop, cf)

        # independent survival censor
        prob_cat = [None, None, [0.2, 0.8], [0.85, 0.15], None]
        sim = CS3(
            topology=build_topology(independent_edges),
            var_types=[
                "covariate", "covariate", "treatment", "censor", "outcome"
            ],
            prob_categories=prob_cat,
            link_types=["linear"] * 5,
            snr=0.95,
            treatment_importances=0.5,
            outcome_types="survival",
            effect_sizes=self.no_X.effect_sizes,
            treatment_methods="logistic",
            survival_distribution="expon",
            survival_baseline=0.8)
        num_samples = 10000
        X, prop, cf = sim.generate_data(num_samples=num_samples)
示例#12
0
    def test_dataset_size(self):
        """
        Checks the shapes of the generated dataset, the propensities, the
        counterfactuals and the training-formatted observations under three
        configurations: no hidden variables, hidden variables, and a given X.
        """
        base = self.no_X

        # No given X, all non-hidden
        sim = CS3(
            topology=base.topology,
            var_types=["covariate", "covariate", "treatment", "outcome"],
            prob_categories=base.prob_cat,
            link_types=base.link_types,
            treatment_importances=base.treatment_importance,
            outcome_types=base.outcome_types,
            snr=base.snr,
            effect_sizes=base.effect_sizes)
        X, prop, cf = sim.generate_data(num_samples=self.NUM_SAMPLES)

        want = (self.NUM_SAMPLES, 4)
        self.assertEqual(X.shape, want,
                         msg=("Generated dataset shape is {X} "
                              "but supposed to be {supp}").format(
                                  X=X.shape, supp=want))
        want = (self.NUM_SAMPLES, 2)
        self.assertEqual(prop.shape, want,
                         msg=("Generated propensity shape is {X} "
                              "but supposed to be {supp}").format(
                                  X=prop.shape, supp=want))
        self.assertEqual(cf.shape, want,
                         msg=("number of generated counterfactuals is {X} "
                              "but supposed to be {supp}").format(
                                  X=cf.shape, supp=want))
        df_obs, df_cf = sim.format_for_training(X, prop, cf)
        want = (self.NUM_SAMPLES, 4)
        self.assertEqual(df_obs.shape, want,
                         msg=("Generated dataset shape is {X} "
                              "but supposed to be {supp}").format(
                                  X=df_obs.shape, supp=want))

        # No given X, with hidden
        sim = CS3(topology=base.topology,
                  var_types=["hidden", "hidden", "treatment", "outcome"],
                  prob_categories=base.prob_cat,
                  link_types=base.link_types,
                  treatment_importances=base.treatment_importance,
                  outcome_types=base.outcome_types,
                  snr=base.snr,
                  effect_sizes=base.effect_sizes)
        X, prop, cf = sim.generate_data(num_samples=self.NUM_SAMPLES)
        df_obs, df_cf = sim.format_for_training(X, prop, cf)
        # Two columns are expected once both variables are marked hidden.
        want = (self.NUM_SAMPLES, 2)
        self.assertEqual(df_obs.shape, want,
                         msg=("Generated dataset shape is {X} "
                              "but supposed to be {supp}").format(
                                  X=df_obs.shape, supp=want))

        # Given X, with hidden vars
        given = self.with_X
        sim = CS3(topology=given.topology,
                  var_types=given.var_types,
                  prob_categories=given.prob_cat,
                  link_types=given.link_types,
                  treatment_importances=given.treatment_importance,
                  outcome_types=given.outcome_types,
                  snr=given.snr,
                  effect_sizes=given.effect_sizes)
        X, prop, cf = sim.generate_data(X_given=self.X_GIVEN)

        want = (self.X_NUM_SAMPLES, 9)
        self.assertEqual(X.shape, want,
                         msg=("Generated dataset shape is {X} "
                              "but supposed to be {supp}").format(
                                  X=X.shape, supp=want))
        want = (self.X_NUM_SAMPLES, 2)
        self.assertEqual(prop.shape, want,
                         msg=("Generated propensity shape is {X} "
                              "but supposed to be {supp}").format(
                                  X=prop.shape, supp=want))
        self.assertEqual(cf.shape, want,
                         msg=("Number of counterfactuals generated is {X} "
                              "but supposed to be {supp}").format(
                                  X=cf.shape, supp=want))
        df_obs, df_cf = sim.format_for_training(X, prop, cf)
        want = (self.X_NUM_SAMPLES, 7)
        self.assertEqual(df_obs.shape, want,
                         msg=("Generated dataset shape is {X} "
                              "but supposed to be {supp}").format(
                                  X=df_obs.shape, supp=want))