Example #1
    def refute_estimate(self):

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info(
            "Refutation over {} simulated datasets of size {} each".format(
                self._num_simulations,
                self._subset_fraction * len(self._data.index)))

        for index in range(self._num_simulations):
            if self._random_state is None:
                new_data = self._data.sample(frac=self._subset_fraction)
            else:
                new_data = self._data.sample(frac=self._subset_fraction,
                                             random_state=self._random_state)

            new_estimator = self.get_estimator_object(new_data,
                                                      self._target_estimand,
                                                      self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Use a subset of data")

        # We want to see if the estimate falls in the same distribution as the one generated by the refuter
        # Ideally that should be the case as choosing a subset should not have a significant effect on the ability
        # of the treatment to affect the outcome
        refute.add_significance_test_results(
            self.test_significance(self._estimate, sample_estimates))

        return refute
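These refuters are usually invoked through DoWhy's high-level API rather than directly. A minimal usage sketch for the subset refuter above, assuming the standard dowhy.CausalModel workflow (the names model, identified_estimand, and estimate are placeholders for the objects produced by the usual identify and estimate steps):

    # Sketch only: keyword names follow DoWhy's documented refute_estimate interface.
    refutation = model.refute_estimate(
        identified_estimand,
        estimate,
        method_name="data_subset_refuter",  # selects the refuter shown above
        subset_fraction=0.9,                # fraction of rows sampled per simulation
        num_simulations=100,
    )
    print(refutation)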
Example #2
    def refute_estimate(self, *args, **kwargs):
        if self._sample_size > len(self._data):
            self.logger.warning("The sample size is larger than the population size")

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info(
            "Refutation over {} simulated datasets of size {} each".format(
                self._num_simulations, self._sample_size))
        
        for index in range(self._num_simulations):
            if self._random_state is None:
                new_data = resample(self._data,
                                    n_samples=self._sample_size)
            else:
                new_data = resample(self._data,
                                    n_samples=self._sample_size,
                                    random_state=self._random_state)

            if self._chosen_variables is not None:
                for variable in self._chosen_variables:

                    if 'float' in new_data[variable].dtype.name or \
                            'int' in new_data[variable].dtype.name:
                        scaling_factor = new_data[variable].std()
                        new_data[variable] += np.random.normal(
                            loc=0.0,
                            scale=self._noise * scaling_factor,
                            size=self._sample_size)

                    elif 'bool' in new_data[variable].dtype.name:
                        probs = np.random.uniform(0, 1, self._sample_size)
                        new_data[variable] = np.where(probs < self._probability_of_change,
                                                      np.logical_not(new_data[variable]),
                                                      new_data[variable])

                    elif 'category' in new_data[variable].dtype.name:
                        probs = np.random.uniform(0, 1, self._sample_size)
                        categories = new_data[variable].unique()
                        # Find the set difference for each row
                        changed_data = new_data[variable].apply(
                            lambda row: list(set(categories) - set([row])))
                        # Choose one of the remaining categories at random
                        changed_data = changed_data.apply(
                            lambda row: random.choice(row))
                        new_data[variable] = np.where(probs < self._probability_of_change,
                                                      changed_data,
                                                      new_data[variable])
                        new_data[variable] = new_data[variable].astype('category')

            new_estimator = CausalEstimator.get_estimator_object(new_data, self._target_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Bootstrap Sample Dataset"
        )

        # We want to see if the estimate falls in the same distribution as the one generated by the refuter
        # Ideally that should be the case as running bootstrap should not have a significant effect on the ability
        # of the treatment to affect the outcome
        refute.add_significance_test_results(
            self.test_significance(self._estimate, sample_estimates)
        )

        return refute
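The per-variable perturbation above can be exercised in isolation. Here is a self-contained sketch of the same idea, with hypothetical column names x and flag: bootstrap-resample the rows, add Gaussian noise scaled by the column's standard deviation to numeric columns, and flip a small fraction of boolean columns:

    import numpy as np
    import pandas as pd
    from sklearn.utils import resample

    rng = np.random.default_rng(0)
    df = pd.DataFrame({"x": rng.normal(size=1000), "flag": rng.random(1000) < 0.5})

    sample = resample(df, n_samples=500, random_state=0)                  # bootstrap sample (with replacement)
    sample["x"] += rng.normal(0.0, 0.1 * sample["x"].std(), len(sample))  # noise scaled by the column std
    flip = rng.random(len(sample)) < 0.05                                 # flip roughly 5% of the booleans
    sample["flag"] = np.where(flip, ~sample["flag"], sample["flag"])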
Example #3
    def refute_estimate(self):

        # We need to change the identified estimand
        # We thus, make a copy. This is done as we don't want
        # to change the original DataFrame
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.outcome_variable = ["dummy_outcome"]

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info("Refutation over {} simulated datasets of {} treatment"
                        .format(self._num_simulations
                        ,self._dummy_outcome_type)
                        )
        num_rows =  self._data.shape[0]

        for index in range(self._num_simulations):

            if self._dummy_outcome_type == "permute":
                if self._random_state is None:
                    new_outcome = self._data[self._outcome_name].sample(frac=1).values
                else:
                    new_outcome = self._data[self._outcome_name].sample(frac=1,
                                                                random_state=self._random_state).values
            else:
                new_outcome = np.random.randn(num_rows)

            # Create a new column in the data by the name of dummy_outcome
            new_data = self._data.assign(dummy_outcome=new_outcome)

            # Sanity check the data
            self.logger.debug(new_data[0:10])

            new_estimator = self.get_estimator_object(new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(self._estimate.value,
                                  np.mean(sample_estimates),
                                  refutation_type="Refute: Use a Dummy Outcome")
        
        # Note: We hardcode the estimate value to ZERO as we want to check if it falls in the distribution of the refuter
        # Ideally we should expect that ZERO should fall in the distribution of the effect estimates as we have severed any causal 
        # relationship between the treatment and the outcome.

        dummy_estimator = copy.deepcopy(self._estimate)
        dummy_estimator.value = 0

        refute.add_significance_test_results(
            self.test_significance(dummy_estimator, sample_estimates)
        )

        return refute
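The core of the "permute" branch is just a row shuffle of the outcome column. A self-contained sketch, with a hypothetical outcome column y:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"y": np.arange(5, dtype=float)})
    permuted = df["y"].sample(frac=1, random_state=0).values  # shuffled outcome values
    df_with_dummy = df.assign(dummy_outcome=permuted)         # new column; original outcome untouched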
Example #4
    def refute_estimate(self):
        num_rows = self._data.shape[0]
        new_data = self._data.assign(w_random=np.random.randn(num_rows))
        new_backdoor_variables = self._target_estimand.get_backdoor_variables() + ['w_random']
        identified_estimand = copy.deepcopy(self._target_estimand)
        # Adding a new backdoor variable to the identified estimand
        identified_estimand.set_backdoor_variables(new_backdoor_variables)

        new_estimator = CausalEstimator.get_estimator_object(new_data, identified_estimand, self._estimate)
        new_effect = new_estimator.estimate_effect()
        refute = CausalRefutation(self._estimate.value, new_effect.value,
                                  refutation_type="Refute: Add a Random Common Cause")
        refute.add_refuter(self)
        return refute
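Invoked through DoWhy's high-level API, this refuter looks roughly as follows (a sketch assuming the standard CausalModel workflow from the earlier examples):

    refutation = model.refute_estimate(
        identified_estimand,
        estimate,
        method_name="random_common_cause",  # adds a w_random column as in the snippet above
    )
    print(refutation)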
Example #5
    def refute_estimate(self):
        num_rows = self._data.shape[0]
        if self._placebo_type == "permute":
            new_treatment = self._data[self._treatment_name].sample(
                frac=1).values
        else:
            new_treatment = np.random.randn(num_rows)
        new_data = self._data.assign(placebo=new_treatment)

        self.logger.debug(new_data[0:10])
        estimator_class = self._estimate.params['estimator_class']
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.treatment_variable = ["placebo"]

        new_estimator = estimator_class(new_data,
                                        identified_estimand,
                                        "placebo",
                                        self._outcome_name,
                                        test_significance=None)
        new_effect = new_estimator.estimate_effect()
        refute = CausalRefutation(
            self._estimate.value,
            new_effect.value,
            refutation_type="Refute: Use a Placebo Treatment")
        return refute
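Example #6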
    def refute_estimate(self):

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info(
            "Refutation over {} simulated datasets of size {} each".format(
                self._num_simulations,
                self._subset_fraction * len(self._data.index)))

        for index in range(self._num_simulations):
            if self._random_state is None:
                new_data = self._data.sample(frac=self._subset_fraction)
            else:
                new_data = self._data.sample(frac=self._subset_fraction,
                                             random_state=self._random_state)

            new_estimator = self.get_estimator_object(new_data,
                                                      self._target_estimand,
                                                      self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Use a subset of data")
        return refute
Example #7
    def refute_estimate(self):
        num_rows = self._data.shape[0]
        new_data = self._data.assign(w_random=np.random.randn(num_rows))
        self.logger.debug(new_data[0:10])
        new_backdoor_variables = self._target_estimand.backdoor_variables + [
            'w_random'
        ]
        estimator_class = self._estimate.params['estimator_class']
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.backdoor_variables = new_backdoor_variables
        new_estimator = estimator_class(new_data,
                                        identified_estimand,
                                        self._treatment_name,
                                        self._outcome_name,
                                        test_significance=None)
        new_effect = new_estimator.estimate_effect()
        refute = CausalRefutation(
            self._estimate.value,
            new_effect.value,
            refutation_type="Refute: Add a Random Common Cause")
        return refute
Example #8
    def refute_estimate(self, *args, **kwargs):
        if self._sample_size > len(self._data):
            self.logger.warning(
                "The sample size is larger than the population size")

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info(
            "Refutation over {} simulated datasets of size {} each".format(
                self._num_simulations, self._sample_size))

        for index in range(self._num_simulations):
            if self._random_state is None:
                new_data = resample(self._data, n_samples=self._sample_size)
            else:
                new_data = resample(self._data,
                                    n_samples=self._sample_size,
                                    random_state=self._random_state)

            new_estimator = self.get_estimator_object(new_data,
                                                      self._target_estimand,
                                                      self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Bootstrap Sample Dataset")

        return refute
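A usage sketch for the bootstrap refuter via the same high-level entry point (keyword names follow DoWhy's refute_estimate interface; defaults may vary by version):

    refutation = model.refute_estimate(
        identified_estimand,
        estimate,
        method_name="bootstrap_refuter",  # resamples rows with replacement, as above
        num_simulations=100,
    )
    print(refutation)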
Example #9
    def refute_estimate(self):
        new_data = self._data.sample(frac=self._subset_fraction)

        new_estimator = self.get_estimator_object(new_data, self._target_estimand, self._estimate)
        new_effect = new_estimator.estimate_effect()

        refute = CausalRefutation(
            self._estimate.value,
            new_effect.value,
            refutation_type="Refute: Use a subset of data"
        )
        return refute
Example #10
    def refute_estimate(self):
        new_data = copy.deepcopy(self._data)
        new_data = self.include_confounders_effect(new_data)

        new_estimator = self.get_estimator_object(new_data,
                                                  self._target_estimand,
                                                  self._estimate)
        new_effect = new_estimator.estimate_effect()
        refute = CausalRefutation(
            self._estimate.value,
            new_effect.value,
            refutation_type="Refute: Add an Unobserved Common Cause")
        return refute
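Through the high-level API, the strength of the simulated confounder is controlled by effect-strength parameters (a sketch; effect_strength_on_treatment and effect_strength_on_outcome correspond to the kappa_t and kappa_y values used internally in the later examples):

    refutation = model.refute_estimate(
        identified_estimand,
        estimate,
        method_name="add_unobserved_common_cause",
        confounders_effect_on_treatment="binary_flip",  # how the confounder acts on the treatment
        confounders_effect_on_outcome="linear",         # how the confounder acts on the outcome
        effect_strength_on_treatment=0.05,              # kappa_t
        effect_strength_on_outcome=0.02,                # kappa_y
    )
    print(refutation)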
Example #11
    def refute_estimate(self, *args, **kwargs):
        if self._sample_size > len(self._data):
            self.logger.warning("The sample size is larger than the population size")

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info(
            "Refutation over {} simulated datasets of size {} each".format(
                self._num_simulations, self._sample_size))
        
        for index in range(self._num_simulations):
            if self._random_state is None:
                new_data = resample(self._data,
                                    n_samples=self._sample_size)
            else:
                new_data = resample(self._data,
                                    n_samples=self._sample_size,
                                    random_state=self._random_state)

            new_estimator = self.get_estimator_object(new_data, self._target_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Bootstrap Sample Dataset"
        )

        # We want to see if the estimate falls in the same distribution as the one generated by the refuter
        # Ideally that should be the case as bootstrapping should not have a significant effect on the ability
        # of the treatment to affect the outcome
        refute.add_significance_test_results(
            self.test_significance(self._estimate, sample_estimates)
        )

        return refute
Example #12
    def refute_estimate(self):
        new_data = copy.deepcopy(self._data)
        new_data = self.include_confounders_effect(new_data)

        estimator_class = self._estimate.params['estimator_class']
        new_estimator = estimator_class(new_data,
                                        self._target_estimand,
                                        self._treatment_name,
                                        self._outcome_name,
                                        test_significance=None)
        new_effect = new_estimator.estimate_effect()
        refute = CausalRefutation(
            self._estimate.value,
            new_effect.value,
            refutation_type="Refute: Add an Unobserved Common Cause")
        return refute
Example #13
    def refute_estimate(self):
        new_data = self._data.sample(frac=self._subset_fraction)

        estimator_class = self._estimate.params['estimator_class']
        identified_estimand = self._target_estimand
        new_estimator = estimator_class(new_data,
                                        identified_estimand,
                                        self._treatment_name,
                                        self._outcome_name,
                                        test_significance=None)
        new_effect = new_estimator.estimate_effect()
        refute = CausalRefutation(
            self._estimate.value,
            new_effect.value,
            refutation_type="Refute: Use a subset of data")
        return refute
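Example #14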
    def refute_estimate(self):

        # We need to change the identified estimand
        # This is done as a safety measure, we don't want to change the
        # original DataFrame
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.treatment_variable = ["placebo"]

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info(
            "Refutation over {} simulated datasets of {} treatment".format(
                self._num_simulations, self._placebo_type))

        num_rows = self._data.shape[0]

        for index in range(self._num_simulations):

            if self._placebo_type == "permute":
                if self._random_state is None:
                    new_treatment = self._data[self._treatment_name].sample(
                        frac=1).values
                else:
                    new_treatment = self._data[self._treatment_name].sample(
                        frac=1, random_state=self._random_state).values
            else:
                new_treatment = np.random.randn(num_rows)

            # Create a new column in the data by the name of placebo
            new_data = self._data.assign(placebo=new_treatment)

            # Sanity check the data
            self.logger.debug(new_data[0:10])

            new_estimator = self.get_estimator_object(new_data,
                                                      identified_estimand,
                                                      self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Use a Placebo Treatment")
        return refute
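A usage sketch for the placebo refuter (same assumed workflow; placebo_type="permute" shuffles the observed treatment rather than drawing random values):

    refutation = model.refute_estimate(
        identified_estimand,
        estimate,
        method_name="placebo_treatment_refuter",
        placebo_type="permute",   # shuffle the real treatment column
        num_simulations=100,
    )
    print(refutation)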
Example #15
    def refute_estimate(self):
        num_rows = self._data.shape[0]
        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info("Refutation over {} simulated datasets, each with a random common cause added"
                         .format(self._num_simulations))

        new_backdoor_variables = self._target_estimand.get_backdoor_variables() + ['w_random']
        identified_estimand = copy.deepcopy(self._target_estimand)
        # Adding a new backdoor variable to the identified estimand
        identified_estimand.set_backdoor_variables(new_backdoor_variables)
        for index in range(self._num_simulations):
            if self._random_state is None:
                new_data = self._data.assign(w_random=np.random.randn(num_rows))
            else:
                new_data = self._data.assign(w_random=self._random_state.normal(size=num_rows))

            new_estimator = CausalEstimator.get_estimator_object(new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()

            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Add a random common cause"
        )

        # We want to see if the estimate falls in the same distribution as the one generated by the refuter
        # Ideally that should be the case as adding a random common cause should not have a significant effect
        # on the ability of the treatment to affect the outcome
        refute.add_significance_test_results(
            self.test_significance(self._estimate, sample_estimates)
        )

        refute.add_refuter(self)
        return refute
Example #16
    def refute_estimate(self):

        # We need to change the identified estimand
        # We thus, make a copy. This is done as we don't want
        # to change the original DataFrame
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.outcome_variable = ["dummy_outcome"]

        self.logger.info("Refutation over {} simulated datasets".format(
            self._num_simulations))
        self.logger.info("The transformation passed: {}".format(
            self._transformation_list))

        simulation_results = []
        refute_list = []

        # We use collections.OrderedDict to maintain the order in which the data is stored
        causal_effect_map = OrderedDict()

        # Check if we are using an estimator in the transformation list
        estimator_present = self._has_estimator()

        # The rationale behind the ordering of the loops is that we induce randomness every time we create the
        # train and validation datasets. We therefore run the simulation loop first, followed by the training
        # and validation loops, so that each simulation fits the estimator on fresh data.

        for _ in range(self._num_simulations):
            estimates = []

            if not estimator_present:

                # Warn the user that the specified parameter is not applicable when no estimator is present in the transformation
                if self._test_fraction != DummyOutcomeRefuter.DEFAULT_TEST_FRACTION:
                    self.logger.warning(
                        "'test_fraction' is not applicable as there is no base treatment value."
                    )

                # With no estimator, there is no training step: X_train and outcome_train stay None
                if self._unobserved_confounder_values is not None:
                    self._data['simulated'] = self._unobserved_confounder_values
                    self._chosen_variables.append('simulated')

                validation_df = self._data
                X_train = None
                outcome_train = None
                X_validation_df = validation_df[self._chosen_variables]

                X_validation = X_validation_df.values
                outcome_validation = validation_df['y'].values

                # Get the final outcome, after running through all the values in the transformation list
                outcome_validation = self.process_data(
                    X_train, outcome_train, X_validation, outcome_validation,
                    self._transformation_list)

                # Check if the value of true effect has been already stored
                # We use None as the key as we have no base category for this refutation
                if None not in causal_effect_map:
                    # As we currently support only one treatment
                    causal_effect_map[None] = self._true_causal_effect(
                        validation_df[self._treatment_name[0]])

                outcome_validation += causal_effect_map[None]

                new_data = validation_df.assign(
                    dummy_outcome=outcome_validation)

                new_estimator = CausalEstimator.get_estimator_object(
                    new_data, identified_estimand, self._estimate)
                new_effect = new_estimator.estimate_effect()
                estimates.append(new_effect.value)

            else:

                groups = self.preprocess_data_by_treatment()
                group_count = 0

                if len(self._test_fraction) == 1:
                    self._test_fraction = len(groups) * self._test_fraction

                for key_train, _ in groups:
                    base_train = groups.get_group(key_train).sample(
                        frac=self._test_fraction[group_count].base)
                    train_set = set(
                        [tuple(line) for line in base_train.values])
                    total_set = set([
                        tuple(line)
                        for line in groups.get_group(key_train).values
                    ])
                    base_validation = pd.DataFrame(list(
                        total_set.difference(train_set)),
                                                   columns=base_train.columns)
                    X_train_df = base_train[self._chosen_variables]

                    X_train = X_train_df.values

                    outcome_train = base_train['y'].values

                    validation_df = []
                    transformation_list = self._transformation_list
                    validation_df.append(base_validation)

                    for key_validation, _ in groups:
                        if key_validation != key_train:
                            validation_df.append(
                                groups.get_group(key_validation).sample(
                                    frac=self._test_fraction[group_count].other
                                ))

                    validation_df = pd.concat(validation_df)
                    X_validation_df = validation_df[self._chosen_variables]

                    X_validation = X_validation_df.values
                    outcome_validation = validation_df['y'].values

                    # If the number of data points is too few, run the default transformation: [("zero",""),("noise", {'std_dev':1} )]
                    if X_train.shape[0] <= self._min_data_point_threshold:
                        transformation_list = DummyOutcomeRefuter.DEFAULT_TRANSFORMATION
                        self.logger.warning(
                            "The number of data points in X_train:{} for category:{} is less than threshold:{}"
                            .format(X_train.shape[0], key_train,
                                    self._min_data_point_threshold))
                        self.logger.warning(
                            "Therefore, defaulting to the minimal set of transformations:{}"
                            .format(transformation_list))

                    outcome_validation = self.process_data(
                        X_train, outcome_train, X_validation,
                        outcome_validation, transformation_list)

                    # Check if the value of true effect has been already stored
                    # This ensures that we calculate the causal effect only once.
                    # We use key_train as we map data with respect to the base category of the data

                    if key_train not in causal_effect_map:
                        # As we currently support only one treatment
                        causal_effect_map[key_train] = self._true_causal_effect(
                            validation_df[self._treatment_name[0]])

                    # Add h(t) to f(W) to get the dummy outcome
                    outcome_validation += causal_effect_map[key_train]

                    new_data = validation_df.assign(
                        dummy_outcome=outcome_validation)
                    new_estimator = CausalEstimator.get_estimator_object(
                        new_data, identified_estimand, self._estimate)
                    new_effect = new_estimator.estimate_effect()

                    estimates.append(new_effect.value)
                    group_count += 1

            simulation_results.append(estimates)

        # We convert to ndarray for ease in indexing
        # The data is of the form
        # sim1: cat1 cat2 ... catn
        # sim2: cat1 cat2 ... catn
        simulation_results = np.array(simulation_results)

        # Note: We would like the causal_estimator to find the true causal estimate that we have specified through this
        # refuter. Let the value of the true causal effect be h(t). In the following section of code, we wish to find out if h(t) falls in the
        # distribution of the refuter.

        if not estimator_present:

            dummy_estimate = CausalEstimate(
                estimate=causal_effect_map[None],
                target_estimand=self._estimate.target_estimand,
                realized_estimand_expr=self._estimate.realized_estimand_expr)

            refute = CausalRefutation(
                dummy_estimate.value,
                np.mean(simulation_results),
                refutation_type="Refute: Use a Dummy Outcome")

            refute.add_significance_test_results(
                self.test_significance(dummy_estimate,
                                       np.ravel(simulation_results)))

            refute.add_refuter(self)

            refute_list.append(refute)

        else:
            # True Causal Effect list
            causal_effect_list = list(causal_effect_map.values())
            # Iterating through the refutation for each category
            for train_category in range(simulation_results.shape[1]):
                dummy_estimate = CausalEstimate(
                    estimate=causal_effect_list[train_category],
                    target_estimand=self._estimate.target_estimand,
                    realized_estimand_expr=self._estimate.realized_estimand_expr)

                refute = CausalRefutation(
                    dummy_estimate.value,
                    np.mean(simulation_results[:, train_category]),
                    refutation_type="Refute: Use a Dummy Outcome")

                refute.add_significance_test_results(
                    self.test_significance(
                        dummy_estimate, simulation_results[:, train_category]))

                refute.add_refuter(self)
                refute_list.append(refute)

        return refute_list
Example #17
    def refute_estimate(self):

        # We need to change the identified estimand
        # We thus, make a copy. This is done as we don't want
        # to change the original DataFrame
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.outcome_variable = ["dummy_outcome"]

        self.logger.info("Refutation over {} simulated datasets".format(
            self._num_simulations))
        self.logger.info("The transformation passed: {}".format(
            self._transformation_list))

        simulation_results = []
        refute_list = []
        no_estimator = self.check_for_estimator()

        for _ in range(self._num_simulations):
            estimates = []
            if no_estimator:
                # With no estimator, there is no training step: X_train and outcome_train stay None
                validation_df = self._data
                X_train = None
                outcome_train = None
                X_validation = validation_df[self._chosen_variables].values
                outcome_validation = validation_df['y'].values

                # Get the final outcome, after running through all the values in the transformation list
                outcome_validation = self.process_data(
                    X_train, outcome_train, X_validation, outcome_validation,
                    self._transformation_list)

            else:
                groups = self.preprocess_data_by_treatment()
                for key_train, _ in groups:
                    X_train = groups.get_group(key_train)[
                        self._chosen_variables].values
                    outcome_train = groups.get_group(key_train)['y'].values
                    validation_df = []
                    transformation_list = self._transformation_list

                    for key_validation, _ in groups:
                        if key_validation != key_train:
                            validation_df.append(
                                groups.get_group(key_validation))

                    validation_df = pd.concat(validation_df)
                    X_validation = validation_df[self._chosen_variables].values
                    outcome_validation = validation_df['y'].values

                    # If the number of data points is too few, run the default transformation: [("zero",""),("noise", {'std_dev':1} )]
                    if X_train.shape[0] <= self._min_data_point_threshold:
                        transformation_list = DummyOutcomeRefuter.DEFAULT_TRANSFORMATION

                    outcome_validation = self.process_data(
                        X_train, outcome_train, X_validation,
                        outcome_validation, transformation_list)

            new_data = validation_df.assign(dummy_outcome=outcome_validation)
            new_estimator = CausalEstimator.get_estimator_object(
                new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            estimates.append(new_effect.value)

            simulation_results.append(estimates)

        # We convert to ndarray for ease in indexing
        # The data is of the form
        # sim1: cat1 cat2 ... catn
        # sim2: cat1 cat2 ... catn
        simulation_results = np.array(simulation_results)

        # Note: We hardcode the estimate value to ZERO as we want to check if it falls in the distribution of the refuter
        # Ideally we should expect that ZERO should fall in the distribution of the effect estimates as we have severed any causal
        # relationship between the treatment and the outcome.
        dummy_estimator = CausalEstimate(
            estimate=0,
            target_estimand=self._estimate.target_estimand,
            realized_estimand_expr=self._estimate.realized_estimand_expr)

        if no_estimator:
            refute = CausalRefutation(
                self._estimate.value,
                np.mean(simulation_results),
                refutation_type="Refute: Use a Dummy Outcome")

            refute.add_significance_test_results(
                self.test_significance(dummy_estimator, simulation_results))

            refute_list.append(refute)

        else:
            for category in range(simulation_results.shape[1]):
                refute = CausalRefutation(
                    self._estimate.value,
                    np.mean(simulation_results[:, category]),
                    refutation_type="Refute: Use a Dummy Outcome")

                refute.add_significance_test_results(
                    self.test_significance(dummy_estimator,
                                           simulation_results[:, category]))

                refute_list.append(refute)

        return refute_list
Example #18
    def refute_estimate(self):

        if not isinstance(self.kappa_t, np.ndarray) and not isinstance(
                self.kappa_y, np.ndarray):  # Deal with single value inputs
            new_data = copy.deepcopy(self._data)
            new_data = self.include_confounders_effect(new_data, self.kappa_t,
                                                       self.kappa_y)

            new_estimator = self.get_estimator_object(new_data,
                                                      self._target_estimand,
                                                      self._estimate)
            new_effect = new_estimator.estimate_effect()
            refute = CausalRefutation(
                self._estimate.value,
                new_effect.value,
                refutation_type="Refute: Add an Unobserved Common Cause")
            return refute

        else:  # Deal with multiple value inputs

            if isinstance(self.kappa_t, np.ndarray) and isinstance(
                    self.kappa_y, np.ndarray):  # Deal with range inputs

                # Get a 2D matrix of values
                x, y = np.meshgrid(self.kappa_t,
                                   self.kappa_y)  # x,y are both MxN

                results_matrix = np.zeros((len(x), len(y)))  # Matrix to hold all the results
                logging.debug(results_matrix.shape)
                orig_data = copy.deepcopy(self._data)

                for i in range(0, len(x[0])):
                    for j in range(0, len(y)):
                        new_data = self.include_confounders_effect(
                            orig_data, x[0][i], y[j][0])
                        new_estimator = self.get_estimator_object(
                            new_data, self._target_estimand, self._estimate)
                        new_effect = new_estimator.estimate_effect()
                        refute = CausalRefutation(
                            self._estimate.value,
                            new_effect.value,
                            refutation_type=
                            "Refute: Add an Unobserved Common Cause")
                        logging.debug(refute)
                        results_matrix[i][j] = refute.estimated_effect[
                            0]  # Populate the results

                fig = plt.figure(figsize=(6, 5))
                left, bottom, width, height = 0.1, 0.1, 0.8, 0.8
                ax = fig.add_axes([left, bottom, width, height])

                cp = plt.contourf(x, y, results_matrix)
                plt.colorbar(cp)
                ax.set_title('Effect of Unobserved Common Cause')
                ax.set_xlabel('Value of Linear Constant on Treatment')
                ax.set_ylabel('Value of Linear Constant on Outcome')
                plt.show()
                return results_matrix

            elif isinstance(self.kappa_t, np.ndarray):
                outcomes = np.zeros(len(self.kappa_t))
                orig_data = copy.deepcopy(self._data)

                for i in range(0, len(self.kappa_t)):
                    new_data = self.include_confounders_effect(
                        orig_data, self.kappa_t[i], self.kappa_y)
                    new_estimator = self.get_estimator_object(
                        new_data, self._target_estimand, self._estimate)
                    new_effect = new_estimator.estimate_effect()
                    refute = CausalRefutation(
                        self._estimate.value,
                        new_effect.value,
                        refutation_type="Refute: Add an Unobserved Common Cause"
                    )
                    logging.debug(refute)
                    outcomes[i] = refute.estimated_effect[
                        0]  # Populate the results

                fig = plt.figure(figsize=(6, 5))
                left, bottom, width, height = 0.1, 0.1, 0.8, 0.8
                ax = fig.add_axes([left, bottom, width, height])

                plt.plot(self.kappa_t, outcomes)
                ax.set_title('Effect of Unobserved Common Cause')
                ax.set_xlabel('Value of Linear Constant on Treatment')
                ax.set_ylabel('New Effect')
                plt.show()
                return outcomes

            elif isinstance(self.kappa_y, np.ndarray):
                outcomes = np.zeros(len(self.kappa_y))
                orig_data = copy.deepcopy(self._data)

                for i in range(0, len(self.kappa_y)):
                    new_data = self.include_confounders_effect(
                        orig_data, self.kappa_t, self.kappa_y[i])
                    new_estimator = self.get_estimator_object(
                        new_data, self._target_estimand, self._estimate)
                    new_effect = new_estimator.estimate_effect()
                    refute = CausalRefutation(
                        self._estimate.value,
                        new_effect.value,
                        refutation_type="Refute: Add an Unobserved Common Cause"
                    )
                    logging.debug(refute)
                    outcomes[i] = refute.estimated_effect[
                        0]  # Populate the results

                fig = plt.figure(figsize=(6, 5))
                left, bottom, width, height = 0.1, 0.1, 0.8, 0.8
                ax = fig.add_axes([left, bottom, width, height])

                plt.plot(self.kappa_y, outcomes)
                ax.set_title('Effect of Unobserved Common Cause')
                ax.set_xlabel('Value of Linear Constant on Outcome')
                ax.set_ylabel('New Effect')
                plt.show()
                return outcomes
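The include_confounders_effect helper is not shown in this corpus. A minimal self-contained sketch of the idea it implements for the linear case (the column names t and y are hypothetical; kappa_t and kappa_y scale the confounder's effect on treatment and outcome, as in the sweeps above):

    import numpy as np
    import pandas as pd

    def add_linear_confounder(df, kappa_t, kappa_y, rng):
        # Draw one unobserved confounder value per row and shift both the
        # treatment and the outcome columns by a scaled amount of it.
        u = rng.normal(size=len(df))
        out = df.copy()
        out["t"] = out["t"] + kappa_t * u
        out["y"] = out["y"] + kappa_y * u
        return out

    rng = np.random.default_rng(0)
    data = pd.DataFrame({"t": rng.normal(size=100), "y": rng.normal(size=100)})
    perturbed = add_linear_confounder(data, kappa_t=0.5, kappa_y=0.2, rng=rng)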
Example #19
    def refute_estimate(self):

        # We need to change the identified estimand
        # We thus, make a copy. This is done as we don't want
        # to change the original DataFrame
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.outcome_variable = ["dummy_outcome"]

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info("Refutation over {} simulated datasets".format(
            self._num_simulations))
        self.logger.info("The transformation passed: {}",
                         self._transformations)

        # This flag is to make sure we store the estimators whose input is deterministic
        save_estimators = True
        # We store the value of the estimators in the format "estimator_name" +  "pos_in_transform" : estimator_object
        saved_estimator_dict = {}

        X = self._data[self._chosen_variables]
        new_outcome = self._data['y']

        for index in range(self._num_simulations):
            transform_num = 0
            for action, func_args in self._transformations:

                if callable(action):
                    new_outcome = action(X, **func_args)

                elif action in DummyOutcomeRefuter.SUPPORTED_ESTIMATORS:
                    if action + str(transform_num) in saved_estimator_dict:
                        estimator = saved_estimator_dict[action +
                                                         str(transform_num)]
                        new_outcome = estimator(X)
                    else:
                        estimator = self._estimate_dummy_outcome(
                            func_args, action, new_outcome)
                        new_outcome = estimator(X)
                        if save_estimators:
                            saved_estimator_dict[
                                action + str(transform_num)] = estimator

                elif action == 'noise':
                    save_estimators = False
                    new_outcome = self._noise(new_outcome, func_args)

                elif action == 'permute':
                    save_estimators = False
                    new_outcome = self._permute(new_outcome, func_args)

                elif action == 'zero':
                    save_estimators = False
                    new_outcome = np.zeros(new_outcome.shape)

                transform_num += 1

            save_estimators = False

            # Create a new column in the data by the name of dummy_outcome
            new_data = self._data.assign(dummy_outcome=new_outcome)

            # Sanity check the data
            self.logger.debug(new_data[0:10])

            new_estimator = CausalEstimator.get_estimator_object(
                new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Use a Dummy Outcome")

        # Note: We hardcode the estimate value to ZERO as we want to check if it falls in the distribution of the refuter
        # Ideally we should expect that ZERO should fall in the distribution of the effect estimates as we have severed any causal
        # relationship between the treatment and the outcome.

        dummy_estimator = copy.deepcopy(self._estimate)
        dummy_estimator.value = 0

        refute.add_significance_test_results(
            self.test_significance(dummy_estimator, sample_estimates))

        return refute
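Example #20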
    def refute_estimate(self):
        """
        This function attempts to add an unobserved common cause to the outcome and the treatment. At present, we have implemented the behavior for one dimensional behaviors for continueous
        and binary variables. This function can either take single valued inputs or a range of inputs. The function then looks at the data type of the input and then decides on the course of
        action.

        :return: CausalRefuter: An object that contains the estimated effect and a new effect and the name of the refutation used.
        """
        if not isinstance(self.kappa_t, np.ndarray) and not isinstance(self.kappa_y, np.ndarray): # Deal with single value inputs
            new_data = copy.deepcopy(self._data)
            new_data = self.include_confounders_effect(new_data, self.kappa_t, self.kappa_y)
            new_estimator = CausalEstimator.get_estimator_object(new_data, self._target_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            refute = CausalRefutation(self._estimate.value, new_effect.value,
                                    refutation_type="Refute: Add an Unobserved Common Cause")

            refute.new_effect = np.array(new_effect.value)
            refute.add_refuter(self)
            return refute

        else: # Deal with multiple value inputs

            if isinstance(self.kappa_t, np.ndarray) and isinstance(self.kappa_y, np.ndarray): # Deal with range inputs
                # Get a 2D matrix of values
                x, y = np.meshgrid(self.kappa_t, self.kappa_y)  # x, y are both MxN

                results_matrix = np.zeros((len(x), len(y)))  # Matrix to hold all the results
                self.logger.debug(results_matrix.shape)
                orig_data = copy.deepcopy(self._data)

                for i in range(0,len(x[0])):
                    for j in range(0,len(y)):
                        new_data = self.include_confounders_effect(orig_data, x[0][i], y[j][0])
                        new_estimator = CausalEstimator.get_estimator_object(new_data, self._target_estimand, self._estimate)
                        new_effect = new_estimator.estimate_effect()
                        refute = CausalRefutation(self._estimate.value, new_effect.value,
                                                refutation_type="Refute: Add an Unobserved Common Cause")
                        self.logger.debug(refute)
                        results_matrix[i][j] = refute.estimated_effect # Populate the results

                import matplotlib.pyplot as plt
                fig = plt.figure(figsize=(6,5))
                left, bottom, width, height = 0.1, 0.1, 0.8, 0.8
                ax = fig.add_axes([left, bottom, width, height])

                cp = plt.contourf(x, y, results_matrix)
                plt.colorbar(cp)
                ax.set_title('Effect of Unobserved Common Cause')
                ax.set_xlabel('Value of Linear Constant on Treatment')
                ax.set_ylabel('Value of Linear Constant on Outcome')
                plt.show()

                refute.new_effect = results_matrix
                # Store the values into the refute object
                refute.add_refuter(self)
                return refute

            elif isinstance(self.kappa_t, np.ndarray):
                outcomes = np.zeros(len(self.kappa_t))
                orig_data = copy.deepcopy(self._data)

                for i in range(0,len(self.kappa_t)):
                    new_data = self.include_confounders_effect(orig_data, self.kappa_t[i], self.kappa_y)
                    new_estimator = CausalEstimator.get_estimator_object(new_data, self._target_estimand, self._estimate)
                    new_effect = new_estimator.estimate_effect()
                    refute = CausalRefutation(self._estimate.value, new_effect.value,
                                            refutation_type="Refute: Add an Unobserved Common Cause")
                    self.logger.debug(refute)
                    outcomes[i] = refute.estimated_effect # Populate the results

                import matplotlib.pyplot as plt
                fig = plt.figure(figsize=(6,5))
                left, bottom, width, height = 0.1, 0.1, 0.8, 0.8
                ax = fig.add_axes([left, bottom, width, height])

                plt.plot(self.kappa_t, outcomes)
                ax.set_title('Effect of Unobserved Common Cause')
                ax.set_xlabel('Value of Linear Constant on Treatment')
                ax.set_ylabel('New Effect')
                plt.show()

                refute.new_effect = outcomes
                refute.add_refuter(self)
                return refute

            elif isinstance(self.kappa_y, np.ndarray):
                outcomes = np.zeros(len(self.kappa_y))
                orig_data = copy.deepcopy(self._data)

                for i in range(0, len(self.kappa_y)):
                    new_data = self.include_confounders_effect(orig_data, self.kappa_t, self.kappa_y[i])
                    new_estimator = CausalEstimator.get_estimator_object(new_data, self._target_estimand, self._estimate)
                    new_effect = new_estimator.estimate_effect()
                    refute = CausalRefutation(self._estimate.value, new_effect.value,
                                            refutation_type="Refute: Add an Unobserved Common Cause")
                    self.logger.debug(refute)
                    outcomes[i] = refute.estimated_effect # Populate the results

                import matplotlib.pyplot as plt
                fig = plt.figure(figsize=(6,5))
                left, bottom, width, height = 0.1, 0.1, 0.8, 0.8
                ax = fig.add_axes([left, bottom, width, height])

                plt.plot(self.kappa_y, outcomes)
                ax.set_title('Effect of Unobserved Common Cause')
                ax.set_xlabel('Value of Linear Constant on Outcome')
                ax.set_ylabel('New Effect')
                plt.show()

                refute.new_effect = outcomes
                refute.add_refuter(self)
                return refute
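Example #21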
    def refute_estimate(self):
        # Only "permute" is supported for IV methods
        if self._target_estimand.identifier_method.startswith("iv"):
            if self._placebo_type != "permute":
                self.logger.error(
                    "Only placebo_type='permute' is supported for creating placebos for instrumental variable estimation methods"
                )
                raise ValueError(
                    "Only placebo_type='permute' is supported for creating placebos for instrumental variable estimation methods."
                )

        # We need to change the identified estimand
        # We make a copy as a safety measure, we don't want to change the
        # original DataFrame
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.treatment_variable = ["placebo"]
        if self._target_estimand.identifier_method.startswith("iv"):
            identified_estimand.instrumental_variables = [
                "placebo_" + s
                for s in identified_estimand.instrumental_variables
            ]
            # For IV methods, the estimating_instrument_names should also be
            # changed. So we change it inside the estimate and then restore it
            # back at the end of this method.
            method_params = self._estimate.params["method_params"]
            if method_params is not None and "iv_instrument_name" in method_params:
                method_params["iv_instrument_name"] = [
                    "placebo_" + s
                    for s in parse_state(method_params["iv_instrument_name"])
                ]

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info(
            "Refutation over {} simulated datasets of {} treatment".format(
                self._num_simulations, self._placebo_type))

        num_rows = self._data.shape[0]
        treatment_name = self._treatment_name[0]  # Extract the name of the treatment variable
        type_dict = dict(self._data.dtypes)

        for index in range(self._num_simulations):

            if self._placebo_type == "permute":
                permuted_idx = None
                if self._random_state is None:
                    permuted_idx = np.random.choice(self._data.shape[0],
                                                    size=self._data.shape[0],
                                                    replace=False)

                else:
                    permuted_idx = self._random_state.choice(
                        self._data.shape[0],
                        size=self._data.shape[0],
                        replace=False)
                new_treatment = self._data[
                    self._treatment_name].iloc[permuted_idx].values
                if self._target_estimand.identifier_method.startswith("iv"):
                    new_instruments_values = self._data[
                        self._estimate.estimator.
                        estimating_instrument_names].iloc[permuted_idx].values
                    new_instruments_df = pd.DataFrame(
                        new_instruments_values,
                        columns=[
                            "placebo_" + s for s in
                            self._data[self._estimate.estimator.
                                       estimating_instrument_names].columns
                        ])
            else:
                if 'float' in type_dict[treatment_name].name:
                    self.logger.info(
                        "Using a Normal Distribution with Mean:{} and Variance:{}"
                        .format(
                            PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL,
                            PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL))
                    new_treatment = np.random.randn(num_rows)*PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL + \
                                    PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL

                elif 'bool' in type_dict[treatment_name].name:
                    self.logger.info(
                        "Using a Binomial Distribution with {} trials and {} probability of success"
                        .format(
                            PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                            PlaceboTreatmentRefuter.
                            DEFAULT_PROBABILITY_OF_BINOMIAL))
                    new_treatment = np.random.binomial(
                        PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                        PlaceboTreatmentRefuter.
                        DEFAULT_PROBABILITY_OF_BINOMIAL, num_rows).astype(bool)

                elif 'int' in type_dict[treatment_name].name:
                    self.logger.info(
                        "Using a Discrete Uniform Distribution lying between {} and {}"
                        .format(self._data[treatment_name].min(),
                                self._data[treatment_name].max()))
                    new_treatment = np.random.randint(
                        low=self._data[treatment_name].min(),
                        high=self._data[treatment_name].max(),
                        size=num_rows)

                elif 'category' in type_dict[treatment_name].name:
                    categories = self._data[treatment_name].unique()
                    self.logger.info(
                        "Using a Discrete Uniform Distribution with the following categories:{}"
                        .format(categories))
                    sample = np.random.choice(categories, size=num_rows)
                    new_treatment = pd.Series(sample).astype('category')

            # Create a new column in the data by the name of placebo
            new_data = self._data.assign(placebo=new_treatment)
            if self._target_estimand.identifier_method.startswith("iv"):
                new_data = pd.concat((new_data, new_instruments_df), axis=1)
            # Sanity check the data
            self.logger.debug(new_data[0:10])
            new_estimator = CausalEstimator.get_estimator_object(
                new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        # Restore the original value of iv_instrument_name
        if self._target_estimand.identifier_method.startswith("iv"):
            method_params = self._estimate.params["method_params"]
            if method_params is not None and "iv_instrument_name" in method_params:
                method_params["iv_instrument_name"] = [
                    s.replace("placebo_", "", 1)
                    for s in parse_state(method_params["iv_instrument_name"])
                ]
        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Use a Placebo Treatment")

        # Note: We hardcode the estimate value to ZERO because we want to check whether it falls
        # within the distribution generated by the refuter. Ideally, ZERO should fall within this
        # distribution, since we have severed any causal relationship between the treatment and the outcome.
        dummy_estimator = CausalEstimate(
            estimate=0,
            control_value=self._estimate.control_value,
            treatment_value=self._estimate.treatment_value,
            target_estimand=self._estimate.target_estimand,
            realized_estimand_expr=self._estimate.realized_estimand_expr)

        refute.add_significance_test_results(
            self.test_significance(dummy_estimator, sample_estimates))
        refute.add_refuter(self)
        return refute
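The refuter above severs the treatment-outcome link by permuting the treatment (or drawing a synthetic one) and re-estimating the effect on each simulated dataset. A minimal self-contained sketch of the same idea, using plain numpy/pandas and an OLS slope in place of DoWhy's estimator classes (all names and the data-generating process are illustrative):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 1000
w = rng.normal(size=n)                        # observed confounder
t = w + rng.normal(size=n)                    # treatment
y = 2.0 * t + w + rng.normal(size=n)          # outcome with true effect 2.0
df = pd.DataFrame({"w": w, "t": t, "y": y})

def ols_effect(data, treatment_col):
    # OLS coefficient of the treatment, adjusting for w and an intercept
    X = np.column_stack([data[treatment_col], data["w"], np.ones(len(data))])
    return np.linalg.lstsq(X, data["y"], rcond=None)[0][0]

original_effect = ols_effect(df, "t")
placebo_estimates = []
for _ in range(100):
    # Permuting the treatment severs its causal link to the outcome
    df["placebo"] = rng.permutation(df["t"].values)
    placebo_estimates.append(ols_effect(df, "placebo"))

print(original_effect)               # close to 2.0
print(np.mean(placebo_estimates))    # close to 0.0

Because the placebo treatment carries no information about the outcome, its estimated effect should be centered near zero, which is exactly what the significance test above checks.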
Example #22
    def refute_estimate(self):

        # We need to change the identified estimand, so we make a
        # copy first. This ensures that we don't modify the
        # original estimand object.
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.outcome_variable = ["dummy_outcome"]

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info(
            "Refutation over {} simulated datasets of {} outcome".format(
                self._num_simulations, self._dummy_outcome_type))
        num_rows = self._data.shape[0]

        for index in range(self._num_simulations):

            if self._dummy_outcome_type == "permute":
                if self._random_state is None:
                    new_outcome = self._data[self._outcome_name].sample(
                        frac=1).values
                else:
                    new_outcome = self._data[self._outcome_name].sample(
                        frac=1, random_state=self._random_state).values
            elif self._outcome_function is not None:
                new_outcome = self._outcome_function(self._data)

                if isinstance(new_outcome, (pd.Series, pd.DataFrame)):
                    new_outcome = new_outcome.values

                # Check if data types match
                assert type(new_outcome) is np.ndarray, (
                    "Only numpy.ndarray is supported as the output")
                assert 'float' in new_outcome.dtype.name, (
                    "Only float outcomes are currently supported")

                if len(new_outcome.shape) == 2 and \
                   (new_outcome.shape[0] == 1 or new_outcome.shape[1] == 1):
                    self.logger.warning(
                        "Converting the row or column vector to a 1D array")
                    new_outcome = new_outcome.ravel()
                    assert len(new_outcome) == num_rows, (
                        "The number of outputs does not match the number of rows in the data"
                    )
                elif len(new_outcome.shape) == 1:
                    assert len(new_outcome) == num_rows, (
                        "The number of outputs does not match the number of rows in the data"
                    )
                else:
                    raise Exception(
                        "Shape mismatch: expected a one-dimensional outcome, but the output has shape: {}"
                        .format(new_outcome.shape))
            else:
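                # Default: draw the dummy outcome as pure Gaussian noise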
                new_outcome = np.random.randn(num_rows)

            # Create a new column in the data by the name of dummy_outcome
            new_data = self._data.assign(dummy_outcome=new_outcome)

            # Sanity check the data
            self.logger.debug(new_data[0:10])

            new_estimator = CausalEstimator.get_estimator_object(
                new_data, identified_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Use a Dummy Outcome")

        # Note: We hardcode the estimate value to ZERO because we want to check whether it falls
        # within the distribution generated by the refuter. Ideally, ZERO should fall within this
        # distribution, since we have severed any causal relationship between the treatment and the outcome.

        dummy_estimator = copy.deepcopy(self._estimate)
        dummy_estimator.value = 0

        refute.add_significance_test_results(
            self.test_significance(dummy_estimator, sample_estimates))

        return refute
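For reference, the assertions in the outcome-function branch above constrain what a user-supplied function may return: a float numpy array (or a Series/DataFrame convertible to one) with one value per row of the data. A hypothetical function that would pass these checks:

import numpy as np

def constant_plus_noise_outcome(data):
    # One float per row, independent of the treatment by construction
    return 5.0 + np.random.randn(data.shape[0])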
Example #23
    def refute_estimate(self):

        # We need to change the identified estimand. We make a copy
        # as a safety measure, since we don't want to change the
        # original estimand object.
        identified_estimand = copy.deepcopy(self._target_estimand)
        identified_estimand.treatment_variable = ["placebo"]

        sample_estimates = np.zeros(self._num_simulations)
        self.logger.info(
            "Refutation over {} simulated datasets of {} treatment".format(
                self._num_simulations, self._placebo_type))

        num_rows = self._data.shape[0]
        treatment_name = self._treatment_name[
            0]  # Extract the name of the treatment variable
        type_dict = dict(self._data.dtypes)

        for index in range(self._num_simulations):

            if self._placebo_type == "permute":
                if self._random_state is None:
                    new_treatment = self._data[self._treatment_name].sample(
                        frac=1).values
                else:
                    new_treatment = self._data[self._treatment_name].sample(
                        frac=1, random_state=self._random_state).values
            else:
                if 'float' in type_dict[treatment_name].name:
                    self.logger.info(
                        "Using a Normal Distribution with Mean: {} and Standard Deviation: {}"
                        .format(
                            PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL,
                            PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL))
                    new_treatment = np.random.randn(num_rows)*PlaceboTreatmentRefuter.DEFAULT_STD_DEV_OF_NORMAL + \
                                    PlaceboTreatmentRefuter.DEFAULT_MEAN_OF_NORMAL

                elif 'bool' in type_dict[treatment_name].name:
                    self.logger.info(
                        "Using a Binomial Distribution with {} trials and {} probability of success"
                        .format(
                            PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                            PlaceboTreatmentRefuter.
                            DEFAULT_PROBABILITY_OF_BINOMIAL))
                    new_treatment = np.random.binomial(
                        PlaceboTreatmentRefuter.DEFAULT_NUMBER_OF_TRIALS,
                        PlaceboTreatmentRefuter.
                        DEFAULT_PROBABILITY_OF_BINOMIAL, num_rows).astype(bool)

                elif 'int' in type_dict[treatment_name].name:
                    self.logger.info(
                        "Using a Discrete Uniform Distribution lying between {} and {}"
                        .format(self._data[treatment_name].min(),
                                self._data[treatment_name].max()))
                    new_treatment = np.random.randint(
                        low=self._data[treatment_name].min(),
                        high=self._data[treatment_name].max() + 1,  # randint's high is exclusive
                        size=num_rows)

                elif 'category' in type_dict[treatment_name].name:
                    categories = self._data[treatment_name].unique()
                    self.logger.info(
                        "Using a Discrete Uniform Distribution with the following categories:{}"
                        .format(categories))
                    sample = np.random.choice(categories, size=num_rows)
                    new_treatment = pd.Series(sample).astype('category')

            # Create a new column in the data by the name of placebo
            new_data = self._data.assign(placebo=new_treatment)

            # Sanity check the data
            self.logger.debug(new_data[0:10])

            new_estimator = self.get_estimator_object(new_data,
                                                      identified_estimand,
                                                      self._estimate)
            new_effect = new_estimator.estimate_effect()
            sample_estimates[index] = new_effect.value

        refute = CausalRefutation(
            self._estimate.value,
            np.mean(sample_estimates),
            refutation_type="Refute: Use a Placebo Treatment")

        # Note: We hardcode the estimate value to ZERO because we want to check whether it falls
        # within the distribution generated by the refuter. Ideally, ZERO should fall within this
        # distribution, since we have severed any causal relationship between the treatment and the outcome.

        dummy_estimator = copy.deepcopy(self._estimate)
        dummy_estimator.value = 0

        refute.add_significance_test_results(
            self.test_significance(dummy_estimator, sample_estimates))

        return refute
Example #24
    def refute_estimate(self):
        """
        This function attempts to add an unobserved common cause to the outcome and the treatment. At present, we have implemented the behavior for one dimensional behaviors for continuous
        and binary variables. This function can either take single valued inputs or a range of inputs. The function then looks at the data type of the input and then decides on the course of
        action.

        :return: CausalRefuter: An object that contains the estimated effect and a new effect and the name of the refutation used.
        """
        if self.kappa_t is None:
            self.kappa_t = self.infer_default_kappa_t()
        if self.kappa_y is None:
            self.kappa_y = self.infer_default_kappa_y()
        if not isinstance(self.kappa_t, (list, np.ndarray)) and not isinstance(
                self.kappa_y,
            (list, np.ndarray)):  # Deal with single value inputs
            new_data = copy.deepcopy(self._data)
            new_data = self.include_confounders_effect(new_data, self.kappa_t,
                                                       self.kappa_y)
            new_estimator = CausalEstimator.get_estimator_object(
                new_data, self._target_estimand, self._estimate)
            new_effect = new_estimator.estimate_effect()
            refute = CausalRefutation(
                self._estimate.value,
                new_effect.value,
                refutation_type="Refute: Add an Unobserved Common Cause")

            refute.new_effect_array = np.array(new_effect.value)
            refute.new_effect = new_effect.value
            refute.add_refuter(self)
            return refute

        else:  # Deal with multiple value inputs

            if isinstance(self.kappa_t, (list, np.ndarray)) and isinstance(
                    self.kappa_y,
                (list, np.ndarray)):  # Deal with range inputs
                # Build a 2D matrix of results, one per (kappa_t, kappa_y) pair
                results_matrix = np.zeros(
                    (len(self.kappa_t), len(self.kappa_y)))
                orig_data = copy.deepcopy(self._data)
                for i in range(len(self.kappa_t)):
                    for j in range(len(self.kappa_y)):
                        new_data = self.include_confounders_effect(
                            orig_data, self.kappa_t[i], self.kappa_y[j])
                        new_estimator = CausalEstimator.get_estimator_object(
                            new_data, self._target_estimand, self._estimate)
                        new_effect = new_estimator.estimate_effect()
                        refute = CausalRefutation(
                            self._estimate.value,
                            new_effect.value,
                            refutation_type=
                            "Refute: Add an Unobserved Common Cause")
                        results_matrix[i, j] = refute.new_effect  # Populate the results

                refute.new_effect_array = results_matrix
                refute.new_effect = (np.min(results_matrix),
                                     np.max(results_matrix))
                # Store the values into the refute object
                refute.add_refuter(self)
                if self.plotmethod is None:
                    return refute

                import matplotlib.pyplot as plt
                fig = plt.figure(figsize=(6, 5))
                left, bottom, width, height = 0.1, 0.1, 0.8, 0.8
                ax = fig.add_axes([left, bottom, width, height])

                oe = self._estimate.value
                contour_levels = [oe / 4.0, oe / 2.0, (3.0 / 4) * oe, oe]
                contour_levels.extend(
                    [0, np.min(results_matrix),
                     np.max(results_matrix)])
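                # The levels mark fractions of the original estimate (1/4, 1/2, 3/4,
                # full), plus zero and the observed extremes, so the plot shows how
                # much confounding is needed to shrink the effect toward zero.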
                if self.plotmethod == "contour":
                    cp = plt.contourf(self.kappa_y,
                                      self.kappa_t,
                                      results_matrix,
                                      levels=sorted(contour_levels))
                    # Adding a label on the contour line for the original estimate
                    fmt = {}
                    trueeffect_index = np.where(cp.levels == oe)[0][0]
                    fmt[cp.levels[trueeffect_index]] = "Estimated Effect"
                    # Label every other level using strings
                    plt.clabel(cp, [cp.levels[trueeffect_index]],
                               inline=True,
                               fmt=fmt)
                    plt.colorbar(cp)
                elif self.plotmethod == "colormesh":
                    cp = plt.pcolormesh(self.kappa_y,
                                        self.kappa_t,
                                        results_matrix,
                                        shading="nearest")
                    plt.colorbar(cp, ticks=contour_levels)
                ax.yaxis.set_ticks(self.kappa_t)
                ax.xaxis.set_ticks(self.kappa_y)
                plt.xticks(rotation=45)
                ax.set_title('Effect of Unobserved Common Cause')
                ax.set_ylabel('Value of Linear Constant on Treatment')
                ax.set_xlabel('Value of Linear Constant on Outcome')
                plt.show()

                return refute

            elif isinstance(self.kappa_t, (list, np.ndarray)):
                outcomes = np.zeros(len(self.kappa_t))
                orig_data = copy.deepcopy(self._data)

                for i in range(len(self.kappa_t)):
                    new_data = self.include_confounders_effect(
                        orig_data, self.kappa_t[i], self.kappa_y)
                    new_estimator = CausalEstimator.get_estimator_object(
                        new_data, self._target_estimand, self._estimate)
                    new_effect = new_estimator.estimate_effect()
                    refute = CausalRefutation(
                        self._estimate.value,
                        new_effect.value,
                        refutation_type="Refute: Add an Unobserved Common Cause"
                    )
                    self.logger.debug(refute)
                    outcomes[i] = refute.new_effect  # Populate the results

                refute.new_effect_array = outcomes
                refute.new_effect = (np.min(outcomes), np.max(outcomes))
                refute.add_refuter(self)
                if self.plotmethod is None:
                    return refute

                import matplotlib.pyplot as plt
                fig = plt.figure(figsize=(6, 5))
                left, bottom, width, height = 0.1, 0.1, 0.8, 0.8
                ax = fig.add_axes([left, bottom, width, height])

                plt.plot(self.kappa_t, outcomes)
                plt.axhline(self._estimate.value, linestyle='--', color="gray")
                ax.set_title('Effect of Unobserved Common Cause')
                ax.set_xlabel('Value of Linear Constant on Treatment')
                ax.set_ylabel('Estimated Effect after adding the common cause')
                plt.show()

                return refute

            elif isinstance(self.kappa_y, (list, np.ndarray)):
                outcomes = np.zeros(len(self.kappa_y))
                orig_data = copy.deepcopy(self._data)

                for i in range(len(self.kappa_y)):
                    new_data = self.include_confounders_effect(
                        orig_data, self.kappa_t, self.kappa_y[i])
                    new_estimator = CausalEstimator.get_estimator_object(
                        new_data, self._target_estimand, self._estimate)
                    new_effect = new_estimator.estimate_effect()
                    refute = CausalRefutation(
                        self._estimate.value,
                        new_effect.value,
                        refutation_type="Refute: Add an Unobserved Common Cause"
                    )
                    self.logger.debug(refute)
                    outcomes[i] = refute.new_effect  # Populate the results

                refute.new_effect_array = outcomes
                refute.new_effect = (np.min(outcomes), np.max(outcomes))
                refute.add_refuter(self)
                if self.plotmethod is None:
                    return refute

                import matplotlib.pyplot as plt
                fig = plt.figure(figsize=(6, 5))
                left, bottom, width, height = 0.1, 0.1, 0.8, 0.8
                ax = fig.add_axes([left, bottom, width, height])

                plt.plot(self.kappa_y, outcomes)
                plt.axhline(self._estimate.value, linestyle='--', color="gray")
                ax.set_title('Effect of Unobserved Common Cause')
                ax.set_xlabel('Value of Linear Constant on Outcome')
                ax.set_ylabel('Estimated Effect after adding the common cause')
                plt.show()

                return refute
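The nested loop above re-estimates the effect once per (kappa_t, kappa_y) pair and reports the min/max of the resulting matrix. A compact standalone sketch of that sensitivity sweep, simulating the confounder's linear effect directly with numpy (the data-generating process, ols_slope, and the grids are illustrative stand-ins for include_confounders_effect and the estimator):

import numpy as np

rng = np.random.default_rng(1)
n = 2000
u = rng.binomial(1, 0.5, n).astype(float)    # hypothetical unobserved confounder
t = rng.normal(size=n)
y = 2.0 * t + rng.normal(size=n)             # true effect of 2.0, no confounding yet

def ols_slope(treatment, outcome):
    # OLS slope of outcome on treatment, with an intercept
    X = np.column_stack([treatment, np.ones(len(treatment))])
    return np.linalg.lstsq(X, outcome, rcond=None)[0][0]

kappa_t_grid = np.linspace(0.0, 1.0, 5)
kappa_y_grid = np.linspace(0.0, 1.0, 5)
results = np.zeros((len(kappa_t_grid), len(kappa_y_grid)))
for i, kt in enumerate(kappa_t_grid):
    for j, ky in enumerate(kappa_y_grid):
        # Mimic include_confounders_effect: add the confounder, scaled by the
        # linear constants, to the treatment and the outcome, then re-estimate
        results[i, j] = ols_slope(t + kt * u, y + ky * u)

print(results.min(), results.max())          # range of the effect under confounding

If the original conclusion survives only for implausibly small kappa values, the estimate is sensitive to unobserved confounding.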