Exemplo n.º 1
0
	def test__chi_square__not_providing_data_fails(self):
		"""
		chi_square must raise a ValueError when either argument is None.
		"""
		temperature = self.samples.temperature
		# None passed as the second argument
		self.assertRaises(ValueError, statx.chi_square, temperature, None)
		# None passed as the first argument
		self.assertRaises(ValueError, statx.chi_square, None, temperature)
Exemplo n.º 2
0
	def test__chi_square__computation_one_bin_not_present(self):
		"""
		Check the p-value when category 'F' is absent from one of the two samples.
		"""
		# Full samples: six categories 'A'..'F' with differing counts
		a = ['A'] * 16 + ['B'] * 18 + ['C'] * 16 + ['D'] * 14 + ['E'] * 12 + ['F'] * 12
		b = ['A'] * 16 + ['B'] * 16 + ['C'] * 16 + ['D'] * 16 + ['E'] * 16 + ['F'] * 8
		# Cutting off the tail removes every 'F' entry from the respective sample
		a_without_f = a[:-12]
		b_without_f = b[:-8]
		# Both variants are expected to give the same p-value (first result element)
		expected_p_value = 0.94879980715092971
		p_value_a_shortened = statx.chi_square(a_without_f, b)[0]
		self.assertAlmostEqual(expected_p_value, p_value_a_shortened)
		p_value_b_shortened = statx.chi_square(a, b_without_f)[0]
		self.assertAlmostEqual(expected_p_value, p_value_b_shortened)
Exemplo n.º 3
0
	def test__chi_square__computation_symmetric(self):
		"""
		Check that chi_square gives (almost) the same result when its two
		arguments are swapped, both for the full samples and when category 'F'
		is missing from one of them.
		"""
		# Create test data:
		a = ['A'] * 16 + ['B'] * 18 + ['C'] * 16 + ['D'] * 14 + ['E'] * 12 + ['F'] * 12
		b = ['A'] * 16 + ['B'] * 16 + ['C'] * 16 + ['D'] * 16 + ['E'] * 16 + ['F'] * 8
		# Full samples, arguments swapped. The results are compared element by
		# element: assertAlmostEqual on the whole result tuples only passes when
		# they are exactly equal and otherwise raises TypeError (tuples cannot be
		# subtracted), so no tolerance would ever be applied.
		forward = statx.chi_square(a, b)
		backward = statx.chi_square(b, a)
		self.assertEqual(len(forward), len(backward))
		for forward_value, backward_value in zip(forward, backward):
			self.assertAlmostEqual(forward_value, backward_value)
		# b without its last 8 entries (all of its 'F' values), arguments swapped
		aa = statx.chi_square(b[0:(-8)], a)
		bb = statx.chi_square(a, b[0:(-8)])
		self.assertAlmostEqual(aa[0], bb[0])  # p-value
		self.assertAlmostEqual(aa[1], bb[1])  # chi-square value
Exemplo n.º 4
0
    def run_goodness_of_fit_test(self, observed_freqs, expected_freqs, alpha=0.01, min_counts=5):
        """ Validates observed and expected counts and runs the chi-square goodness of fit test on them.

        Only categories whose observed frequency is strictly greater than ``min_counts`` take part
        in the test; the expected frequencies are restricted to the same categories.

        :param observed_freqs: observed frequencies
        :type  observed_freqs: pd.Series
        :param expected_freqs: expected frequencies
        :type  expected_freqs: pd.Series
        :param alpha: significance level
        :type  alpha: float
        :param min_counts: observed frequencies must exceed this value to be included in the test
        :type  min_counts: int
        :return split_is_unbiased: True if p_value >= alpha, False otherwise
                p_value: second element of the result of statx.chi_square
        :rtype: bool, float
        :raises ValueError: if an input is not a Series, if an input is empty, or if fewer than two
                            categories remain or observed and expected categories do not match up
        """
        both_are_series = isinstance(observed_freqs, pd.Series) and isinstance(expected_freqs, pd.Series)
        if not both_are_series:
            raise ValueError("Observed and expected frequencies should be of type Series.")
        if observed_freqs.empty or expected_freqs.empty:
            raise ValueError("Variant split check was cancelled since expected or observed frequencies are empty.")

        # Keep the sufficiently populated categories and the expected values with the same labels.
        kept_observed = observed_freqs[observed_freqs > min_counts]
        kept_expected = expected_freqs.filter(kept_observed.keys())

        # The test needs at least two categories, each with both an observed and an expected value.
        if len(kept_observed) != len(kept_expected) or len(kept_observed) < 2:
            raise ValueError("Variant split check was cancelled since observed or expected frequencies "
                             "are less than 2.")

        # Sorting both series by label lines the categories up position by position.
        _, p_value = statx.chi_square(kept_observed.sort_index(), kept_expected.sort_index())
        return p_value >= alpha, p_value
Exemplo n.º 5
0
	def test__chi_square__computation_same_data(self):
		"""
		Comparing a sample with itself must give a p-value of exactly 1.0.
		"""
		result = statx.chi_square(self.samples.temperature, self.samples.temperature)
		# The p-value is the first element of the result
		p_value = result[0]
		self.assertEqual(1.0, p_value)
Exemplo n.º 6
0
	def test__chi_square__computation_different_data(self):
		"""
		Check the p-value for two samples with different category counts against
		a fixed reference value (test data said to come from the pandas manual page).
		"""
		# (label, count) pairs describing each sample
		counts_a = (('A', 16), ('B', 18), ('C', 16), ('D', 14), ('E', 12), ('F', 12))
		counts_b = (('A', 16), ('B', 16), ('C', 16), ('D', 16), ('E', 16), ('F', 8))
		a = [label for label, count in counts_a for _ in range(count)]
		b = [label for label, count in counts_b for _ in range(count)]
		# The p-value is the first element of the result
		p_value = statx.chi_square(a, b)[0]
		self.assertAlmostEqual(0.89852623940266074, p_value)
Exemplo n.º 7
0
	def test__chi_square__computation_different_data_as_in_statistics_book(self):
		"""
		Check the chi-square value and the p-value against the example from the
		statistics book Fahrmeir et al. (2007) pp. 463.
		"""
		# (label, count) pairs describing each sample
		counts_a = (('nein', 139), ('gut', 348), ('mittel', 213))
		counts_b = (('nein', 135), ('gut', 46), ('mittel', 119))
		a = [label for label, count in counts_a for _ in range(count)]
		b = [label for label, count in counts_b for _ in range(count)]
		# The result has three elements; the third one is not checked here
		p_value, chi_square_value, _ = statx.chi_square(a, b)
		self.assertAlmostEqual(116.851, chi_square_value, delta=0.001)
		self.assertAlmostEqual(0.0, p_value, delta=1e-11)
Exemplo n.º 8
0
	def test__chi_square__computation_different_data_as_in_open_statistics_book(self):
		"""
		Check the chi-square value and the p-value against the example from the
		open statistics book, 3rd ed., pp. 299
		(https://www.openintro.org/stat/textbook.php).
		"""
		# Number of entries per category in each sample
		sample_a_counts = {'cu': 3511, 't1': 1749, 't2': 1818}
		sample_b_counts = {'cu': 1489, 't1': 751, 't2': 682}
		a = []
		b = []
		for label in ('cu', 't1', 't2'):
			a.extend([label] * sample_a_counts[label])
			b.extend([label] * sample_b_counts[label])
		# The result has three elements; the third one is not checked here
		p_value, chi_square_value, _ = statx.chi_square(a, b)
		self.assertAlmostEqual(6.120, chi_square_value, delta=0.001)
		self.assertAlmostEqual(0.0469, p_value, delta=0.0001)
Exemplo n.º 9
0
    def run_goodness_of_fit_test(self,
                                 observed_freqs,
                                 expected_freqs,
                                 alpha=0.01,
                                 min_counts=5):
        """ Runs the chi-square goodness of fit test after checking the observed and expected counts.

        Categories are used only if their observed frequency is strictly greater than ``min_counts``;
        expected frequencies are then narrowed down to those categories.

        :param observed_freqs: observed frequencies
        :type  observed_freqs: pd.Series
        :param expected_freqs: expected frequencies
        :type  expected_freqs: pd.Series
        :param alpha: significance level
        :type  alpha: float
        :param min_counts: threshold an observed frequency has to exceed to be part of the test
        :type  min_counts: int
        :return split_is_unbiased: False if p_value is below alpha, True otherwise
                p_value: second element of the result of statx.chi_square
        :rtype: bool, float
        :raises ValueError: for non-Series or empty inputs, and when fewer than two categories are
                            left or the observed and expected categories differ in number
        """
        # Type check first, observed before expected.
        for freqs in (observed_freqs, expected_freqs):
            if not isinstance(freqs, pd.Series):
                raise ValueError(
                    "Observed and expected frequencies should be of type Series.")
        for freqs in (observed_freqs, expected_freqs):
            if freqs.empty:
                raise ValueError(
                    "Variant split check was cancelled since expected or observed frequencies are empty."
                )

        # Select categories with enough observations, then the expected values carrying the same labels.
        above_threshold = observed_freqs > min_counts
        observed_kept = observed_freqs[above_threshold]
        expected_kept = expected_freqs.filter(observed_kept.keys())

        category_count = len(observed_kept)
        if category_count < 2 or category_count != len(expected_kept):
            raise ValueError(
                "Variant split check was cancelled since observed or expected frequencies "
                "are less than 2.")

        # Both series are ordered by label before being handed to the test.
        observed_sorted = observed_kept.sort_index()
        expected_sorted = expected_kept.sort_index()
        _, p_value = statx.chi_square(observed_sorted, expected_sorted)
        split_is_unbiased = p_value >= alpha
        return split_is_unbiased, p_value
Exemplo n.º 10
0
	def do_delta_categorical(df):
		"""
		Run the chi-square check for the column at position 2 of ``df``.

		The column is compared against ``baseline_metric``, a name taken from the
		surrounding scope. The first element of the chi_square result is used as
		the p-value and is reported together with the column's sample size.

		:param df: data frame whose third column holds the values to check
		:return: result of ``feature_check_to_dataframe`` for the column name,
		         the sample size and the p-value
		"""
		p_value = statx.chi_square(x=df.iloc[:, 2], y=baseline_metric)[0]
		variant_sample_size = statx.sample_size(df.iloc[:, 2])
		summary = {
			'metric': df.columns[2],
			'samplesize_variant': variant_sample_size,
			'pval': p_value,
		}
		return feature_check_to_dataframe(**summary)
Exemplo n.º 11
0
    def chi_square_test_result_and_statistics(self,
                                              variant_column,
                                              weights,
                                              min_counts=5,
                                              alpha=0.05):
        """ Tests the consistency of variant split with the hypothesized distribution.

        Missing values are removed from the variant column, the remaining values are counted per
        variant, and the counts are compared with the expected counts (weight * total count).

        :param variant_column: variant column from the input data frame
        :param weights: dict with variant names as keys, weights as values
                        ({<variant_name>:<weight>, ...}); weights of variants that were not
                        observed are ignored
        :param min_counts: minimum observed frequency required for every variant (should be at least 5), see
                            http://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.stats.chisquare.html
        :param alpha: significance level, 0.05 by default
        :return: True (p-value >= alpha, split is consistent with the given split) or
                 False (split is not consistent with the given split),
                 followed by the p-value and the chi-square value
        :rtype:  Boolean, float, float
        :raises ValueError: if an input has no length, if there are fewer than 2 weights or
                            observations, if any variant was observed fewer than min_counts times,
                            if fewer than 2 variants were observed, or if an observed variant
                            has no weight
        """
        if not hasattr(variant_column, '__len__'):
            raise ValueError(
                "Variant split check was cancelled since input variant column is empty or doesn't exist."
            )
        if not hasattr(weights, '__len__'):
            raise ValueError(
                "Variant split check was cancelled since input weights are empty or doesn't exist."
            )
        if len(weights) <= 1 or len(variant_column) <= 1:
            raise ValueError(
                "Variant split check was cancelled since input weights or the number if categories "
                "is less than 2.")

        # Count number of observations per each variant (missing values are not counted)
        variant_column = pd.Series(variant_column).dropna(axis=0)
        observed_freqs = variant_column.value_counts()

        # It's recommended to not conduct the test if the frequency of any category is less than
        # min_counts, so the check is refused as soon as one category is too small.
        if (observed_freqs < min_counts).any():
            raise ValueError(
                "Chi-square test is not valid for small expected or observed frequencies."
            )

        # With fewer than 2 observed categories the test can't be conducted.
        if len(observed_freqs) < 2:
            raise ValueError(
                "If the number of categories is less than 2 Chi-square test is not applicable."
            )

        # Calculate expected counts given corresponding weights;
        # weights of categories that were not observed are left out.
        total_count = observed_freqs.sum()
        observed_variants = observed_freqs.index.values
        weights = {
            k: v
            for (k, v) in weights.items() if k in observed_variants
        }
        # Every observed variant needs a weight, otherwise observed and expected frequencies
        # would have different lengths and could not be compared category by category.
        if len(weights) != len(observed_freqs):
            raise ValueError(
                "Variant split check was cancelled since some observed variants have no corresponding weight."
            )
        expected_freqs = pd.Series(weights) * total_count

        # Compute chi-square and p-value statistics (both series ordered by variant name)
        chi_square_val, p_val = statx.chi_square(observed_freqs.sort_index(),
                                                 expected_freqs.sort_index())

        return p_val >= alpha, p_val, chi_square_val