def chi_square_test(input1, input2):
    """Print chi-square goodness-of-fit results for two samples.

    Every entry of each sample is compared against an expected frequency of
    0.5. The two ``stats.chisquare`` results are printed between two
    ``---`` separator lines. Nothing is returned.

    Args:
        input1: First iterable of observed frequencies.
        input2: Second iterable of observed frequencies.
    """
    # Build both expectation lists up front, before anything is printed.
    samples = [
        (observed, [0.5 for _ in observed]) for observed in (input1, input2)
    ]

    print('---')
    for observed, expected in samples:
        print(stats.chisquare(observed, expected))
    print('---')
def describe_date_1d(series: pd.Series, series_description: dict) -> dict:
    """Summarise a series of dates.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.
            The key ``distinct_count_with_nan`` is read from it.

    Returns:
        A dict containing calculated series description values: ``min``,
        ``max``, ``histogram_data``, ``histogram_bins``, ``range`` and, when
        the configured chi-squared threshold is positive, ``chi_squared``.
    """
    earliest = series.min()
    latest = series.max()
    stats = {"min": earliest, "max": latest, "histogram_data": series}

    # The bin count is capped at the number of distinct values.
    configured_bins = config["plot"]["histogram"]["bins"].get(int)
    stats["histogram_bins"] = min(
        series_description["distinct_count_with_nan"], configured_bins
    )
    stats["range"] = latest - earliest

    threshold = config["vars"]["num"]["chi_squared_threshold"].get(float)
    if threshold > 0.0:
        # Non-missing dates are cast to int64 so they can be binned
        # numerically (presumably epoch nanoseconds -- confirm).
        as_integers = series[series.notna()].astype("int64").values
        counts, _ = np.histogram(as_integers, bins="auto")
        stats["chi_squared"] = chisquare(counts)

    return stats
def describe_categorical_1d(series: pd.Series, series_description: dict) -> dict:
    """Summarise a categorical series.

    Which optional summaries are computed (first rows, chi-squared, length,
    unicode characters, words, date warning) is driven by the ``config``
    entries read below.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.
            The key ``value_counts_without_nan`` is read from it.

    Returns:
        A dict containing calculated series description values.
    """
    # Everything below operates on string values (Issue #100)
    series = series.astype(str)

    # Position 0 is indexed, so at least one non-missing value is required
    counts = series_description["value_counts_without_nan"]
    stats = {"top": counts.index[0], "freq": counts.iloc[0]}

    if not config["vars"]["cat"]["redact"].get(float):
        stats["first_rows"] = series.head(5)

    frequency_histogram = histogram_compute(
        counts, len(counts), name="histogram_frequencies"
    )
    stats.update(frequency_histogram)

    if config["vars"]["num"]["chi_squared_threshold"].get(float) > 0.0:
        stats["chi_squared"] = list(chisquare(counts.values))

    if config["vars"]["cat"]["length"].get(bool):
        stats.update(length_summary(series))
        lengths = stats["length"]
        stats.update(
            histogram_compute(lengths, lengths.nunique(), name="histogram_length")
        )

    if config["vars"]["cat"]["characters"].get(bool):
        stats.update(unicode_summary(series))
        # Keep the value reported by unicode_summary under a new key, then
        # overwrite "n_characters" with the sum of the character counts.
        stats["n_characters_distinct"] = stats["n_characters"]
        stats["n_characters"] = stats["character_counts"].values.sum()

        alias_counts = stats["category_alias_counts"]
        alias_counts.index = alias_counts.index.str.replace("_", " ")

    if config["vars"]["cat"]["words"]:
        stats.update(word_summary(series))

    if config["vars"]["cat"]["coerce_str_to_date"].get(bool):
        stats["date_warning"] = warning_type_date(series)

    return stats
def chi_square(dictionary, matrix, neg_num=0, pos_num=0):
    """
    Run a chi-square test for every feature and collect the p-values.

    a, b, c, d are the observed counts and A, B, C, D the expected counts.
    The training corpus is unbalanced with a 3:7 ratio, therefore
    A = (a + b) * .7, B = (a + b) * .3, and so on.

                     positive   negative
        contains x1     a          b
        lacks x1        c          d

    Rows of ``matrix`` with index below ``neg_num`` are counted as negative
    texts, all other rows as positive texts. A constant 0.01 is added to
    both c and d. The observed and expected tables of all features are
    printed before returning.

    The resulting values are usually used for feature selection.

    :param dictionary: the dictionary (one entry per feature)
    :param matrix: frequency matrix of the texts, indexed as matrix[text][feature]
    :param neg_num: number of negative texts
    :param pos_num: number of positive texts
    :return: a list with one entry per feature holding the p-value of its
        chi-square test; the smaller the p-value, the more discriminative
        the feature, so such features should be selected
    """
    p_values = []
    observed_tables = []
    expected_tables = []

    for feature in range(len(dictionary)):
        in_positive = 0
        in_negative = 0
        for text in range(len(matrix)):
            if not matrix[text][feature] > 0:
                continue
            if text < neg_num:
                in_negative += 1
            else:
                in_positive += 1

        out_positive = pos_num - in_positive + 0.01
        out_negative = neg_num - in_negative + 0.01

        containing = in_positive + in_negative
        lacking = out_positive + out_negative

        observed = [in_positive, in_negative, out_positive, out_negative]
        expected = [
            containing * 0.7,
            containing * 0.3,
            lacking * 0.7,
            lacking * 0.3,
        ]
        observed_tables.append(observed)
        expected_tables.append(expected)

        # Index 1 of the chisquare result is the p-value.
        p_values.append(stats.chisquare(observed, f_exp=expected)[1])

    print(observed_tables)
    print(expected_tables)
    return p_values
def describe_date_1d(series: pd.Series, series_description: dict) -> dict:
    """Summarise a series of dates.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.
            The key ``n_distinct`` is read from it.

    Returns:
        A dict containing calculated series description values: ``min``,
        ``max``, ``range``, optionally ``chi_squared``, plus whatever
        ``histogram_compute`` contributes.
    """
    earliest = pd.Timestamp.to_pydatetime(series.min())
    latest = pd.Timestamp.to_pydatetime(series.max())
    stats = {"min": earliest, "max": latest, "range": latest - earliest}

    # Non-missing values as int64, integer-divided by 10**9
    # (presumably nanoseconds -> seconds; confirm against the caller).
    present = series[series.notnull()]
    values = present.values.astype(np.int64) // 10**9

    threshold = config["vars"]["num"]["chi_squared_threshold"].get(float)
    if threshold > 0.0:
        counts = np.histogram(values, bins="auto")[0]
        stats["chi_squared"] = chisquare(counts)

    histogram_stats = histogram_compute(values, series_description["n_distinct"])
    stats.update(histogram_stats)
    return stats
def describe_categorical_1d(series: pd.Series, series_description: dict) -> dict:
    """Summarise a categorical series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.
            The key ``value_counts_without_nan`` is read from it.

    Returns:
        A dict containing calculated series description values.
    """
    # Everything below operates on string values (Issue #100)
    series = series.astype(str)

    # Position 0 is indexed, so at least one non-missing value is required
    counts = series_description["value_counts_without_nan"]
    stats = {"top": counts.index[0], "freq": counts.iloc[0]}

    if config["vars"]["num"]["chi_squared_threshold"].get(float) > 0.0:
        stats["chi_squared"] = list(chisquare(counts.values))

    if config["vars"]["cat"]["check_composition"].get(bool):
        # Imported lazily, only when the composition check is enabled.
        from visions.application.summaries.series.text_summary import text_summary

        stats.update(text_summary(series))
        stats["length"] = series.str.len()

    stats["date_warning"] = warning_type_date(series)
    return stats
def find_not_link_loci(self, N, E, sigValue):
    '''
    Determines and returns a list of pairs of loci that are not in
    linkage equilibrium.

    Every pair of loci (k, s) with k < s is tested. For each locus only
    the entries whose expected count is non-zero are kept; the observed
    and expected counts of the two loci are then multiplied position by
    position and passed to a chi-square test.

    N        -- mapping locus name -> {(i, j): observed count}
    E        -- mapping locus name -> {(i, j): expected count}
    sigValue -- significance level; a pair is reported when p < sigValue

    Reads self.m, self.loci (locus names are taken from every second
    position) and self.alinlocus (its length per locus enters the ddof).

    Returns a list of (name_k, name_s) tuples.
    '''
    def nonzero_expected_counts(name):
        # Observed/expected counts of one locus, skipping zero expectations.
        obs, exp = [], []
        for ival, jval in N[name]:
            expected = E[name][(ival, jval)]
            if expected != 0:
                exp.append(expected)
                obs.append(N[name][(ival, jval)])
        return obs, exp

    n_loci = self.m // 2
    not_link_loci = []

    for k in range(n_loci - 1):
        namek = self.loci[k * 2]
        ijk_obs, ijk_exp = nonzero_expected_counts(namek)

        for s in range(k + 1, n_loci):
            names = self.loci[s * 2]
            rts_obs, rts_exp = nonzero_expected_counts(names)

            LK_ijk = len(self.alinlocus[namek])
            LK_rts = len(self.alinlocus[names])
            ddof_ijk = 0.5 * LK_ijk * (LK_ijk - 1)
            ddof_rts = 0.5 * LK_rts * (LK_rts - 1)
            ddof = (ddof_ijk - 1) * (ddof_rts - 1)

            # Multiply the counts position by position over the common length.
            paired = min(len(ijk_obs), len(rts_obs))
            obs = [ijk_obs[t] * rts_obs[t] for t in range(paired)]
            exp = [ijk_exp[t] * rts_exp[t] for t in range(paired)]

            # NOTE(review): exactly ONE unpaired entry of the longer list is
            # carried over, as in the original code. If every leftover entry
            # was meant to be included this should be a loop -- confirm.
            if paired < len(ijk_obs):
                obs.append(ijk_obs[paired])
                exp.append(ijk_exp[paired])
            elif paired < len(rts_obs):
                obs.append(rts_obs[paired])
                exp.append(rts_exp[paired])

            _, p = chisquare(obs, exp, ddof)
            if p < sigValue:
                not_link_loci.append((namek, names))

    return not_link_loci
def optimiseP(self,loP=-0.005,hiP=0.005,Pstep=0.000005):
    """Score a range of trial period values with a chi-square statistic.

    For every trial value in ``numpy.arange(loP, hiP, Pstep)`` the delays
    from ``self.getPdelays`` are applied to the first ``self.nints`` rows of
    ``self.timePhase`` via ``rollArray``. The shifted rows are summed along
    axis 0 and the chi-square statistic of that sum is recorded.

    Args:
        loP: Lower bound of the trial range (inclusive).
        hiP: Upper bound of the trial range (exclusive).
        Pstep: Step between trial values; converted to float.

    Returns:
        Array with one chi-square statistic per trial value.
    """
    trial_values = numpy.arange(loP, hiP, float(Pstep))
    curve = numpy.empty_like(trial_values)

    for idx, trial in enumerate(trial_values):
        shifted = numpy.empty_like(self.timePhase)
        shifts = self.getPdelays(trial)
        for row in range(self.nints):
            shifted[row] = rollArray(self.timePhase[row], shifts[row], 0)
        summed = shifted.sum(axis=0)
        # Index 0 of the chisquare result is the test statistic.
        curve[idx] = chisquare(summed)[0]

    return curve
def optimiseDM(self,hidm=50,lodm=-50,dmstep=1):
    """Score a range of trial DM values with a chi-square statistic.

    For every trial value in ``numpy.arange(lodm, hidm, dmstep)`` the delays
    from ``self.getDMdelays`` are applied to the first ``self.nbands`` rows
    of ``self.freqPhase`` via ``rollArray``. The shifted rows are summed
    along axis 0 and the chi-square statistic of that sum is recorded.

    Args:
        hidm: Upper bound of the trial range (exclusive).
        lodm: Lower bound of the trial range (inclusive).
        dmstep: Step between trial values; converted to float.

    Returns:
        Array with one chi-square statistic per trial value.
    """
    trial_values = numpy.arange(lodm, hidm, float(dmstep))
    curve = numpy.empty_like(trial_values)

    for idx, trial in enumerate(trial_values):
        shifted = numpy.empty_like(self.freqPhase)
        shifts = self.getDMdelays(trial)
        for band in range(self.nbands):
            shifted[band] = rollArray(self.freqPhase[band], shifts[band], 0)
        summed = shifted.sum(axis=0)
        # Index 0 of the chisquare result is the test statistic.
        curve[idx] = chisquare(summed)[0]

    return curve
def describe_categorical_1d(series: pd.Series, series_description: dict) -> dict:
    """Summarise a categorical series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.
            The key ``value_counts_without_nan`` is read from it.

    Returns:
        A dict containing calculated series description values.
    """
    # Everything below operates on string values (Issue #100)
    series = series.astype(str)

    # Position 0 is indexed, so at least one non-missing value is required
    counts = series_description["value_counts_without_nan"]
    stats = {"top": counts.index[0], "freq": counts.iloc[0]}

    if config["vars"]["num"]["chi_squared_threshold"].get(float) > 0.0:
        stats["chi_squared"] = list(chisquare(counts.values))

    if config["vars"]["cat"]["check_composition"].get(bool):
        text = series.str
        # Whether ANY value contains a character of the given class.
        composition = {
            "chars": text.contains(r"[a-zA-Z]", case=False, regex=True).any(),
            "digits": text.contains(r"[0-9]", case=False, regex=True).any(),
            "spaces": text.contains(r"\s", case=False, regex=True).any(),
            "non-words": text.contains(r"\W", case=False, regex=True).any(),
        }

        lengths = series.str.len()
        stats["length"] = lengths
        stats["max_length"] = lengths.max()
        stats["mean_length"] = lengths.mean()
        stats["min_length"] = lengths.min()
        stats["composition"] = composition

    stats["date_warning"] = warning_type_date(series)
    return stats
def find_not_hwe_loci(self, N, E, sigValue):
    '''
    Determines and returns the list of loci that are not in HWE.

    For each locus the observed (N) and expected (E) counts are compared
    with a chi-square test, using only entries whose expected count is
    non-zero. A locus is reported when its p-value is below sigValue.

    Reads self.m, self.loci (locus names are taken from every second
    position) and self.alinlocus (its length per locus enters the ddof).
    '''
    flagged = []
    for locus_index in range(self.m // 2):
        locus = self.loci[locus_index * 2]
        observed_counts = N[locus]
        expected_counts = E[locus]

        observed = []
        expected = []
        for ival, jval in observed_counts:
            pair = (ival, jval)
            if expected_counts[pair] == 0:
                continue
            expected.append(expected_counts[pair])
            observed.append(observed_counts[pair])

        n_entries = len(self.alinlocus[locus])
        ddof = 0.5 * n_entries * (n_entries - 1)

        result = chisquare(observed, expected, ddof)
        _, p_value = result
        if p_value < sigValue:
            flagged.append(locus)

    return flagged
def describe_numeric_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a numeric series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.
            The keys ``count``, ``n`` and ``n_distinct`` are read from it.

    Returns:
        A dict containing calculated series description values.

    Notes:
        When 'bins_type' is set to 'bayesian_blocks', astropy.stats.bayesian_blocks is used to determine the number of
        bins. Read the docs:
        https://docs.astropy.org/en/stable/visualization/histogram.html
        https://docs.astropy.org/en/stable/api/astropy.stats.bayesian_blocks.html

        This method might print warnings, which we suppress.
        https://github.com/astropy/astropy/issues/4927

        NOTE(review): this function does not call bayesian_blocks itself;
        confirm whether the note above still applies (e.g. through
        histogram_compute).
    """

    def mad(arr):
        """Median Absolute Deviation: a "Robust" version of standard deviation.
        Indices variability of the sample.
        https://en.wikipedia.org/wiki/Median_absolute_deviation
        """
        return np.median(np.abs(arr - np.median(arr)))

    quantiles = config["vars"]["num"]["quantiles"].get(list)

    n_infinite = ((series == np.inf) | (series == -np.inf)).sum()

    if isinstance(series.dtype, _IntegerDtype):
        stats = numeric_stats_pandas(series)
        # Cast the non-missing values to the lower-cased dtype name
        # (presumably nullable "Int64" -> numpy "int64"; confirm).
        present_values = series.loc[series.notnull()].astype(
            str(series.dtype).lower())
        stats["n_zeros"] = series_description["count"] - np.count_nonzero(
            present_values)
        stats["histogram_data"] = present_values
        finite_values = present_values
    else:
        values = series.values
        present_values = values[~np.isnan(values)]
        finite_values = values[np.isfinite(values)]
        stats = numeric_stats_numpy(present_values)
        stats["histogram_data"] = finite_values

    stats.update({
        "mad": mad(present_values),
        "scatter_data": series,  # For complex
        "p_infinite": n_infinite / series_description["n"],
        "n_infinite": n_infinite,
    })

    chi_squared_threshold = config["vars"]["num"][
        "chi_squared_threshold"].get(float)
    if chi_squared_threshold > 0.0:
        histogram, _ = np.histogram(finite_values, bins="auto")
        stats["chi_squared"] = chisquare(histogram)

    stats["range"] = stats["max"] - stats["min"]
    # Quantiles are stored under percentage keys such as "25%" and "75%".
    stats.update({
        f"{percentile:.0%}": value
        for percentile, value in series.quantile(quantiles).to_dict().items()
    })
    stats["iqr"] = stats["75%"] - stats["25%"]
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.nan
    stats["p_zeros"] = stats["n_zeros"] / series_description["n"]

    stats["monotonic_increase"] = series.is_monotonic_increasing
    stats["monotonic_decrease"] = series.is_monotonic_decreasing

    stats["monotonic_increase_strict"] = (stats["monotonic_increase"]
                                          and series.is_unique)
    stats["monotonic_decrease_strict"] = (stats["monotonic_decrease"]
                                          and series.is_unique)

    stats.update(
        histogram_compute(finite_values, series_description["n_distinct"]))

    return stats
def chi_squared_test(actual, expected):
    """Run a chi-square test of observed against expected frequencies.

    Args:
        actual: Observed frequencies.
        expected: Expected frequencies.

    Returns:
        A ``(statistic, p_value)`` tuple as computed by ``stats.chisquare``.
    """
    result = stats.chisquare(actual, expected)
    statistic, p_value = result
    return (statistic, p_value)
def chi_square(values=None, histogram=None):
    """Return the chi-square test result of a histogram as a dict.

    Args:
        values: Raw values; binned with ``np.histogram(..., bins="auto")``
            when no ``histogram`` is supplied.
        histogram: Precomputed bin counts; takes precedence over ``values``.

    Returns:
        The fields of the ``chisquare`` result converted to a plain dict.
    """
    counts = histogram
    if counts is None:
        counts = np.histogram(values, bins="auto")[0]
    outcome = chisquare(counts)
    return dict(outcome._asdict())
def def_chisquare(f_obs1, f_exp1=None):
    """Run ``chisquare`` over the input with ``axis=None``.

    Args:
        f_obs1: Observed frequencies.
        f_exp1: Expected frequencies, or None to use chisquare's default.

    Returns:
        The result object returned by ``chisquare``.
    """
    return chisquare(f_obs=f_obs1, f_exp=f_exp1, axis=None)
def chi_square(
    values: Optional[np.ndarray] = None, histogram: Optional[np.ndarray] = None
) -> dict:
    """Return the chi-square test result of a histogram as a dict.

    Args:
        values: Raw values; binned with ``np.histogram(..., bins="auto")``
            when no ``histogram`` is supplied.
        histogram: Precomputed bin counts; takes precedence over ``values``.

    Returns:
        The fields of the ``chisquare`` result converted to a plain dict.
    """
    counts = histogram
    if counts is None:
        counts = np.histogram(values, bins="auto")[0]
    outcome = chisquare(counts)
    return dict(outcome._asdict())
def describe_numeric_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a numeric series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.
            The keys ``count``, ``n`` and ``distinct_count_with_nan`` are
            read from it.

    Returns:
        A dict containing calculated series description values.

    Notes:
        When 'bins_type' is set to 'bayesian_blocks', astropy.stats.bayesian_blocks is used to determine the number of
        bins. Read the docs:
        https://docs.astropy.org/en/stable/visualization/histogram.html
        https://docs.astropy.org/en/stable/api/astropy.stats.bayesian_blocks.html

        This method might print warnings, which we suppress.
        https://github.com/astropy/astropy/issues/4927
    """

    def mad(arr):
        """ Median Absolute Deviation: a "Robust" version of standard deviation.
            Indices variability of the sample.
            https://en.wikipedia.org/wiki/Median_absolute_deviation
        """
        return np.median(np.abs(arr - np.median(arr)))

    quantiles = config["vars"]["num"]["quantiles"].get(list)

    n_infinite = ((series == np.inf) | (series == -np.inf)).sum()

    values = series.values
    present_values = values[~np.isnan(values)]
    finite_values = values[np.isfinite(values)]

    stats = {
        "mean": np.mean(present_values),
        "std": np.std(present_values, ddof=1),
        "variance": np.var(present_values, ddof=1),
        "min": np.min(present_values),
        "max": np.max(present_values),
        # Unbiased kurtosis obtained using Fisher's definition (kurtosis of normal == 0.0). Normalized by N-1.
        "kurtosis": series.kurt(),
        # Unbiased skew normalized by N-1
        "skewness": series.skew(),
        "sum": np.sum(present_values),
        "mad": mad(present_values),
        "n_zeros": (series_description["count"] -
                    np.count_nonzero(present_values)),
        "histogram_data": finite_values,
        "scatter_data": series,  # For complex
        "p_infinite": n_infinite / series_description["n"],
        "n_infinite": n_infinite,
    }

    chi_squared_threshold = config["vars"]["num"][
        "chi_squared_threshold"].get(float)
    if chi_squared_threshold > 0.0:
        histogram, _ = np.histogram(finite_values, bins="auto")
        stats["chi_squared"] = chisquare(histogram)

    stats["range"] = stats["max"] - stats["min"]
    # Quantiles are stored under percentage keys such as "25%" and "75%".
    stats.update({
        f"{percentile:.0%}": value
        for percentile, value in series.quantile(quantiles).to_dict().items()
    })
    stats["iqr"] = stats["75%"] - stats["25%"]
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.nan
    stats["p_zeros"] = stats["n_zeros"] / series_description["n"]

    bins = config["plot"]["histogram"]["bins"].get(int)
    # Bins should never be larger than the number of distinct values
    bins = min(series_description["distinct_count_with_nan"], bins)
    stats["histogram_bins"] = bins

    bayesian_blocks_bins = config["plot"]["histogram"][
        "bayesian_blocks_bins"].get(bool)
    if bayesian_blocks_bins:
        # Imported lazily, only when bayesian-blocks binning is enabled.
        from astropy.stats import bayesian_blocks

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            ret = bayesian_blocks(stats["histogram_data"])

            # Sanity check
            if not np.isnan(ret).any() and ret.size > 1:
                stats["histogram_bins_bayesian_blocks"] = ret

    return stats
def describe_numeric_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a numeric series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.
            The key ``distinct_count_with_nan`` is read from it.

    Returns:
        A dict containing calculated series description values.

    Notes:
        When 'bins_type' is set to 'bayesian_blocks', astropy.stats.bayesian_blocks is used to determine the number of
        bins. Read the docs:
        https://docs.astropy.org/en/stable/visualization/histogram.html
        https://docs.astropy.org/en/stable/api/astropy.stats.bayesian_blocks.html

        This method might print warnings, which we suppress.
        https://github.com/astropy/astropy/issues/4927
    """
    quantiles = config["vars"]["num"]["quantiles"].get(list)

    stats = {
        "mean": series.mean(),
        "std": series.std(),
        "variance": series.var(),
        "min": series.min(),
        "max": series.max(),
        "kurtosis": series.kurt(),
        "skewness": series.skew(),
        "sum": series.sum(),
        # Mean absolute deviation. Series.mad() was deprecated in pandas 1.5
        # and removed in 2.0; this is the equivalent given in its docs.
        "mad": (series - series.mean()).abs().mean(),
        "n_zeros": (len(series) - np.count_nonzero(series)),
        "histogram_data": series,
        "scatter_data": series,  # For complex
    }

    chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(float)
    if chi_squared_threshold > 0.0:
        histogram = np.histogram(series[series.notna()].values, bins="auto")[0]
        stats["chi_squared"] = chisquare(histogram)

    stats["range"] = stats["max"] - stats["min"]
    # Quantiles are stored under percentage keys such as "25%" and "75%".
    stats.update(
        {
            f"{percentile:.0%}": value
            for percentile, value in series.quantile(quantiles).to_dict().items()
        }
    )
    stats["iqr"] = stats["75%"] - stats["25%"]
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.nan
    stats["p_zeros"] = float(stats["n_zeros"]) / len(series)

    bins = config["plot"]["histogram"]["bins"].get(int)
    # Bins should never be larger than the number of distinct values
    bins = min(series_description["distinct_count_with_nan"], bins)
    stats["histogram_bins"] = bins

    bayesian_blocks_bins = config["plot"]["histogram"]["bayesian_blocks_bins"].get(bool)
    if bayesian_blocks_bins:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            ret = bayesian_blocks(stats["histogram_data"])

            # Sanity check
            if not np.isnan(ret).any() and ret.size > 1:
                stats["histogram_bins_bayesian_blocks"] = ret

    return stats
significant_comparisons = [] for systemA_num, resultsA in enumerate(results[:-1]): systemA = "%s/%s" % (model_types[systemA_num], model_names[systemA_num]) for systemB_num, resultsB in enumerate(results[systemA_num + 1:], start=systemA_num + 1): systemB = "%s/%s" % (model_types[systemB_num], model_names[systemB_num]) print("Comparing %s to system %s" % (systemA, systemB)) # Output accuracy scores accuracyA = 100. * resultsA[1] / resultsA.sum() accuracyB = 100. * resultsB[1] / resultsB.sum() print(" %s: %.2f%%" % (systemA, accuracyA)) print(" %s: %.2f%%" % (systemB, accuracyB)) # Compute significance with chi-squared test chi2_stat, p = chisquare(resultsA, resultsB) print(" p=%g %s" % (p, "***" if p < 0.01 else "**" if p < 0.05 else "")) if p < 0.05: significant_comparisons.append({ "A": systemA, "B": systemB, "p": p, "A higher": (accuracyA > accuracyB), }) if significant_comparisons: