def summary_df(self, thresholds=None, lower_quantile=None, upper_quantile=None):
    """
    Calculates the pair of metrics for each threshold for each result.

    Args:
        thresholds: thresholds to evaluate; defaults to ``self.thresholds``.
        lower_quantile: lower percentile to report; defaults to
            ``self.config['lower_quantile']``.
        upper_quantile: upper percentile to report; defaults to
            ``self.config['upper_quantile']``.

    Returns:
        ``self.ret`` — a DataFrame indexed by threshold with, per metric,
        the mean, median, and the lower/upper percentile of the scores
        over ``self.results``. Recomputed only when new results arrived.
    """
    if thresholds is None:
        thresholds = self.thresholds
    if lower_quantile is None:
        lower_quantile = self.config['lower_quantile']
    if upper_quantile is None:
        upper_quantile = self.config['upper_quantile']
    if self.n_current_results > self.n_cached_curves:
        # If there are new curves, recompute
        colnames = ['_'.join([metric, stat])
                    for metric in [self.metric1.name, self.metric2.name]
                    for stat in ['Mean', 'Median',
                                 '%d_Percentile' % (100 * lower_quantile),
                                 '%d_Percentile' % (upper_quantile * 100)]]
        self.ret = pd.DataFrame(columns=colnames, index=thresholds, dtype='float64')
        for threshold in thresholds:
            m1s = Series([self.metric1.score(result, threshold) for result in self.results])
            m2s = Series([self.metric2.score(result, threshold) for result in self.results])
            # BUG FIX: previously the percentile cells were hard-coded to
            # quantile(.05)/quantile(.95) regardless of the configured
            # lower_quantile/upper_quantile named in the column headers.
            self.ret.loc[threshold] = (m1s.mean(), m1s.quantile(.5),
                                       m1s.quantile(lower_quantile), m1s.quantile(upper_quantile),
                                       m2s.mean(), m2s.quantile(.5),
                                       m2s.quantile(lower_quantile), m2s.quantile(upper_quantile))
    return self.ret
def get_cutoffs(x, num_groups=10):
    """Get the cutoffs that splits `x` into `num_groups` equally sized groups.

    Returns a list of ``(lower_quantile_value, upper_quantile_value)`` pairs,
    one pair per group.
    """
    series = Series(x)
    # Quantile probabilities at the group edges: 0, 1/n, 2/n, ..., 1.
    edges = [float(k) / num_groups for k in range(num_groups + 1)]
    return [
        (series.quantile(lo), series.quantile(hi))
        for lo, hi in zip(edges[:-1], edges[1:])
    ]
def summary_df(self):
    """Build a one-row DataFrame summarizing ``self.summary``: mean, median,
    and the lower/upper percentiles taken from ``self.config``."""
    lo_q = self.config['lower_quantile']
    hi_q = self.config['upper_quantile']
    vals = Series(self.summary)
    # Column order matters: Mean, Median, lower percentile, upper percentile.
    row = {
        "Mean": vals.mean(),
        "Median": vals.quantile(0.5),
        "%d_Percentile" % (lo_q * 100): vals.quantile(lo_q),
        "%d_Percentile" % (hi_q * 100): vals.quantile(hi_q),
    }
    return pd.DataFrame(row, index=[0])
def test_quantile_empty(self):
    """Quantiles of an empty Series: NaN for numeric dtypes, NaT for datetime."""
    # Empty numeric series: scalar quantile is NaN; list quantile is a
    # NaN-valued Series indexed by the requested quantile.
    for dtype in ['float64', 'int64']:
        ser = Series([], dtype=dtype)

        scalar_res = ser.quantile(0.5)
        self.assertTrue(np.isnan(scalar_res))

        list_res = ser.quantile([0.5])
        tm.assert_series_equal(list_res, Series([np.nan], index=[0.5]))

    # Empty datetime series: scalar quantile is NaT; list quantile holds NaT.
    ser = Series([], dtype='datetime64[ns]')

    scalar_res = ser.quantile(0.5)
    self.assertTrue(scalar_res is pd.NaT)

    list_res = ser.quantile([0.5])
    tm.assert_series_equal(list_res, Series([pd.NaT], index=[0.5]))
def describe_numeric_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a numeric series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.

    Notes:
        When 'bins_type' is set to 'bayesian_blocks', astropy.stats.bayesian_blocks is used to
        determine the number of bins. Read the docs:
        https://docs.astropy.org/en/stable/visualization/histogram.html
        https://docs.astropy.org/en/stable/api/astropy.stats.bayesian_blocks.html
        This method might print warnings, which we suppress.
        https://github.com/astropy/astropy/issues/4927
    """

    def mad(arr):
        """ Median Absolute Deviation: a "Robust" version of standard deviation.
            Indices variability of the sample.
            https://en.wikipedia.org/wiki/Median_absolute_deviation """
        return np.median(np.abs(arr - np.median(arr)))

    quantiles = config["vars"]["num"]["quantiles"].get(list)
    # Count +/- infinity occurrences before they are filtered out below.
    n_infinite = ((series == np.inf) | (series == -np.inf)).sum()

    if isinstance(series.dtype, _IntegerDtype):
        # Nullable pandas integer dtype: keep values as a pandas Series so
        # NA handling is done by pandas rather than numpy.
        stats = numeric_stats_pandas(series)
        present_values = series.loc[series.notnull()].astype(
            str(series.dtype).lower())
        stats["n_zeros"] = series_description["count"] - np.count_nonzero(
            present_values)
        stats["histogram_data"] = present_values
        # Integers cannot hold inf, so all present values are finite.
        finite_values = present_values
    else:
        # Plain numpy-backed floats: drop NaN for "present", drop NaN and
        # +/-inf for "finite" (used by histogram/chi-squared below).
        values = series.values
        present_values = values[~np.isnan(values)]
        finite_values = values[np.isfinite(values)]
        stats = numeric_stats_numpy(present_values)
        stats["histogram_data"] = finite_values

    stats.update({
        "mad": mad(present_values),
        "scatter_data": series,  # For complex
        "p_infinite": n_infinite / series_description["n"],
        "n_infinite": n_infinite,
    })

    # Chi-squared test on the auto-binned histogram; only when enabled.
    chi_squared_threshold = config["vars"]["num"][
        "chi_squared_threshold"].get(float)
    if chi_squared_threshold > 0.0:
        histogram, _ = np.histogram(finite_values, bins="auto")
        stats["chi_squared"] = chisquare(histogram)

    stats["range"] = stats["max"] - stats["min"]
    # Percentile keys like "5%", "25%", ... from the configured quantile list.
    # NOTE(review): "25%" and "75%" must be in `quantiles` or the iqr line
    # below raises KeyError — presumably guaranteed by the default config.
    stats.update({
        f"{percentile:.0%}": value
        for percentile, value in series.quantile(
            quantiles).to_dict().items()
    })
    stats["iqr"] = stats["75%"] - stats["25%"]
    # Coefficient of variation; NaN when the mean is zero (or falsy).
    stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
    stats["p_zeros"] = stats["n_zeros"] / series_description["n"]

    stats["monotonic_increase"] = series.is_monotonic_increasing
    stats["monotonic_decrease"] = series.is_monotonic_decreasing
    # "Strict" monotonicity = monotone and no repeated values.
    stats["monotonic_increase_strict"] = (stats["monotonic_increase"]
                                          and series.is_unique)
    stats["monotonic_decrease_strict"] = (stats["monotonic_decrease"]
                                          and series.is_unique)

    stats.update(
        histogram_compute(finite_values, series_description["n_unique"]))

    return stats
def get(self, data: pd.Series) -> List[float]:
    """Return the interior quantile cutoffs that split *data* into
    ``self.bins`` groups, with duplicated cutoffs removed."""
    # Probabilities 1/bins, 2/bins, ..., (bins-1)/bins.
    probs = (np.arange(self.bins - 1) + 1) / self.bins
    cutoffs = data.quantile(probs)
    return cutoffs.drop_duplicates().tolist()
def describe_numeric_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a numeric series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.

    Notes:
        When 'bins_type' is set to 'bayesian_blocks', astropy.stats.bayesian_blocks is used to
        determine the number of bins. Read the docs:
        https://docs.astropy.org/en/stable/visualization/histogram.html
        https://docs.astropy.org/en/stable/api/astropy.stats.bayesian_blocks.html
        This method might print warnings, which we suppress.
        https://github.com/astropy/astropy/issues/4927
    """
    quantiles = config["vars"]["num"]["quantiles"].get(list)

    # Basic statistics; pandas reductions skip NaN by default.
    stats = {
        "mean": series.mean(),
        "std": series.std(),
        "variance": series.var(),
        "min": series.min(),
        "max": series.max(),
        "kurtosis": series.kurt(),
        "skewness": series.skew(),
        "sum": series.sum(),
        "mad": series.mad(),
        # NOTE(review): np.count_nonzero treats NaN as non-zero, so on a
        # series with missing values this counts NaNs as non-zero — confirm
        # whether that is intended.
        "n_zeros": (len(series) - np.count_nonzero(series)),
        "histogramdata": series,
    }

    stats["range"] = stats["max"] - stats["min"]
    # Percentile keys like "5%", "25%", ... from the configured quantile list;
    # the iqr line below requires "25%" and "75%" to be present.
    stats.update({
        "{:.0%}".format(percentile): value
        for percentile, value in series.quantile(quantiles).to_dict().items()
    })
    stats["iqr"] = stats["75%"] - stats["25%"]
    # Coefficient of variation; NaN when the mean is zero (or falsy).
    stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
    stats["p_zeros"] = float(stats["n_zeros"]) / len(series)

    bins = config["plot"]["histogram"]["bins"].get(int)
    # Bins should never be larger than the number of distinct values
    bins = min(series_description["distinct_count_with_nan"], bins)
    stats["histogram_bins"] = bins

    bayesian_blocks_bins = config["plot"]["histogram"][
        "bayesian_blocks_bins"].get(bool)
    if bayesian_blocks_bins:
        # bayesian_blocks may emit warnings (astropy#4927); suppress them.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            ret = bayesian_blocks(stats["histogramdata"])
            # Sanity check
            if not np.isnan(ret).any() and ret.size > 1:
                stats["histogram_bins_bayesian_blocks"] = ret

    return stats
import pandas as pd
from pandas import Series

# Python 2 script: load inter-arrival times, plot their histogram, and
# overlay an exponential density fitted via the sample mean.
fig = plt.figure()
ax = fig.add_subplot(211)
# One value per line; the single column ends up as data[0].
data = pd.read_csv('time.txt', sep='\n', header=None)
data = Series(data[0])
# Drop outliers above 300 before fitting.
data = data[data < 300]
data = data.sort_values()
#data.to_csv('zz.txt', index=False)
t = data.values
ax.hist(x=t, bins=100, normed=True)
# Exponential rate estimate: lambda = 1 / mean.
l = float(1) / data.mean()
# Exponential pdf evaluated at the (sorted) observations.
p = l * np.power(math.e, -l * t)
ax.plot(t, p)
print data.describe()
fig.show()
# Mean of the interquartile slice (between the 25th and 75th percentiles).
x1 = data.quantile(0.25)
x3 = data.quantile(0.75)
print '中间1/2的均值: ', data[data > x1][data < x3].mean()
def numerical_summary(
    series: pd.Series,
    quantiles=(0.05, 0.25, 0.5, 0.75, 0.95),
    count=None,
    is_unique=None,
    return_values=False,
) -> Union[dict, Tuple[dict, Any]]:
    """Summarize a numeric series.

    Args:
        series: series to summarize

    Returns:
        A dict of summary statistics; when ``return_values`` is True, also
        the array of finite values the statistics were derived from.
    """
    if count is None:
        count = series.count()

    raw = series.values
    present = raw[~np.isnan(raw)]
    finite_mask = np.isfinite(present)
    finite = present[finite_mask]

    summary = {
        "mean": np.mean(present),
        "std": np.std(present, ddof=1),
        "min": np.min(present),
        "max": np.max(present),
        # Unbiased kurtosis obtained using Fisher's definition (kurtosis of normal == 0.0). Normalized by N-1.
        "kurt": series.kurt(),
        # Unbiased skew normalized by N-1
        "skew": series.skew(),
        "sum": np.sum(present),
        "n_infinite": (~finite_mask).sum(),
        "n_zeros": (count - np.count_nonzero(present)),
    }

    # quantile_5, quantile_25, ... keyed by integer percent.
    for prob, value in series.quantile(quantiles).to_dict().items():
        summary["quantile_{:d}".format(int(prob * 100))] = value

    summary["median"] = summary["quantile_50"]
    summary["iqr"] = summary["quantile_75"] - summary["quantile_25"]
    summary["mad"] = mad(present, summary["quantile_50"])
    summary["variance"] = summary["std"] ** 2
    # Coefficient of variation; NaN when the mean is zero (or falsy).
    summary["cv"] = summary["std"] / summary["mean"] if summary["mean"] else np.NaN
    summary["range"] = summary["max"] - summary["min"]

    increasing = series.is_monotonic_increasing
    decreasing = series.is_monotonic_decreasing
    summary["monotonic_increase"] = increasing
    summary["monotonic_decrease"] = decreasing
    # "Strict" monotonicity = monotone with no repeated values.
    summary["monotonic_increase_strict"] = increasing and series.is_unique
    summary["monotonic_decrease_strict"] = decreasing and series.is_unique

    if return_values:
        return summary, finite
    return summary
def test_quantile_sparse(self, values, dtype):
    """Quantiles of a sparse series must match the dense equivalent."""
    sparse_ser = Series(values, dtype=dtype)
    dense_ser = Series(np.asarray(sparse_ser))
    tm.assert_series_equal(sparse_ser.quantile([0.5]), dense_ser.quantile([0.5]))
def describe_numeric_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a numeric series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.

    Notes:
        When 'bins_type' is set to 'bayesian_blocks', astropy.stats.bayesian_blocks is used to
        determine the number of bins. Read the docs:
        https://docs.astropy.org/en/stable/visualization/histogram.html
        https://docs.astropy.org/en/stable/api/astropy.stats.bayesian_blocks.html
        This method might print warnings, which we suppress.
        https://github.com/astropy/astropy/issues/4927
    """

    def mad(arr):
        """ Median Absolute Deviation: a "Robust" version of standard deviation.
            Indices variability of the sample.
            https://en.wikipedia.org/wiki/Median_absolute_deviation """
        return np.median(np.abs(arr - np.median(arr)))

    quantiles = config["vars"]["num"]["quantiles"].get(list)
    # Count +/- infinity occurrences before they are filtered out below.
    n_infinite = ((series == np.inf) | (series == -np.inf)).sum()

    # "present" drops NaN; "finite" additionally drops +/-inf (used for
    # histogram and chi-squared below).
    values = series.values
    present_values = values[~np.isnan(values)]
    finite_values = values[np.isfinite(values)]

    stats = {
        "mean": np.mean(present_values),
        "std": np.std(present_values, ddof=1),
        "variance": np.var(present_values, ddof=1),
        "min": np.min(present_values),
        "max": np.max(present_values),
        # Unbiased kurtosis obtained using Fisher's definition (kurtosis of normal == 0.0). Normalized by N-1.
        "kurtosis": series.kurt(),
        # Unbiased skew normalized by N-1
        "skewness": series.skew(),
        "sum": np.sum(present_values),
        "mad": mad(present_values),
        "n_zeros": (series_description["count"]
                    - np.count_nonzero(present_values)),
        "histogram_data": finite_values,
        "scatter_data": series,  # For complex
        "p_infinite": n_infinite / series_description["n"],
        "n_infinite": n_infinite,
    }

    # Chi-squared test on the auto-binned histogram; only when enabled.
    chi_squared_threshold = config["vars"]["num"][
        "chi_squared_threshold"].get(float)
    if chi_squared_threshold > 0.0:
        histogram, _ = np.histogram(finite_values, bins="auto")
        stats["chi_squared"] = chisquare(histogram)

    stats["range"] = stats["max"] - stats["min"]
    # Percentile keys like "5%", "25%", ...; the iqr line below requires
    # "25%" and "75%" to be present in the configured quantile list.
    stats.update({
        f"{percentile:.0%}": value
        for percentile, value in series.quantile(
            quantiles).to_dict().items()
    })
    stats["iqr"] = stats["75%"] - stats["25%"]
    # Coefficient of variation; NaN when the mean is zero (or falsy).
    stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
    stats["p_zeros"] = stats["n_zeros"] / series_description["n"]

    bins = config["plot"]["histogram"]["bins"].get(int)
    # Bins should never be larger than the number of distinct values
    bins = min(series_description["distinct_count_with_nan"], bins)
    stats["histogram_bins"] = bins

    bayesian_blocks_bins = config["plot"]["histogram"][
        "bayesian_blocks_bins"].get(bool)
    if bayesian_blocks_bins:
        # Imported lazily so astropy is only required when the feature is on.
        from astropy.stats import bayesian_blocks
        # bayesian_blocks may emit warnings (astropy#4927); suppress them.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            ret = bayesian_blocks(stats["histogram_data"])
            # Sanity check
            if not np.isnan(ret).any() and ret.size > 1:
                stats["histogram_bins_bayesian_blocks"] = ret

    return stats
# Python 2 script: count leading-letter trigrams of words, then keep the
# trigrams whose count exceeds the median count.
lines = f.readlines()
trigrams = {}
for line in lines:
    # First three characters, lowercased.
    trigram = line.strip().lower()[0:3]
    # Skip short words and trigrams containing non-alphabet characters.
    if len(trigram) >= 3 and not nonalphabet.search(trigram):
        if trigram == "aaa":
            print "line: {0} trigram: {1}".format(line, trigram)
        trigrams.setdefault(trigram, 0)
        trigrams[trigram] += 1
trigram_series = Series(trigrams.values(), index=trigrams.keys())
# Legacy pandas API (removed in modern pandas; sort_values today).
trigram_series.sort(inplace=True, ascending=True)
print trigram_series
print "quartiles:\n{0}".format(
    trigram_series.quantile([.25, .50, .75, .99]).to_string())
print "median is: {0}".format(trigram_series.median())
unique_trigrams = []
for trigram, count in trigrams.iteritems():
    # NOTE(review): the median quantile is recomputed every iteration —
    # hoisting it out of the loop would be cheaper.
    if count > trigram_series.quantile(.50):
        # NOTE(review): the trigram is appended twice — looks like an
        # accidental duplicate; confirm against the consumer of
        # trigrams.json.
        unique_trigrams.append(trigram)
        unique_trigrams.append(trigram)
print "saving trigrams"
with open("trigrams.json", "w") as f:
    json.dump(unique_trigrams, f)
print "saved {0} trigrams".format(len(unique_trigrams))
trie = {}
# (loop body continues beyond this excerpt)
for trigram in unique_trigrams:
#**********************************
# Set ABOVE
#**********************************


def parse_file(filepath):
    """Parse an ftrace capture and return the ``(filepath, trace)`` pair."""
    trace = Ftrace(filepath)
    return (filepath, trace)


if __name__ == '__main__':
    # Collect all trace files under PATH and map each path to its base name.
    _files = glob.glob(r'{path}\*{file_ext}'.format(path=PATH, file_ext=FILE_EXT))
    F_DICT = {_fp: os.path.split(_fp)[1].split('.')[0] for _fp in _files}
    sb_all = DataFrame(columns=F_DICT.values())
    for _file in _files:
        fp, trace = parse_file(_file)
        total_duration = trace.duration if INTERVAL is None else INTERVAL
        # Per-frame render durations, converted from seconds to milliseconds.
        ss = Series((event.interval.duration for event in
                     trace.android.render_frame_intervals(interval=INTERVAL)))
        ss = ss * 1000.
        # BUG FIX: this statement was commented out, leaving `summary`
        # undefined before the `summary['90%']` assignment below.
        summary = ss.describe()
        summary['90%'] = ss.quantile(.9)
        summary['Janks'] = trace.android.num_janks(interval=INTERVAL)
        summary['Janks Per Second'] = summary['Janks'] / total_duration
        summary['Average FPS'] = trace.android.framerate(interval=INTERVAL)
        sb_all[F_DICT[fp]] = summary
    sb_all.to_csv(r'{path}\frame_stats.csv'.format(path=PATH))
def min_max(column: pd.Series) -> tuple:
    """Return ``(min, max)`` of *column*, computed via the 0th and 100th
    percentiles."""
    bounds = column.quantile([0, 1])
    return tuple(bounds.squeeze())
# Aggregates touch-input latency statistics from ftrace captures.
if __name__ == "__main__":
    # Collect all trace files under PATH and map each path to its base name.
    _files = glob.glob(r"{path}\*{file_ext}".format(path=PATH, file_ext=FILE_EXT))
    F_DICT = {_fp: os.path.split(_fp)[1].split(".")[0] for _fp in _files}
    sb_all = DataFrame()
    for _file in _files:
        fp, trace = parse_file(_file)
        total_duration = trace.duration if INTERVAL is None else INTERVAL
        # Input latencies in milliseconds for every touch event.
        ss = Series(
            (event.interval.duration * 1000
             for event in trace.android.input_latencies(TOUCH_IRQ,
                                                        interval=INTERVAL))
        )
        summary = ss.describe()
        summary["90%"] = ss.quantile(0.9)
        summary["Janks Per Second"] = trace.android.jankrate(interval=INTERVAL)
        summary["Average FPS"] = trace.android.framerate(interval=INTERVAL)
        # Same latency series, restricted to events whose first CPU0
        # frequency sample is 384 MHz (i.e. lowest-frequency starts).
        # NOTE(review): frequency_intervals is called twice per event —
        # hoisting the first call would halve that work.
        ss_first = Series(
            (
                event.interval.duration * 1000
                for event in trace.android.input_latencies(TOUCH_IRQ,
                                                           interval=INTERVAL)
                if trace.cpu.frequency_intervals(cpu=0, interval=event.interval)
                and trace.cpu.frequency_intervals(
                    cpu=0, interval=event.interval)[0] == 384000
            )
        )
        summary_first = ss_first.describe()
        summary_first["90%"] = ss_first.quantile(0.9)
        summary_first["Janks Per Second"] = summary["Janks Per Second"]
        summary_first["Average FPS"] = summary["Average FPS"]
        # (loop body continues beyond this excerpt)
# Python 2 script: count leading-letter trigrams of words, then keep the
# trigrams whose count exceeds the median count.
with open('words.txt', 'r') as f:
    lines = f.readlines()
trigrams = {}
for line in lines:
    # First three characters, lowercased.
    trigram = line.strip().lower()[0:3]
    # Skip short words and trigrams containing non-alphabet characters.
    if len(trigram) >= 3 and not nonalphabet.search(trigram):
        if trigram == "aaa":
            print "line: {0} trigram: {1}".format(line, trigram)
        trigrams.setdefault(trigram, 0)
        trigrams[trigram] += 1
trigram_series = Series(trigrams.values(), index=trigrams.keys())
# Legacy pandas API (removed in modern pandas; sort_values today).
trigram_series.sort(inplace=True, ascending=True)
print trigram_series
print "quartiles:\n{0}".format(trigram_series.quantile([.25, .50, .75, .99]).to_string())
print "median is: {0}".format(trigram_series.median())
unique_trigrams = []
for trigram, count in trigrams.iteritems():
    # NOTE(review): the median quantile is recomputed every iteration —
    # hoisting it out of the loop would be cheaper.
    if count > trigram_series.quantile(.50):
        # NOTE(review): the trigram is appended twice — looks like an
        # accidental duplicate; confirm against the consumer of
        # trigrams.json.
        unique_trigrams.append(trigram)
        unique_trigrams.append(trigram)
print "saving trigrams"
with open("trigrams.json", "w") as f:
    json.dump(unique_trigrams, f)
print "saved {0} trigrams".format(len(unique_trigrams))
trie = {}
# (loop body continues beyond this excerpt)
for trigram in unique_trigrams:
def get_iqr(s: pd.Series):
    """
    Calculate interquartile range (IQR) of the `s` sample.

    Returns the tuple ``(q1, q3, iqr)``.
    """
    quartiles = s.quantile([0.25, 0.75])
    first = quartiles.iloc[0]
    third = quartiles.iloc[1]
    return first, third, third - first
# 针对数值属性:绘分位数图 for i in DataTable.columns: if i in NumericAttribute: DataColumn = DataTable[i] # 获取该列 QuantileSequence = DataColumn.quantile(numpy.arange(0, 1, 0.01)) # 获取0%到100%的分位数 QuantileSequence.plot(title='属性' + i + '分位数图') #绘制数据的分位数图 GaussianDistribution = Series( numpy.random.normal(loc=DataColumn.mean(), scale=numpy.sqrt(DataColumn.var()), size=1000)) #以均值和标准差生成1000个高斯样本 GaussianDistribution.quantile(numpy.arange(0, 1, 0.01)).plot() #绘制高斯样本的分位数图 pyplot.show() # pyplot.draw() # pyplot.pause(0.1) # pyplot.close(); else: print('分位数图绘制完成') # 针对数值属性:绘制盒图 DataTable.boxplot(column=NumericAttribute) pyplot.xlabel('各属性列') pyplot.ylabel('离群点与盒图') pyplot.show() # 处理缺失值:将缺失部分剔除
def parse_file(filepath):
    """Parse an ftrace capture and return the ``(filepath, trace)`` pair."""
    trace = Ftrace(filepath)
    return (filepath, trace)


if __name__ == '__main__':
    # Collect all trace files under PATH and map each path to its base name.
    _files = glob.glob(r'{path}\*{file_ext}'.format(path=PATH, file_ext=FILE_EXT))
    F_DICT = {_fp: os.path.split(_fp)[1].split('.')[0] for _fp in _files}
    sb_all = DataFrame(columns=F_DICT.values())
    for _file in _files:
        fp, trace = parse_file(_file)
        total_duration = trace.duration if INTERVAL is None else INTERVAL
        # Per-frame render durations, converted from seconds to milliseconds.
        ss = Series((event.interval.duration for event in
                     trace.android.render_frame_intervals(interval=INTERVAL)))
        ss = ss * 1000.
        # BUG FIX: this statement was commented out, leaving `summary`
        # undefined before the `summary['90%']` assignment below.
        summary = ss.describe()
        summary['90%'] = ss.quantile(.9)
        summary['Janks'] = trace.android.num_janks(interval=INTERVAL)
        summary['Janks Per Second'] = summary['Janks'] / total_duration
        summary['Average FPS'] = trace.android.framerate(interval=INTERVAL)
        sb_all[F_DICT[fp]] = summary
    sb_all.to_csv(r'{path}\frame_stats.csv'.format(path=PATH))
def scan(self, coordinates: pd.DataFrame, expectations: pd.Series,
         outcomes: pd.Series, penalty: float, num_iters: int,
         verbose: bool = False, seed: int = 0, mode: str = 'binary'):
    """
    Multi-start local search for the highest-scoring subgroup (subset of
    attribute values) under ``self.scoring_function``.

    :param coordinates: data frame containing having as columns the covariates/features
    :param expectations: data series containing the expectations/expected outcomes
    :param outcomes: data series containing the outcomes/observed outcomes
    :param penalty: penalty coefficient
    :param num_iters: number of iteration
    :param verbose: logging flag
    :param seed: numpy seed. Default equals 0
    :param mode: one of ['binary', 'continuous', 'nominal', 'ordinal']. Defaults to binary.
    :return: [best subset, best score]
    """
    np.random.seed(seed)

    # Check that the appropriate scoring function is used
    if isinstance(self.scoring_function, BerkJones):
        modes = ["binary", "continuous", "nominal", "ordinal"]
        assert mode in modes, f"Expected one of {modes} for BerkJones, got {mode}."

        # Ensure that BerkJones only work in Autostrat mode
        unique_expectations = expectations.unique()
        # NOTE(review): the isinstance() re-check is redundant inside this
        # branch.
        if isinstance(self.scoring_function, BerkJones) and len(unique_expectations) != 1:
            raise Exception(
                "BerkJones scorer supports scanning in autostrat mode only."
            )

        # Bin the continuous outcomes column for Berk Jones in continuous mode
        alpha = self.scoring_function.alpha
        direction = self.scoring_function.direction
        if mode == "continuous":
            # Binarize: 1 where the outcome exceeds its alpha-quantile.
            quantile = outcomes.quantile(alpha)
            outcomes = (outcomes > quantile).apply(int)

        # Flip outcomes to scan in the negative direction for BerkJones
        # This is equivalent to switching the p-values
        if direction == "negative":
            outcomes = 1 - outcomes

    if isinstance(self.scoring_function, Bernoulli):
        modes = ["binary", "nominal"]
        assert mode in modes, f"Expected one of {modes} for Bernoulli, got {mode}."

    if isinstance(self.scoring_function, Gaussian):
        assert mode == 'continuous', f"Expected continuous, got {mode}."

        # Set variance for Gaussian
        self.scoring_function.var = expectations.var()

        # Move entire distribution to the positive axis
        shift = np.abs(expectations.min()) + np.abs(outcomes.min())
        outcomes = outcomes + shift
        expectations = expectations + shift

    if isinstance(self.scoring_function, Poisson):
        modes = ["binary", "ordinal"]
        assert mode in modes, f"Expected one of {modes} for Poisson, got {mode}."

    # initialize
    best_subset = {}
    best_score = -1e10
    best_scores = []  # NOTE(review): collected but never returned/used.
    for i in range(num_iters):
        # flags indicates that the method has optimized over subsets for a given attribute.
        # The iteration ends when it cannot further increase score by optimizing over
        # subsets of any attribute, i.e., when all flags are 1.
        flags = np.empty(len(coordinates.columns))
        flags.fill(0)

        # Starting subset. Note that we start with all values for the first iteration
        # and random values for succeeding iterations.
        current_subset = get_entire_subset() if (i == 0) \
            else get_random_subset(coordinates, np.random.rand(1).item(), 10)

        # score the entire population
        current_score = self.score_current_subset(
            coordinates=coordinates,
            expectations=expectations,
            outcomes=outcomes,
            penalty=penalty,
            current_subset=current_subset
        )

        while flags.sum() < len(coordinates.columns):

            # choose random attribute that we haven't scanned yet
            attribute_number_to_scan = np.random.choice(len(coordinates.columns))
            while flags[attribute_number_to_scan]:
                attribute_number_to_scan = np.random.choice(len(coordinates.columns))
            attribute_to_scan = coordinates.columns.values[attribute_number_to_scan]

            # clear current subset of attribute values for that subset
            if attribute_to_scan in current_subset:
                del current_subset[attribute_to_scan]

            # call get_aggregates and choose_aggregates to find best subset of attribute values
            aggregates, thresholds, all_observed_sum, all_expectations = self.get_aggregates(
                coordinates=coordinates,
                outcomes=outcomes,
                expectations=expectations,
                current_subset=current_subset,
                column_name=attribute_to_scan,
                penalty=penalty
            )

            temp_names, temp_score = self.choose_aggregates(
                aggregates=aggregates,
                thresholds=thresholds,
                penalty=penalty,
                all_observed_sum=all_observed_sum,
                all_expectations=all_expectations
            )

            temp_subset = current_subset.copy()
            # if temp_names is not empty (or null)
            if temp_names:
                temp_subset[attribute_to_scan] = temp_names

            # Note that this call to score_current_subset ensures that
            # we are penalizing complexity for all attribute values.
            # The value of temp_score computed by choose_aggregates
            # above includes only the penalty for the current attribute.
            temp_score = self.score_current_subset(
                coordinates=coordinates,
                expectations=expectations,
                outcomes=outcomes,
                penalty=penalty,
                current_subset=temp_subset
            )

            # reset flags to 0 if we have improved score
            if temp_score > current_score + 1E-6:
                flags.fill(0)

            # sanity check to make sure score has not decreased
            # sanity check may not apply to Gaussian in penalized mode (TODO: to check Maths again)
            if not isinstance(self.scoring_function, Gaussian) and penalty > 0:
                assert (
                    temp_score >= current_score - 1e-6
                ), "WARNING SCORE HAS DECREASED from %.6f to %.6f" % (
                    current_score,
                    temp_score,
                )

            flags[attribute_number_to_scan] = 1
            current_subset = temp_subset
            current_score = temp_score

        # print out results for current iteration
        if verbose:
            print("Subset found on iteration", i + 1, "of", num_iters,
                  "with score", current_score, ":")
            print(current_subset)

        # update best_score and best_subset if necessary
        if current_score > best_score:
            best_subset = current_subset.copy()
            best_score = current_score
            if verbose:
                print("Best score is now", best_score)
        elif verbose:
            print("Current score of", current_score,
                  "does not beat best score of", best_score)
        best_scores.append(best_score)

    return best_subset, best_score
def describe_numeric_1d(series: pd.Series, summary: dict) -> Tuple[pd.Series, dict]:
    """Describe a numeric series.

    Args:
        series: The Series to describe.
        summary: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.
    """
    # Config
    chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(
        float)
    quantiles = config["vars"]["num"]["quantiles"].get(list)

    value_counts = summary["value_counts_without_nan"]

    summary["n_zeros"] = 0

    # Infinite values are identified through the value-counts index rather
    # than by scanning the raw data.
    infinity_values = [np.inf, -np.inf]
    infinity_index = value_counts.index.isin(infinity_values)
    summary["n_infinite"] = value_counts.loc[infinity_index].sum()

    if 0 in value_counts.index:
        summary["n_zeros"] = value_counts.loc[0]

    # NOTE: `stats` aliases `summary`; all updates below mutate the caller's
    # dict in place.
    stats = summary

    if isinstance(series.dtype, _IntegerDtype):
        # Nullable pandas integer dtype: let pandas handle NA values.
        stats.update(numeric_stats_pandas(series))
        present_values = series.astype(str(series.dtype).lower())
        # Integers cannot hold inf, so all present values are finite.
        finite_values = present_values
    else:
        present_values = series.values
        finite_values = present_values[np.isfinite(present_values)]
        stats.update(numeric_stats_numpy(present_values, series, summary))

    stats.update({
        "mad": mad(present_values),
    })

    if chi_squared_threshold > 0.0:
        stats["chi_squared"] = chi_square(finite_values)

    stats["range"] = stats["max"] - stats["min"]
    # Percentile keys like "5%", "25%", ...; the iqr line below requires
    # "25%" and "75%" to be present in the configured quantile list.
    stats.update({
        f"{percentile:.0%}": value
        for percentile, value in series.quantile(quantiles).to_dict().items()
    })
    stats["iqr"] = stats["75%"] - stats["25%"]
    # Coefficient of variation; NaN when the mean is zero (or falsy).
    stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
    stats["p_zeros"] = stats["n_zeros"] / summary["n"]
    stats["p_infinite"] = summary["n_infinite"] / summary["n"]

    stats["monotonic_increase"] = series.is_monotonic_increasing
    stats["monotonic_decrease"] = series.is_monotonic_decreasing
    # "Strict" monotonicity = monotone with no repeated values.
    stats["monotonic_increase_strict"] = (stats["monotonic_increase"]
                                          and series.is_unique)
    stats["monotonic_decrease_strict"] = (stats["monotonic_decrease"]
                                          and series.is_unique)

    # Histogram over the distinct (non-infinite) values, weighted by their
    # observed counts.
    stats.update(
        histogram_compute(
            value_counts[~infinity_index].index.values,
            summary["n_distinct"],
            weights=value_counts[~infinity_index].values,
        ))

    return series, stats
def quantile_975(x: pd.Series) -> float:
    """Return the 97.5th percentile of *x* (upper bound of a 95% interval)."""
    upper_prob = 0.975
    return x.quantile(upper_prob)
def count_estims(dist, gamma = 0.95):
    '''
    Counts all estimates.

    :param dist: distribution (sample values)
    :param gamma: probability of realisation of value (confidence level)
    :return point: point estimates
    :return interval: confidence intervals for point estimates
    '''
    import numpy as np
    x = Series(dist)
    # Point estimates.
    point = {}
    N = x.count()
    med_ = med_u(x)  # project helper — presumably a robust median estimate; verify
    med = np.median(dist)
    mad = x.mad()  # mean absolute deviation (pandas .mad), not median absolute deviation
    mean_c = mean(dist)  # project helper; presumably the sample mean — verify
    var = np.var(dist)
    std = np.std(dist)
    mod = stats.mode(dist).mode  # modal value
    kurt = stats.kurtosis(dist)
    skew_my = stats.skew(dist)
    # Shape indicator derived from kurtosis.
    Chi = 1/np.sqrt(np.abs(kurt))
    quantiles = np.round(x.quantile([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]), 5)
    W = std/mean_c;  # coefficient of variation
    # Render the quantile table as HTML fragments.
    quantiles_str = ""
    for index in quantiles.index:
        quantiles_str += '<p><pre>{0}\t{1}</pre></p>'.format(index, quantiles[index])
    point['MED'] = np.round(med, 5)
    point['MED*'] = np.round(med_, 5)
    point['MAD'] = np.round(mad, 5)
    point['Min'] = np.round(x.min(), 5)
    point['Max'] = np.round(x.max(), 5)
    point['Mean'] = np.round(mean_c, 5)
    point['S^2'] = np.round(var, 5)
    point['S'] = np.round(std, 5)
    point['MOD'] = np.round(mod, 5)
    point['E'] = np.round(kurt, 5)
    point['A'] = np.round(skew_my, 5)
    point['Chi'] = np.round(Chi, 5)
    point['X(alpha)'] = quantiles_str
    point['W'] = np.round(W, 5)
    # Interval estimates.
    from scipy.stats import t, norm
    import numpy as np
    interval = {}
    # Student-t critical values for small samples, normal otherwise.
    if N < 61:
        l = t.ppf((1-gamma)/2, N-1)
        u = t.ppf(1-(1-gamma)/2, N-1)
    else:
        l = norm.ppf((1-gamma)/2)
        u = norm.ppf(1-(1-gamma)/2)
    # sigma_* are project helpers giving standard errors of each estimate.
    X_cf = (mean_c+l*sigma_X(x), mean_c+u*sigma_X(x))
    A_cf = (skew_my + l * sigma_A(x), skew_my + u * sigma_A(x))
    S_cf = (std + l*sigma_S(x), std+u*sigma_S(x))
    E_cf = (kurt + l*sigma_E(x), kurt+u*sigma_E(x))
    # Interval for the coefficient of variation only exists when W < 1.
    if W < 1:
        v = l/np.sqrt(2*(N-1))
        W_cf = np.round((W/(1+v*np.sqrt(1+2*W**2)), W/(1-v*np.sqrt(1+2*W**2))), 5)
    else:
        W_cf = (None, None)
    interval['Mean'] = np.round(X_cf, 5)
    interval['S'] = np.round(S_cf, 5)
    interval['E'] = np.round(E_cf, 5)
    interval['A'] = np.round(A_cf, 5)
    interval['W'] = W_cf
    return point, interval
print(f"Std (calc): {std}; Std (pandas){x.std()}") #%% c) x_sorted = x.sort_values() if (x.size % 2 == 0): median = x_sorted.loc[x.size / 2] else: lower = math.floor(x.size / 2) upper = math.ceil(x.size / 2) median = (x[lower] + x[upper]) / 2 print(f"Mean (calc): {mean}; Mean (pandas){x.mean()}") #%% d) x.quantile(q=.75) #%% z = (x - x.mean()) / x.std() print(f"std = {round(z.std(),2)}") print(f"mean = {round(z.mean(),2)}") #%% Aufgabe 1.5 from pandas import Series, DataFrame import pandas as pd import matplotlib.pyplot as plt import numpy as np #%% a)
def quantile_025(x: pd.Series) -> float:
    """Return the 2.5th percentile of *x* (lower bound of a 95% interval)."""
    lower_prob = 0.025
    return x.quantile(lower_prob)
def describe_numeric_1d(series: pd.Series) -> dict:
    """Describe a numeric series.

    Args:
        series: The numeric Series to describe.

    Returns:
        A dict of descriptive statistics: row/missing/distinct counts,
        central tendency, dispersion, quantiles, and Tukey-fence outlier
        counts. Values are rounded to 2 decimals where possible.
    """

    def mad(arr):
        """ Median Absolute Deviation: a "Robust" version of standard deviation.
            Indices variability of the sample.
            https://en.wikipedia.org/wiki/Median_absolute_deviation """
        return np.median(np.abs(arr - np.median(arr)))

    stats = {}

    # number of observations in the Series
    stats["num_rows_total"] = len(series)

    # number of non-NaN observations in the Series
    stats["num_rows_with_data"] = series.count()

    # distinct count
    value_counts_with_nan = series.value_counts(dropna=False)
    value_counts_without_nan = series.value_counts(dropna=True)
    stats["distinct_count_with_nan"] = value_counts_with_nan.count()
    stats["distinct_count_without_nan"] = value_counts_without_nan.count()
    stats["distinct_count"] = stats["distinct_count_without_nan"]

    # values
    stats["n_values"] = stats["num_rows_with_data"]
    stats["p_values"] = 100 * (stats["num_rows_with_data"] / stats["num_rows_total"])

    # missing
    stats["n_missing"] = stats["num_rows_total"] - stats["num_rows_with_data"]
    stats["p_missing"] = 100 * (
        1 - (stats["num_rows_with_data"] / stats["num_rows_total"]))

    # all present values distinct?
    stats["is_unique"] = stats["distinct_count_without_nan"] == stats[
        "num_rows_with_data"]

    values = series.values
    present_values = values[~np.isnan(values)]

    stats["mean"] = np.mean(present_values)
    # Most frequent value when there are genuine repeats; otherwise fall
    # back to the first element.
    # BUG FIX: use positional .iloc[0] instead of series[0] — label-based
    # series[0] raises KeyError on a non-default index.
    stats["mode"] = (
        series.mode().iloc[0]
        if stats["num_rows_with_data"] > stats["distinct_count_without_nan"] > 1
        else series.iloc[0]
    )
    stats["std"] = np.std(present_values, ddof=1)
    stats["variance"] = np.var(present_values, ddof=1)
    # Median Absolute Deviation
    stats["mad"] = mad(present_values)
    stats["min"] = np.min(present_values)
    stats["max"] = np.max(present_values)
    stats["range"] = stats["max"] - stats["min"]
    # Unbiased kurtosis obtained using Fisher's definition (kurtosis of normal == 0.0). Normalized by N-1.
    stats["kurtosis"] = series.kurt()
    # Unbiased skew normalized by N-1
    stats["skewness"] = series.skew()

    # zeros
    stats["n_zeros"] = (stats["num_rows_with_data"]
                        - np.count_nonzero(present_values))
    stats["p_zeros"] = 100 * (stats["n_zeros"] / stats["num_rows_total"])

    # quantiles (keys "5%", "25%", "50%", "75%", "95%")
    quantiles = [.05, .25, .5, .75, .95]
    stats.update({
        f"{percentile:.0%}": value
        for percentile, value in series.quantile(quantiles).to_dict().items()
    })
    stats["iqr"] = stats["75%"] - stats["25%"]

    # outliers (Tukey fences at 1.5 * IQR beyond the quartiles)
    stats["n_outlier_top"] = len(
        series[series > (stats["75%"] + 1.5 * stats["iqr"])])
    stats["n_outlier_bottom"] = len(
        series[series < (stats["25%"] - 1.5 * stats["iqr"])])
    stats["n_outlier"] = stats["n_outlier_top"] + stats["n_outlier_bottom"]
    stats["p_outlier_top"] = 100 * (stats["n_outlier_top"] /
                                    stats["num_rows_total"])
    stats["p_outlier_bottom"] = 100 * (stats["n_outlier_bottom"] /
                                       stats["num_rows_total"])
    stats["p_outlier"] = 100 * (stats["n_outlier"] / stats["num_rows_total"])

    # Round numeric entries for presentation, leaving unroundable values
    # untouched. BUG FIX: was a bare `except:` that swallowed every error;
    # only the expected "not a number" failure is ignored now.
    for key, value in stats.items():
        try:
            stats[key] = round(value, 2)
        except TypeError:
            pass

    return stats