Example #1
 def summary_df(self, thresholds=None, lower_quantile=None, upper_quantile=None):
     """
     Calculates the pair of metrics for each threshold for each result.
     """
     if thresholds is None:
         thresholds = self.thresholds
     if lower_quantile is None:
         lower_quantile = self.config['lower_quantile']
     if upper_quantile is None:
         upper_quantile = self.config['upper_quantile']
     
     if self.n_current_results > self.n_cached_curves:
         # If there are new curves, recompute
         colnames = ['_'.join([metric, stat])
                     for metric in [self.metric1.name, self.metric2.name] 
                     for stat in ['Mean', 'Median',
                                  '%d_Percentile' % (100*lower_quantile),
                                  '%d_Percentile' % (upper_quantile*100)]]
         self.ret = pd.DataFrame(columns=colnames, index=thresholds, dtype='float64')
         
         for threshold in thresholds:
             m1s = Series([self.metric1.score(result, threshold) for result in self.results])
             m2s = Series([self.metric2.score(result, threshold) for result in self.results])
             # Use the configured quantiles rather than hardcoded .05/.95,
             # so the values match the percentile column names built above.
             self.ret.loc[threshold] = (m1s.mean(), m1s.quantile(.5),
                                        m1s.quantile(lower_quantile), m1s.quantile(upper_quantile),
                                        m2s.mean(), m2s.quantile(.5),
                                        m2s.quantile(lower_quantile), m2s.quantile(upper_quantile))
     return self.ret
Example #2
def get_cutoffs(x, num_groups=10):
    """Get the cutoffs that split `x` into `num_groups` equally sized groups."""
    series = Series(x)
    cutoffs = []
    for i in range(num_groups):
        perc_low = float(i) / num_groups
        perc_high = float(i + 1) / num_groups
        cutoffs.append((series.quantile(perc_low), series.quantile(perc_high)))
    return cutoffs
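
A quick usage sketch for get_cutoffs (hypothetical data; assumes numpy is available alongside pandas):

import numpy as np

x = np.arange(100)                      # 0, 1, ..., 99
cutoffs = get_cutoffs(x, num_groups=4)
# [(0.0, 24.75), (24.75, 49.5), (49.5, 74.25), (74.25, 99.0)]
# Each pair is the (lower, upper) quantile boundary of one group.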
Example #3
 def summary_df(self):
     lower_quantile = self.config['lower_quantile']
     upper_quantile = self.config['upper_quantile']
     
     vals = Series(self.summary)
     
     lower_bound = vals.quantile(lower_quantile)
     upper_bound = vals.quantile(upper_quantile)
     median = vals.quantile(0.5)
     mean = vals.mean()
     
     column_names = [ "Mean" , "Median" , "%d_Percentile" % (lower_quantile*100), "%d_Percentile" % (upper_quantile*100)]
     df = pd.DataFrame(dict(zip(column_names, [mean, median, lower_bound, upper_bound])), index=[0])
     
     return df
Example #4
    def test_quantile_empty(self):

        # floats
        s = Series([], dtype='float64')

        res = s.quantile(0.5)
        self.assertTrue(np.isnan(res))

        res = s.quantile([0.5])
        exp = Series([np.nan], index=[0.5])
        tm.assert_series_equal(res, exp)

        # int
        s = Series([], dtype='int64')

        res = s.quantile(0.5)
        self.assertTrue(np.isnan(res))

        res = s.quantile([0.5])
        exp = Series([np.nan], index=[0.5])
        tm.assert_series_equal(res, exp)

        # datetime
        s = Series([], dtype='datetime64[ns]')

        res = s.quantile(0.5)
        self.assertTrue(res is pd.NaT)

        res = s.quantile([0.5])
        exp = Series([pd.NaT], index=[0.5])
        tm.assert_series_equal(res, exp)
Example #5
    def describe_numeric_1d(series: pd.Series,
                            series_description: dict) -> dict:
        """Describe a numeric series.
        Args:
            series: The Series to describe.
            series_description: The dict containing the series description so far.
        Returns:
            A dict containing calculated series description values.
        Notes:
            When 'bins_type' is set to 'bayesian_blocks', astropy.stats.bayesian_blocks is used to determine the number of
            bins. Read the docs:
            https://docs.astropy.org/en/stable/visualization/histogram.html
            https://docs.astropy.org/en/stable/api/astropy.stats.bayesian_blocks.html
            This method might print warnings, which we suppress.
            https://github.com/astropy/astropy/issues/4927
        """
        def mad(arr):
            """ Median Absolute Deviation: a "Robust" version of standard deviation.
                Indicates variability of the sample.
                https://en.wikipedia.org/wiki/Median_absolute_deviation
            """
            return np.median(np.abs(arr - np.median(arr)))

        quantiles = config["vars"]["num"]["quantiles"].get(list)

        n_infinite = ((series == np.inf) | (series == -np.inf)).sum()

        if isinstance(series.dtype, _IntegerDtype):
            stats = numeric_stats_pandas(series)
            present_values = series.loc[series.notnull()].astype(
                str(series.dtype).lower())
            stats["n_zeros"] = series_description["count"] - np.count_nonzero(
                present_values)
            stats["histogram_data"] = present_values
            finite_values = present_values
        else:
            values = series.values
            present_values = values[~np.isnan(values)]
            finite_values = values[np.isfinite(values)]
            stats = numeric_stats_numpy(present_values)
            stats["histogram_data"] = finite_values

        stats.update({
            "mad": mad(present_values),
            "scatter_data": series,  # For complex
            "p_infinite": n_infinite / series_description["n"],
            "n_infinite": n_infinite,
        })

        chi_squared_threshold = config["vars"]["num"][
            "chi_squared_threshold"].get(float)
        if chi_squared_threshold > 0.0:
            histogram, _ = np.histogram(finite_values, bins="auto")
            stats["chi_squared"] = chisquare(histogram)

        stats["range"] = stats["max"] - stats["min"]
        stats.update({
            f"{percentile:.0%}": value
            for percentile, value in series.quantile(
                quantiles).to_dict().items()
        })
        stats["iqr"] = stats["75%"] - stats["25%"]
        stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
        stats["p_zeros"] = stats["n_zeros"] / series_description["n"]

        stats["monotonic_increase"] = series.is_monotonic_increasing
        stats["monotonic_decrease"] = series.is_monotonic_decreasing

        stats["monotonic_increase_strict"] = (stats["monotonic_increase"]
                                              and series.is_unique)
        stats["monotonic_decrease_strict"] = (stats["monotonic_decrease"]
                                              and series.is_unique)

        stats.update(
            histogram_compute(finite_values, series_description["n_unique"]))

        return stats
Example #6
 def get(self, data: pd.Series) -> List[float]:
     return data.quantile((np.arange(self.bins - 1) + 1) /
                          self.bins).drop_duplicates().tolist()
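
A minimal sketch of what this method computes, assuming a binner object with self.bins = 4: the interior quantile cut points (here the quartiles), with duplicates dropped so that heavily skewed data cannot yield empty bins.

import numpy as np
import pandas as pd

bins = 4  # stands in for self.bins
data = pd.Series(np.arange(1, 101))
edges = data.quantile((np.arange(bins - 1) + 1) / bins).drop_duplicates().tolist()
# [25.75, 50.5, 75.25] -- three interior edges for four equal-frequency bins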
Example #7
def describe_numeric_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a numeric series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.

    Notes:
        When 'bins_type' is set to 'bayesian_blocks', astropy.stats.bayesian_blocks is used to determine the number of
        bins. Read the docs:
        https://docs.astropy.org/en/stable/visualization/histogram.html
        https://docs.astropy.org/en/stable/api/astropy.stats.bayesian_blocks.html

        This method might print warnings, which we suppress.
        https://github.com/astropy/astropy/issues/4927
    """
    quantiles = config["vars"]["num"]["quantiles"].get(list)

    stats = {
        "mean": series.mean(),
        "std": series.std(),
        "variance": series.var(),
        "min": series.min(),
        "max": series.max(),
        "kurtosis": series.kurt(),
        "skewness": series.skew(),
        "sum": series.sum(),
        "mad": series.mad(),
        "n_zeros": (len(series) - np.count_nonzero(series)),
        "histogramdata": series,
    }

    stats["range"] = stats["max"] - stats["min"]
    stats.update({
        "{:.0%}".format(percentile): value
        for percentile, value in series.quantile(quantiles).to_dict().items()
    })
    stats["iqr"] = stats["75%"] - stats["25%"]
    stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
    stats["p_zeros"] = float(stats["n_zeros"]) / len(series)

    bins = config["plot"]["histogram"]["bins"].get(int)
    # Bins should never be larger than the number of distinct values
    bins = min(series_description["distinct_count_with_nan"], bins)
    stats["histogram_bins"] = bins

    bayesian_blocks_bins = config["plot"]["histogram"][
        "bayesian_blocks_bins"].get(bool)
    if bayesian_blocks_bins:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            ret = bayesian_blocks(stats["histogramdata"])

            # Sanity check
            if not np.isnan(ret).any() and ret.size > 1:
                stats["histogram_bins_bayesian_blocks"] = ret

    return stats
Example #8
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series

fig = plt.figure()
ax = fig.add_subplot(211)

data = pd.read_csv('time.txt', header=None)  # one value per line
data = Series(data[0])

data = data[data < 300]

data = data.sort_values()
#data.to_csv('zz.txt', index=False)

t = data.values

ax.hist(x=t, bins=100, density=True)  # `normed=` was removed from matplotlib

# Overlay an exponential density with rate 1/mean
l = 1.0 / data.mean()
p = l * np.power(math.e, -l * t)

ax.plot(t, p)
print(data.describe())

fig.show()

x1 = data.quantile(0.25)
x3 = data.quantile(0.75)

print('Mean of the middle 1/2:', data[(data > x1) & (data < x3)].mean())
Example #9
def numerical_summary(
    series: pd.Series,
    quantiles=(0.05, 0.25, 0.5, 0.75, 0.95),
    count=None,
    is_unique=None,
    return_values=False,
) -> Union[dict, Tuple[dict, Any]]:
    """

    Args:
        series: series to summarize

    Returns:

    """

    if count is None:
        count = series.count()

    values = series.values
    present_values = values[~np.isnan(values)]
    finite_mask = np.isfinite(present_values)
    finite_values = present_values[finite_mask]

    summary = {
        "mean": np.mean(present_values),
        "std": np.std(present_values, ddof=1),
        "min": np.min(present_values),
        "max": np.max(present_values),
        # Unbiased kurtosis obtained using Fisher's definition (kurtosis of normal == 0.0). Normalized by N-1.
        "kurt": series.kurt(),
        # Unbiased skew normalized by N-1
        "skew": series.skew(),
        "sum": np.sum(present_values),
        "n_infinite": (~finite_mask).sum(),
        "n_zeros": (count - np.count_nonzero(present_values)),
    }

    for percentile, value in series.quantile(quantiles).to_dict().items():
        summary["quantile_{:d}".format(int(percentile * 100))] = value
    summary["median"] = summary["quantile_50"]
    summary["iqr"] = summary["quantile_75"] - summary["quantile_25"]

    summary["mad"] = mad(present_values, summary["quantile_50"])
    summary["variance"] = summary["std"] ** 2
    summary["cv"] = summary["std"] / summary["mean"] if summary["mean"] else np.NaN
    summary["range"] = summary["max"] - summary["min"]

    summary["monotonic_increase"] = series.is_monotonic_increasing
    summary["monotonic_decrease"] = series.is_monotonic_decreasing

    summary["monotonic_increase_strict"] = (
        summary["monotonic_increase"] and series.is_unique
    )
    summary["monotonic_decrease_strict"] = (
        summary["monotonic_decrease"] and series.is_unique
    )

    if return_values:
        return summary, finite_values

    return summary
Example #10
 def test_quantile_sparse(self, values, dtype):
     ser = Series(values, dtype=dtype)
     result = ser.quantile([0.5])
     expected = Series(np.asarray(ser)).quantile([0.5])
     tm.assert_series_equal(result, expected)
Example #11
    def describe_numeric_1d(series: pd.Series,
                            series_description: dict) -> dict:
        """Describe a numeric series.
        Args:
            series: The Series to describe.
            series_description: The dict containing the series description so far.
        Returns:
            A dict containing calculated series description values.
        Notes:
            When 'bins_type' is set to 'bayesian_blocks', astropy.stats.bayesian_blocks is used to determine the number of
            bins. Read the docs:
            https://docs.astropy.org/en/stable/visualization/histogram.html
            https://docs.astropy.org/en/stable/api/astropy.stats.bayesian_blocks.html
            This method might print warnings, which we suppress.
            https://github.com/astropy/astropy/issues/4927
        """
        def mad(arr):
            """ Median Absolute Deviation: a "Robust" version of standard deviation.
                Indicates variability of the sample.
                https://en.wikipedia.org/wiki/Median_absolute_deviation
            """
            return np.median(np.abs(arr - np.median(arr)))

        quantiles = config["vars"]["num"]["quantiles"].get(list)

        n_infinite = ((series == np.inf) | (series == -np.inf)).sum()

        values = series.values
        present_values = values[~np.isnan(values)]
        finite_values = values[np.isfinite(values)]

        stats = {
            "mean": np.mean(present_values),
            "std": np.std(present_values, ddof=1),
            "variance": np.var(present_values, ddof=1),
            "min": np.min(present_values),
            "max": np.max(present_values),
            # Unbiased kurtosis obtained using Fisher's definition (kurtosis of normal == 0.0). Normalized by N-1.
            "kurtosis": series.kurt(),
            # Unbiased skew normalized by N-1
            "skewness": series.skew(),
            "sum": np.sum(present_values),
            "mad": mad(present_values),
            "n_zeros": series_description["count"] - np.count_nonzero(present_values),
            "histogram_data": finite_values,
            "scatter_data": series,  # For complex
            "p_infinite": n_infinite / series_description["n"],
            "n_infinite": n_infinite,
        }

        chi_squared_threshold = config["vars"]["num"][
            "chi_squared_threshold"].get(float)
        if chi_squared_threshold > 0.0:
            histogram, _ = np.histogram(finite_values, bins="auto")
            stats["chi_squared"] = chisquare(histogram)

        stats["range"] = stats["max"] - stats["min"]
        stats.update({
            f"{percentile:.0%}": value
            for percentile, value in series.quantile(
                quantiles).to_dict().items()
        })
        stats["iqr"] = stats["75%"] - stats["25%"]
        stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
        stats["p_zeros"] = stats["n_zeros"] / series_description["n"]

        bins = config["plot"]["histogram"]["bins"].get(int)
        # Bins should never be larger than the number of distinct values
        bins = min(series_description["distinct_count_with_nan"], bins)
        stats["histogram_bins"] = bins

        bayesian_blocks_bins = config["plot"]["histogram"][
            "bayesian_blocks_bins"].get(bool)
        if bayesian_blocks_bins:
            from astropy.stats import bayesian_blocks

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                ret = bayesian_blocks(stats["histogram_data"])

                # Sanity check
                if not np.isnan(ret).any() and ret.size > 1:
                    stats["histogram_bins_bayesian_blocks"] = ret

        return stats
Example #13
#**********************************
# Set ABOVE
#**********************************


def parse_file(filepath):
    trace = Ftrace(filepath)
    return (filepath, trace)

if __name__ == '__main__':
    _files = glob.glob(r'{path}\*{file_ext}'.format(path=PATH, file_ext=FILE_EXT))
    F_DICT = {_fp: os.path.split(_fp)[1].split('.')[0] for _fp in _files}
    
    sb_all = DataFrame(columns=F_DICT.values())
    
    for _file in _files:
        
        fp, trace = parse_file(_file)

        total_duration = trace.duration if INTERVAL is None else INTERVAL
        ss = Series((event.interval.duration for event in trace.android.render_frame_intervals(interval=INTERVAL)))
        ss = ss * 1000.  # convert durations to milliseconds (presumably recorded in seconds)
        summary = ss.describe()
        summary['90%'] = ss.quantile(.9)
        summary['Janks'] = trace.android.num_janks(interval=INTERVAL)
        summary['Janks Per Second'] = summary['Janks']/total_duration
        summary['Average FPS'] = trace.android.framerate(interval=INTERVAL)
        sb_all[F_DICT[fp]] = summary
    
    sb_all.to_csv(r'{path}\frame_stats.csv'.format(path=PATH))
        
Example #14
def min_max(column: pd.Series) -> tuple:
    """get the min and max values of a series"""
    return tuple(column.quantile([0, 1]).squeeze())
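
Usage note: quantile([0, 1]) returns a two-element Series holding the exact minimum and maximum, so tuple(...) yields the pair directly. A quick sketch with made-up data:

import pandas as pd

min_max(pd.Series([3, 1, 2]))  # (1.0, 3.0)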
Example #15
if __name__ == "__main__":
    _files = glob.glob(r"{path}\*{file_ext}".format(path=PATH, file_ext=FILE_EXT))
    F_DICT = {_fp: os.path.split(_fp)[1].split(".")[0] for _fp in _files}

    sb_all = DataFrame()

    for _file in _files:

        fp, trace = parse_file(_file)

        total_duration = trace.duration if INTERVAL is None else INTERVAL
        ss = Series(
            (event.interval.duration * 1000 for event in trace.android.input_latencies(TOUCH_IRQ, interval=INTERVAL))
        )
        summary = ss.describe()
        summary["90%"] = ss.quantile(0.9)
        summary["Janks Per Second"] = trace.android.jankrate(interval=INTERVAL)
        summary["Average FPS"] = trace.android.framerate(interval=INTERVAL)

        ss_first = Series(
            (
                event.interval.duration * 1000
                for event in trace.android.input_latencies(TOUCH_IRQ, interval=INTERVAL)
                if trace.cpu.frequency_intervals(cpu=0, interval=event.interval)
                and trace.cpu.frequency_intervals(cpu=0, interval=event.interval)[0] == 384000
            )
        )
        summary_first = ss_first.describe()
        summary_first["90%"] = ss_first.quantile(0.9)
        summary_first["Janks Per Second"] = summary["Janks Per Second"]
        summary_first["Average FPS"] = summary["Average FPS"]
Example #16
import json

from pandas import Series

with open('words.txt', 'r') as f:
    lines = f.readlines()

trigrams = {}
for line in lines:
    trigram = line.strip().lower()[0:3]
    if len(trigram) >= 3 and not nonalphabet.search(trigram):
        if trigram == "aaa":
            print("line: {0} trigram: {1}".format(line, trigram))
        trigrams.setdefault(trigram, 0)
        trigrams[trigram] += 1

trigram_series = Series(list(trigrams.values()), index=list(trigrams.keys()))
trigram_series = trigram_series.sort_values(ascending=True)  # Series.sort() was removed from pandas
print(trigram_series)
print("quartiles:\n{0}".format(trigram_series.quantile([.25, .50, .75, .99]).to_string()))

print("median is: {0}".format(trigram_series.median()))
median_count = trigram_series.quantile(.50)  # hoisted out of the loop
unique_trigrams = []
for trigram, count in trigrams.items():
    # keep only trigrams that occur more often than the median
    if count > median_count:
        unique_trigrams.append(trigram)

print("saving trigrams")
with open("trigrams.json", "w") as f:
    json.dump(unique_trigrams, f)
print("saved {0} trigrams".format(len(unique_trigrams)))

trie = {}
for trigram in unique_trigrams:
Example #17
def get_iqr(s: pd.Series):
    """ Calculate interquartile range (IQR) of the `s` sample. """
    q1 = s.quantile(0.25)
    q3 = s.quantile(0.75)
    iqr = q3 - q1
    return q1, q3, iqr
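
A usage sketch (hypothetical data) combining get_iqr with Tukey's 1.5 * IQR fences to flag outliers:

import pandas as pd

s = pd.Series([1, 2, 3, 4, 5, 100])
q1, q3, iqr = get_iqr(s)                # q1=2.25, q3=4.75, iqr=2.5
outliers = s[(s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)]
# fences are [-1.5, 8.5], so only 100 is flagged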
Example #18
# For numeric attributes: draw quantile plots
for i in DataTable.columns:
    if i in NumericAttribute:
        DataColumn = DataTable[i]
        # Take this column
        QuantileSequence = DataColumn.quantile(numpy.arange(0, 1, 0.01))
        # Quantiles from 0 to 0.99 in steps of 0.01
        QuantileSequence.plot(title='Quantile plot of attribute ' + i)
        # Plot the quantile curve of the data
        GaussianDistribution = Series(
            numpy.random.normal(loc=DataColumn.mean(),
                                scale=numpy.sqrt(DataColumn.var()),
                                size=1000))
        # Generate 1000 Gaussian samples with the same mean and standard deviation
        GaussianDistribution.quantile(numpy.arange(0, 1, 0.01)).plot()
        # Plot the quantile curve of the Gaussian samples for comparison
        pyplot.show()
        # pyplot.draw()
        # pyplot.pause(0.1)
        # pyplot.close();
else:
    print('Quantile plots complete')

# For numeric attributes: draw a box plot
DataTable.boxplot(column=NumericAttribute)
pyplot.xlabel('Attribute columns')
pyplot.ylabel('Outliers and box plot')
pyplot.show()

# Handle missing values: drop the missing entries
Example #20
    def scan(self, coordinates: pd.DataFrame, expectations: pd.Series, outcomes: pd.Series, penalty: float,
                    num_iters: int, verbose: bool = False, seed: int = 0, mode: str = 'binary'):
        """
        :param coordinates: data frame having the covariates/features as columns
        :param expectations: data series containing the expected outcomes
        :param outcomes: data series containing the observed outcomes
        :param penalty: penalty coefficient
        :param num_iters: number of iterations
        :param verbose: logging flag
        :param seed: numpy seed. Default equals 0
        :param mode: one of ['binary', 'continuous', 'nominal', 'ordinal']. Defaults to binary.
        :return: [best subset, best score]
        """
        np.random.seed(seed)

        # Check that the appropriate scoring function is used

        if isinstance(self.scoring_function, BerkJones):
            modes = ["binary", "continuous", "nominal", "ordinal"]
            assert mode in modes, f"Expected one of {modes} for BerkJones, got {mode}."

            # Ensure that BerkJones only works in autostrat mode
            unique_expectations = expectations.unique()
            if len(unique_expectations) != 1:
                raise Exception(
                    "BerkJones scorer supports scanning in autostrat mode only."
                )

            # Bin the continuous outcomes column for Berk Jones in continuous mode
            alpha = self.scoring_function.alpha
            direction = self.scoring_function.direction
            
            if mode == "continuous":
                quantile = outcomes.quantile(alpha)
                outcomes = (outcomes > quantile).apply(int)

            # Flip outcomes to scan in the negative direction for BerkJones
            # This is equivalent to switching the p-values
            if direction == "negative":
                outcomes = 1 - outcomes

        if isinstance(self.scoring_function, Bernoulli):
            modes = ["binary", "nominal"]
            assert mode in modes, f"Expected one of {modes} for Bernoulli,  got {mode}."

        if isinstance(self.scoring_function, Gaussian):
            assert mode == 'continuous', f"Expected continuous, got {mode}."

            # Set variance for Gaussian
            self.scoring_function.var = expectations.var()
            
            # Move entire distribution to the positive axis
            shift = np.abs(expectations.min()) + np.abs(outcomes.min())
            outcomes = outcomes + shift
            expectations = expectations + shift

        if isinstance(self.scoring_function, Poisson):
            modes = ["binary", "ordinal"]
            assert mode in modes, f"Expected one of {modes} for Poisson,  got {mode}."

        # initialize
        best_subset = {}
        best_score = -1e10
        best_scores = []
        for i in range(num_iters):
            # flags indicates that the method has optimized over subsets for a given attribute.
            # The iteration ends when it cannot further increase score by optimizing over
            # subsets of any attribute, i.e., when all flags are 1.
            flags = np.empty(len(coordinates.columns))
            flags.fill(0)

            # Starting subset. Note that we start with all values for the first iteration
            # and random values for succeeding iterations.
            current_subset = get_entire_subset() if (i == 0) \
                else get_random_subset(coordinates, np.random.rand(1).item(), 10)

            # score the entire population
            current_score = self.score_current_subset(
                coordinates=coordinates,
                expectations=expectations,
                outcomes=outcomes,
                penalty=penalty,
                current_subset=current_subset
            )

            while flags.sum() < len(coordinates.columns):

                # choose random attribute that we haven't scanned yet
                attribute_number_to_scan = np.random.choice(len(coordinates.columns))
                while flags[attribute_number_to_scan]:
                    attribute_number_to_scan = np.random.choice(len(coordinates.columns))
                attribute_to_scan = coordinates.columns.values[attribute_number_to_scan]

                # clear current subset of attribute values for that subset
                if attribute_to_scan in current_subset:
                    del current_subset[attribute_to_scan]

                # call get_aggregates and choose_aggregates to find best subset of attribute values
                aggregates, thresholds, all_observed_sum, all_expectations = self.get_aggregates(
                    coordinates=coordinates,
                    outcomes=outcomes,
                    expectations=expectations,
                    current_subset=current_subset,
                    column_name=attribute_to_scan,
                    penalty=penalty
                )

                temp_names, temp_score = self.choose_aggregates(
                    aggregates=aggregates,
                    thresholds=thresholds,
                    penalty=penalty,
                    all_observed_sum=all_observed_sum,
                    all_expectations=all_expectations
                )

                temp_subset = current_subset.copy()
                # if temp_names is not empty (or null)
                if temp_names:
                    temp_subset[attribute_to_scan] = temp_names

                # Note that this call to score_current_subset ensures that
                # we are penalizing complexity for all attribute values.
                # The value of temp_score computed by choose_aggregates
                # above includes only the penalty for the current attribute.
                temp_score = self.score_current_subset(
                    coordinates=coordinates,
                    expectations=expectations,
                    outcomes=outcomes,
                    penalty=penalty,
                    current_subset=temp_subset
                )

                # reset flags to 0 if we have improved score
                if temp_score > current_score + 1E-6:
                    flags.fill(0)

                # sanity check to make sure score has not decreased
                # sanity check may not apply to Gaussian in penalized mode (TODO: to check Maths again)
                if not isinstance(self.scoring_function, Gaussian) and penalty > 0:
                    assert (
                        temp_score >= current_score - 1e-6
                    ), "WARNING SCORE HAS DECREASED from %.6f to %.6f" % (
                        current_score,
                        temp_score,
                    )
                    
                flags[attribute_number_to_scan] = 1
                current_subset = temp_subset
                current_score = temp_score

            # print out results for current iteration
            if verbose:
                print("Subset found on iteration", i + 1, "of", num_iters, "with score", current_score, ":")
                print(current_subset)

            # update best_score and best_subset if necessary
            if current_score > best_score:
                best_subset = current_subset.copy()
                best_score = current_score

                if verbose:
                    print("Best score is now", best_score)

            elif verbose:
                print("Current score of", current_score, "does not beat best score of", best_score)
            best_scores.append(best_score)
        return best_subset, best_score
Example #21
def describe_numeric_1d(series: pd.Series,
                        summary: dict) -> Tuple[pd.Series, dict]:
    """Describe a numeric series.
    Args:
        series: The Series to describe.
        summary: The dict containing the series description so far.
    Returns:
        A dict containing calculated series description values.
    """

    # Config
    chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(
        float)
    quantiles = config["vars"]["num"]["quantiles"].get(list)

    value_counts = summary["value_counts_without_nan"]

    summary["n_zeros"] = 0

    infinity_values = [np.inf, -np.inf]
    infinity_index = value_counts.index.isin(infinity_values)
    summary["n_infinite"] = value_counts.loc[infinity_index].sum()

    if 0 in value_counts.index:
        summary["n_zeros"] = value_counts.loc[0]

    stats = summary

    if isinstance(series.dtype, _IntegerDtype):
        stats.update(numeric_stats_pandas(series))
        present_values = series.astype(str(series.dtype).lower())
        finite_values = present_values
    else:
        present_values = series.values
        finite_values = present_values[np.isfinite(present_values)]
        stats.update(numeric_stats_numpy(present_values, series, summary))

    stats.update({
        "mad": mad(present_values),
    })

    if chi_squared_threshold > 0.0:
        stats["chi_squared"] = chi_square(finite_values)

    stats["range"] = stats["max"] - stats["min"]
    stats.update({
        f"{percentile:.0%}": value
        for percentile, value in series.quantile(quantiles).to_dict().items()
    })
    stats["iqr"] = stats["75%"] - stats["25%"]
    stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
    stats["p_zeros"] = stats["n_zeros"] / summary["n"]
    stats["p_infinite"] = summary["n_infinite"] / summary["n"]

    stats["monotonic_increase"] = series.is_monotonic_increasing
    stats["monotonic_decrease"] = series.is_monotonic_decreasing

    stats["monotonic_increase_strict"] = (stats["monotonic_increase"]
                                          and series.is_unique)
    stats["monotonic_decrease_strict"] = (stats["monotonic_decrease"]
                                          and series.is_unique)

    stats.update(
        histogram_compute(
            value_counts[~infinity_index].index.values,
            summary["n_distinct"],
            weights=value_counts[~infinity_index].values,
        ))

    return series, stats
Example #22
def quantile_975(x: pd.Series) -> float:
    return x.quantile(0.975)
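
Named one-liners like this are useful with groupby/agg, where Series.quantile cannot carry its argument and the function's __name__ becomes the output column label. A minimal sketch with made-up data:

import pandas as pd

df = pd.DataFrame({"g": ["a", "a", "b", "b"], "v": [1.0, 2.0, 3.0, 4.0]})
df.groupby("g")["v"].agg(["mean", quantile_975])  # columns: 'mean', 'quantile_975'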
Example #23
def count_estims(dist, gamma=0.95):
    '''
    Computes all estimates
    :param dist: distribution
    :param gamma: probability of realisation of the value
    :return point: point estimates
    :return interval: confidence intervals for the point estimates
    '''
    import numpy as np
    x = Series(dist)
    # Point estimates
    point = {}
    N = x.count()

    med_ = med_u(x)
    med = np.median(dist)
    mad = x.mad()
    mean_c = mean(dist)
    var = np.var(dist)
    std = np.std(dist)
    mod = stats.mode(dist).mode
    kurt = stats.kurtosis(dist)
    skew_my = stats.skew(dist)
    Chi = 1/np.sqrt(np.abs(kurt))
    quantiles = np.round(x.quantile([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]), 5)
    W = std/mean_c

    quantiles_str = ""
    for index in quantiles.index:
        quantiles_str+='<p><pre>{0}\t{1}</pre></p>'.format(index, quantiles[index])

    point['MED'] = np.round(med, 5)
    point['MED*'] = np.round(med_, 5)
    point['MAD'] = np.round(mad, 5)
    point['Min'] = np.round(x.min(), 5)
    point['Max'] = np.round(x.max(), 5)
    point['Mean'] = np.round(mean_c, 5)
    point['S^2'] = np.round(var, 5)
    point['S'] = np.round(std, 5)
    point['MOD'] = np.round(mod, 5)
    point['E'] = np.round(kurt, 5)
    point['A'] = np.round(skew_my, 5)
    point['Chi'] = np.round(Chi, 5)
    point['X(alpha)'] = quantiles_str
    point['W'] = np.round(W, 5)



    # Interval estimates
    from scipy.stats import t, norm
    interval = {}
    if N < 61:
        l = t.ppf((1-gamma)/2, N-1)
        u = t.ppf(1-(1-gamma)/2, N-1)
    else:
        l = norm.ppf((1-gamma)/2)
        u = norm.ppf(1-(1-gamma)/2)
    X_cf = (mean_c+l*sigma_X(x), mean_c+u*sigma_X(x))
    A_cf = (skew_my + l * sigma_A(x), skew_my + u * sigma_A(x))
    S_cf = (std + l*sigma_S(x), std+u*sigma_S(x))
    E_cf = (kurt + l*sigma_E(x), kurt+u*sigma_E(x))
    if W < 1:
        v = l/np.sqrt(2*(N-1))
        W_cf = np.round((W/(1+v*np.sqrt(1+2*W**2)), W/(1-v*np.sqrt(1+2*W**2))), 5)
    else: W_cf = (None, None)

    interval['Mean'] = np.round(X_cf, 5)
    interval['S'] = np.round(S_cf, 5)
    interval['E'] = np.round(E_cf, 5)
    interval['A'] = np.round(A_cf, 5)
    interval['W'] = W_cf

    return point, interval
Example #24
print(f"Std (calc): {std}; Std (pandas){x.std()}")

#%% c)
x_sorted = x.sort_values()

if (x.size % 2 == 0):
    # even: average the two middle values of the sorted series
    lower = x.size // 2 - 1
    upper = x.size // 2
    median = (x_sorted.iloc[lower] + x_sorted.iloc[upper]) / 2
else:
    # odd: take the middle value
    median = x_sorted.iloc[x.size // 2]

print(f"Median (calc): {median}; Median (pandas): {x.median()}")

#%% d)
x.quantile(q=.75)

#%%

z = (x - x.mean()) / x.std()

print(f"std = {round(z.std(),2)}")
print(f"mean = {round(z.mean(),2)}")

#%% Exercise 1.5
from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#%% a)
def quantile_025(x: pd.Series) -> float:
    return x.quantile(0.025)
Example #26
def describe_numeric_1d(series: pd.Series) -> dict:
    def mad(arr):
        """ Median Absolute Deviation: a "Robust" version of standard deviation.
            Indicates variability of the sample.
            https://en.wikipedia.org/wiki/Median_absolute_deviation
        """
        return np.median(np.abs(arr - np.median(arr)))

    stats = {}

    # number of observations in the Series
    stats["num_rows_total"] = len(series)

    # number of non-NaN observations in the Series
    stats["num_rows_with_data"] = series.count()

    # distinct count
    value_counts_with_nan = series.value_counts(dropna=False)
    value_counts_without_nan = series.value_counts(dropna=True)
    stats["distinct_count_with_nan"] = value_counts_with_nan.count()
    stats["distinct_count_without_nan"] = value_counts_without_nan.count()

    stats["distinct_count"] = stats["distinct_count_without_nan"]

    # values
    stats["n_values"] = stats["num_rows_with_data"]
    stats["p_values"] = 100 * (stats["num_rows_with_data"] /
                               stats["num_rows_total"])

    # missing
    stats["n_missing"] = stats["num_rows_total"] - stats["num_rows_with_data"]
    stats["p_missing"] = 100 * (
        1 - (stats["num_rows_with_data"] / stats["num_rows_total"]))

    #
    stats["is_unique"] = stats["distinct_count_without_nan"] == stats[
        "num_rows_with_data"]

    #
    values = series.values
    present_values = values[~np.isnan(values)]

    stats["mean"] = np.mean(present_values)
    stats["mode"] = series.mode(
    ).iloc[0] if stats["num_rows_with_data"] > stats[
        "distinct_count_without_nan"] > 1 else series[0]

    stats["std"] = np.std(present_values, ddof=1)
    stats["variance"] = np.var(present_values, ddof=1)
    # Median Absolute Deviation
    stats["mad"] = mad(present_values)

    stats["min"] = np.min(present_values)
    stats["max"] = np.max(present_values)
    stats["range"] = stats["max"] - stats["min"]

    # Unbiased kurtosis obtained using Fisher's definition (kurtosis of normal == 0.0). Normalized by N-1.
    stats["kurtosis"] = series.kurt()
    # Unbiased skew normalized by N-1
    stats["skewness"] = series.skew()

    # zeros
    stats["n_zeros"] = (stats["num_rows_with_data"] -
                        np.count_nonzero(present_values))
    stats["p_zeros"] = 100 * (stats["n_zeros"] / stats["num_rows_total"])

    # quantiles
    quantiles = [.05, .25, .5, .75, .95]
    stats.update({
        f"{percentile:.0%}": value
        for percentile, value in series.quantile(quantiles).to_dict().items()
    })
    stats["iqr"] = stats["75%"] - stats["25%"]

    # outliers
    stats["n_outlier_top"] = len(
        series[series > (stats["75%"] + 1.5 * stats["iqr"])])
    stats["n_outlier_bottom"] = len(
        series[series < (stats["25%"] - 1.5 * stats["iqr"])])
    stats["n_outlier"] = stats["n_outlier_top"] + stats["n_outlier_bottom"]

    stats["p_outlier_top"] = 100 * (stats["n_outlier_top"] /
                                    stats["num_rows_total"])
    stats["p_outlier_bottom"] = 100 * (stats["n_outlier_bottom"] /
                                       stats["num_rows_total"])
    stats["p_outlier"] = 100 * (stats["n_outlier"] / stats["num_rows_total"])

    # Round all numeric stats; leave values that cannot be rounded unchanged
    for key, value in stats.items():
        try:
            stats[key] = round(value, 2)
        except (TypeError, ValueError):
            pass
    return stats
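
A usage sketch for the function above (made-up data; the function itself needs only numpy and pandas):

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, 2.0, 3.0, np.nan, 100.0])
stats = describe_numeric_1d(s)
print(stats["50%"], stats["iqr"], stats["n_outlier"])  # median, IQR, and outlier count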