Exemplo n.º 1
0
def makeQuantiles(series, n):
    """
    Split the sorted, NA-free values of a series into n consecutive buckets.

    Parameters
    ----------
    series: Series
        Must have 'order' method and index
    n: int
        Number of quantile buckets

    Returns
    -------
    (edges, quantiles)
       edges: ith bucket --> (left edge, right edge)
       quantiles: ith bucket --> set of values

    Both dicts are keyed 1..n.

    Notes
    -----
    Bucket k covers positions floor(len * (k - 1) / n) up to (but excluding)
    floor(len * k / n) of the sorted values.  The boundaries are computed
    with integer arithmetic so that slice bounds are always ints and the
    last bucket always ends exactly at the final value (a float product
    such as (len / n) * n can round to just below len and silently drop
    the largest observation).

    If a bucket ends up empty (e.g. n exceeds the number of non-NA values)
    the edge lookup on that bucket fails, as it has no first/last element.
    """
    series = remove_na(series).copy()
    series = series.order()
    quantiles = {}
    edges = {}
    total = len(series)
    # Exact integer bucket boundaries; n == 0 still raises ZeroDivisionError
    # here, and a negative n yields no boundaries and no buckets.
    bounds = [total * k // n for k in range(n + 1)]
    for i in range(n):
        theSlice = series[bounds[i]:bounds[i + 1]]
        quantiles[i + 1] = theSlice
        edges[i + 1] = theSlice[0], theSlice[-1]
    return edges, quantiles
Exemplo n.º 2
0
def makeQuantiles(series, n):
    """
    Split the sorted, NA-free values of a series into n consecutive buckets.

    Parameters
    ----------
    series: Series
        Must have 'order' method and index
    n: int
        Number of quantile buckets

    Returns
    -------
    (edges, quantiles)
       edges: ith bucket --> (left edge, right edge)
       quantiles: ith bucket --> set of values

    Both dicts are keyed 1..n.

    Notes
    -----
    Bucket k covers positions floor(len * (k - 1) / n) up to (but excluding)
    floor(len * k / n) of the sorted values.  The boundaries are computed
    with integer arithmetic so that slice bounds are always ints and the
    last bucket always ends exactly at the final value (a float product
    such as (len / n) * n can round to just below len and silently drop
    the largest observation).

    If a bucket ends up empty (e.g. n exceeds the number of non-NA values)
    the edge lookup on that bucket fails, as it has no first/last element.
    """
    series = remove_na(series).copy()
    series = series.order()
    quantiles = {}
    edges = {}
    total = len(series)
    # Exact integer bucket boundaries; n == 0 still raises ZeroDivisionError
    # here, and a negative n yields no boundaries and no buckets.
    bounds = [total * k // n for k in range(n + 1)]
    for i in range(n):
        theSlice = series[bounds[i]:bounds[i + 1]]
        quantiles[i + 1] = theSlice
        edges[i + 1] = theSlice[0], theSlice[-1]
    return edges, quantiles
Exemplo n.º 3
0
def percentileRank(frame, column=None, kind='mean'):
    """
    Return the percentile rank of a score within each cross-section
    (i.e. for each point in time, relative to that row's non-NA values)

    Parameters
    ----------
    frame: DataFrame
    column: string or Series, optional
       Column name or specific Series to compute percentiles for.
       If not provided, percentiles are computed for all values at each
       point in time. Note that this can take a LONG time.
    kind: {'rank', 'weak', 'strict', 'mean'}, optional
        This optional parameter specifies the interpretation of the
        resulting score (default 'mean'):

        - "rank": Average percentage ranking of score.  In case of
                  multiple matches, average the percentage rankings of
                  all matching scores.
        - "weak": This kind corresponds to the definition of a cumulative
                  distribution function.  A percentileofscore of 80%
                  means that 80% of values are less than or equal
                  to the provided score.
        - "strict": Similar to "weak", except that only values that are
                    strictly less than the given score are counted.
        - "mean": The average of the "weak" and "strict" scores, often used in
                  testing.  See

                  http://en.wikipedia.org/wiki/Percentile_rank

    See also
    --------
    scipy.stats.percentileofscore

    Returns
    -------
    TimeSeries or DataFrame, depending on input
        A Series keyed by date when `column` is given; otherwise a
        DataFrame with one row per date and one column per frame column.
    """
    from scipy.stats import percentileofscore

    def fun(xs, score):
        # Rank `score` against the non-NA values of one cross-section.
        return percentileofscore(remove_na(xs), score, kind=kind)

    results = {}
    # Transpose once; every branch walks the cross-sections (one per date).
    framet = frame.T
    if column is not None:
        if isinstance(column, Series):
            # Score comes from the supplied Series; dates it lacks score NaN.
            for date, xs in framet.iteritems():
                results[date] = fun(xs, column.get(date, NaN))
        else:
            # Score is the named column's own value in each cross-section.
            for date, xs in framet.iteritems():
                results[date] = fun(xs, xs[column])
        results = Series(results)
    else:
        names = frame.columns
        for date, xs in framet.iteritems():
            # The NA-free cross-section is the same for every column, so
            # build it once per date instead of once per (column, date).
            valid = remove_na(xs)
            for name in names:
                results.setdefault(date, {})[name] = percentileofscore(
                    valid, xs[name], kind=kind)
        results = DataFrame(results).T
    return results
Exemplo n.º 4
0
 def plot_group(grouped, ax):
     """
     Draw one box per group on `ax` and label the boxes with the group keys.

     `grouped` is an iterable of (key, values) pairs.  `kwds`, `rot` and
     `fontsize` are taken from the surrounding scope; `kwds` is forwarded
     to ax.boxplot and its 'vert' entry (default 1) decides which axis
     receives the labels.
     """
     labels, samples = zip(*grouped)
     tick_text = [_stringify(label) for label in labels]
     cleaned = [remove_na(sample) for sample in samples]
     ax.boxplot(cleaned, **kwds)
     # Vertical boxes sit along the x axis, horizontal ones along the y axis.
     set_labels = (ax.set_xticklabels if kwds.get('vert', 1)
                   else ax.set_yticklabels)
     set_labels(tick_text, rotation=rot, fontsize=fontsize)
Exemplo n.º 5
0
def percentileRank(frame, column=None, kind='mean'):
    """
    Return the percentile rank of a score within each cross-section
    (i.e. for each point in time, relative to that row's non-NA values)

    Parameters
    ----------
    frame: DataFrame
    column: string or Series, optional
       Column name or specific Series to compute percentiles for.
       If not provided, percentiles are computed for all values at each
       point in time. Note that this can take a LONG time.
    kind: {'rank', 'weak', 'strict', 'mean'}, optional
        This optional parameter specifies the interpretation of the
        resulting score (default 'mean'):

        - "rank": Average percentage ranking of score.  In case of
                  multiple matches, average the percentage rankings of
                  all matching scores.
        - "weak": This kind corresponds to the definition of a cumulative
                  distribution function.  A percentileofscore of 80%
                  means that 80% of values are less than or equal
                  to the provided score.
        - "strict": Similar to "weak", except that only values that are
                    strictly less than the given score are counted.
        - "mean": The average of the "weak" and "strict" scores, often used in
                  testing.  See

                  http://en.wikipedia.org/wiki/Percentile_rank

    See also
    --------
    scipy.stats.percentileofscore

    Returns
    -------
    TimeSeries or DataFrame, depending on input
        A Series keyed by date when `column` is given; otherwise a
        DataFrame with one row per date and one column per frame column.
    """
    from scipy.stats import percentileofscore

    def fun(xs, score):
        # Rank `score` against the non-NA values of one cross-section.
        return percentileofscore(remove_na(xs), score, kind=kind)

    results = {}
    # Transpose once; every branch walks the cross-sections (one per date).
    framet = frame.T
    if column is not None:
        if isinstance(column, Series):
            # Score comes from the supplied Series; dates it lacks score NaN.
            for date, xs in framet.iteritems():
                results[date] = fun(xs, column.get(date, NaN))
        else:
            # Score is the named column's own value in each cross-section.
            for date, xs in framet.iteritems():
                results[date] = fun(xs, xs[column])
        results = Series(results)
    else:
        names = frame.columns
        for date, xs in framet.iteritems():
            # The NA-free cross-section is the same for every column, so
            # build it once per date instead of once per (column, date).
            valid = remove_na(xs)
            for name in names:
                results.setdefault(date, {})[name] = percentileofscore(
                    valid, xs[name], kind=kind)
        results = DataFrame(results).T
    return results
Exemplo n.º 6
0
    def boxplot(self, df, axis=0, secondary_y=False, *args, **kwargs):
        """
        Draw one box per column of `df` (per row when axis == 1).

        NA values are stripped from each column before plotting.  The
        boxes go on the right-hand axes when `secondary_y` is true,
        otherwise on self.ax.  Extra positional and keyword arguments are
        accepted but not used here.
        """
        frame = df.T if axis == 1 else df
        labels = frame.columns
        self.set_index(labels)
        samples = [remove_na(col) for col in frame.values.T]

        target = self.ax
        if secondary_y:
            target = self.get_right_ax()

        # Box positions are numbered from 0 so they line up with DateLocator.
        slots = np.arange(len(labels))
        target.boxplot(samples, positions=slots)
        self.setup_datetime(labels)
        self.set_formatter()
Exemplo n.º 7
0
    def boxplot(self, df, axis=0, secondary_y=False, *args, **kwargs):
        """
            Box-plot the columns of a DataFrame (its rows when axis == 1).

            Only DataFrames are supported for now, which means every box
            is built from a column of the same length.  Groups of varying
            sizes would need boxplot(list()) instead, e.g. for a
            SeriesGroupBy.boxplot.
        """
        frame = df.T if axis == 1 else df
        labels = frame.columns
        self.set_index(labels)
        samples = [remove_na(col) for col in frame.values.T]

        # Target axes are chosen by find_ax from secondary_y and kwargs.
        target = self.find_ax(secondary_y, kwargs)

        # Box positions are numbered from 0 so they line up with
        # TimestampLocator.
        slots = np.arange(len(labels))
        target.boxplot(samples, positions=slots)
        self.setup_datetime(labels)
        self.set_formatter()
Exemplo n.º 8
0
    def boxplot(self, df, axis=0, secondary_y=False, *args, **kwargs):
        """
            Box-plot the columns of a DataFrame (its rows when axis == 1).

            Only DataFrames are supported for now, which means every box
            is built from a column of the same length.  Groups of varying
            sizes would need boxplot(list()) instead, e.g. for a
            SeriesGroupBy.boxplot.
        """
        frame = df.T if axis == 1 else df
        labels = frame.columns
        self.set_index(labels)
        samples = [remove_na(col) for col in frame.values.T]

        # Target axes are chosen by find_ax from secondary_y and kwargs.
        target = self.find_ax(secondary_y, kwargs)

        # Box positions are numbered from 0 so they line up with
        # TimestampLocator.
        slots = np.arange(len(labels))
        target.boxplot(samples, positions=slots)
        self.setup_datetime(labels)
        self.set_formatter()
Exemplo n.º 9
0
 def skipna_wrapper(x):
     """
     Apply `alternative` (from the surrounding scope) to the non-NA values
     of `x`; return NaN when no values remain after dropping NAs.
     """
     valid = remove_na(x)
     return alternative(valid) if len(valid) else np.nan
Exemplo n.º 10
0
 def skipna_wrapper(x):
     """
     Apply `alternative` (from the surrounding scope) to the non-NA values
     of `x`; return NaN when no values remain after dropping NAs.
     """
     valid = remove_na(x)
     return alternative(valid) if len(valid) else np.nan
Exemplo n.º 11
0
 def test_remove_na_deprecation(self):
     """Check that calling remove_na on an empty Series emits a FutureWarning."""
     # see gh-16971
     # NOTE(review): `tm` is presumably the pandas testing helper module --
     # confirm against the file's imports.  The Series is built inside the
     # `with` block, so any warning raised by its construction is captured
     # by the same check.
     with tm.assert_produces_warning(FutureWarning):
         remove_na(Series([]))
Exemplo n.º 12
0
 def test_remove_na_deprecation(self):
     """Check that calling remove_na on an empty Series emits a FutureWarning."""
     # see gh-16971
     # NOTE(review): `tm` is presumably the pandas testing helper module --
     # confirm against the file's imports.  The Series is built inside the
     # `with` block, so any warning raised by its construction is captured
     # by the same check.
     with tm.assert_produces_warning(FutureWarning):
         remove_na(Series([]))
Exemplo n.º 13
0
def boxplot(data, column=None, by=None, ax=None, fontsize=None,
            rot=0, grid=True, figsize=None, **kwds):
    """
    Make a box plot from DataFrame column optionally grouped by some columns
    or other inputs

    Parameters
    ----------
    data : DataFrame or Series
        A Series is wrapped into a one-column DataFrame named 'x'.
    column : column name or list of names, or vector
        Can be any valid input to groupby
    by : string or sequence
        Column in the DataFrame to group by
    ax : matplotlib axes, optional
        Axes to draw on when `by` is not given; the current axes (from
        _gca()) are used if omitted.  Not used when `by` is given.
    fontsize : int or string
    rot : label rotation angle
    grid : passed to ax.grid in the single-plot case and forwarded to the
           grouped-plot helper otherwise
    figsize : forwarded to the grouped-plot helper; unused when `by` is
              not given
    kwds : other plotting keyword arguments to be passed to matplotlib boxplot
           function

    Returns
    -------
    The axes returned by the grouped-plot helper when `by` is given,
    otherwise the value returned by ax.boxplot for the single plot.
    """
    from pandas import Series, DataFrame
    if isinstance(data, Series):
        data = DataFrame({'x' : data})
        column = 'x'

    def set_tick_labels(target, labels):
        # Vertical boxes (the default) are labelled along x, horizontal
        # boxes along y.
        if kwds.get('vert', 1):
            target.set_xticklabels(labels, rotation=rot, fontsize=fontsize)
        else:
            target.set_yticklabels(labels, rotation=rot, fontsize=fontsize)

    def plot_group(grouped, ax):
        # Draw one box per (key, values) pair and label it with its key.
        keys, values = zip(*grouped)
        keys = [_stringify(x) for x in keys]
        values = [remove_na(v) for v in values]
        ax.boxplot(values, **kwds)
        set_tick_labels(ax, keys)

    # Identity test: `column` may be an array-like vector, for which
    # `== None` compares elementwise and has no single truth value.
    if column is None:
        columns = None
    elif isinstance(column, (list, tuple)):
        columns = column
    else:
        columns = [column]

    if by is not None:
        if not isinstance(by, (list, tuple)):
            by = [by]

        fig, axes = _grouped_plot_by_column(plot_group, data, columns=columns,
                                            by=by, grid=grid, figsize=figsize)

        # Return axes in multiplot case, maybe revisit later # 985
        ret = axes
    else:
        if ax is None:
            ax = _gca()
        fig = ax.get_figure()
        data = data._get_numeric_data()
        # Fall back to every numeric column when none were requested.
        cols = columns if columns else data.columns
        keys = [_stringify(x) for x in cols]

        clean_values = [remove_na(x) for x in data[cols].values.T]
        bp = ax.boxplot(clean_values, **kwds)
        set_tick_labels(ax, keys)
        ax.grid(grid)

        # Return boxplot dict in single plot case
        ret = bp

    fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2)
    return ret