Example #1
def downsample(self, freq, closed=None, label=None, axis=0, drop_empty=True):
    """
        Essentially use resample logic but reutrning the groupby object
    """
        
    # default closed/label on offset
    defaults = _offset_defaults(freq)
    if closed is None:
        closed = defaults['closed']

    if label is None:
        label = defaults['label']
    tg = TimeGrouper(freq, closed=closed, label=label, axis=axis)
    grouper = tg.get_grouper(self)

    
    # drop empty groups. this is when we have irregular data that
    # we just want to group into Daily without creating empty days.
    if drop_empty:
        bins = [0] # start with 0 for np.diff
        bins.extend(grouper.bins)
        bins = np.array(bins)
        periods_in_bin = np.diff(bins)
        empty = periods_in_bin == 0

        binlabels = grouper.binlabels

        # skip the 0 we added
        bins = bins[1:][~empty]
        binlabels = binlabels[~empty]
        grouper = BinGrouper(bins, binlabels)

    return self.groupby(grouper, axis=axis)
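A minimal usage sketch for the downsample() helper above. It is hypothetical: it assumes the helper and the module-level names it references (_offset_defaults, TimeGrouper, BinGrouper) are importable from the same module, and that the data sit on a DatetimeIndex.

import numpy as np
import pandas as pd

# Irregular observations: only a few calendar days actually contain data.
idx = pd.to_datetime(['2013-01-01', '2013-01-01', '2013-01-04', '2013-01-09'])
s = pd.Series(np.arange(4.0), index=idx)

# With drop_empty=True the empty daily bins between observations are removed,
# so the returned groupby only iterates over days that contain observations.
grouped = downsample(s, 'D', drop_empty=True)
daily_sums = grouped.sum()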
Example #2
    def resample(self, rule, how='mean', axis=0,
                 fill_method=None, closed='right', label='right',
                 convention=None, kind=None, loffset=None, limit=None):
        """
        Convenience method for frequency conversion and resampling of regular
        time-series data.

        Parameters
        ----------
        rule : the offset string or object representing target conversion
        how : string, method for down- or re-sampling, default 'mean'
        fill_method : string, fill_method for upsampling, default None
        axis : int, optional, default 0
        closed : {'right', 'left'}, default 'right'
            Which side of bin interval is closed
        label : {'right', 'left'}, default 'right'
            Which bin edge label to label bucket with
        convention : {'start', 'end', 's', 'e'}
        loffset : timedelta
            Adjust the resampled time labels
        """
        from pandas.tseries.resample import TimeGrouper
        sampler = TimeGrouper(rule, label=label, closed=closed, how=how,
                              axis=axis, kind=kind, loffset=loffset,
                              fill_method=fill_method, convention=convention,
                              limit=limit)
        return sampler.resample(self)
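A short illustration of the closed/label options documented above, using the same legacy s.resample(..., how=...) call style that the tests later in this listing use (the how keyword was removed in later pandas versions, so treat this as a sketch against that older API):

import numpy as np
import pandas as pd

rng = pd.date_range('2000-01-01', periods=12, freq='min')
s = pd.Series(np.arange(12), index=rng)

# closed='right', label='right' (the defaults above): the first bucket holds
# only the 00:00 observation and each bucket is labelled by its right edge.
right = s.resample('5min', how='sum', closed='right', label='right')

# closed='left', label='left': buckets are [00:00, 00:05), [00:05, 00:10), ...
# and each bucket is labelled by its left edge.
left = s.resample('5min', how='sum', closed='left', label='left')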
Example #3
    def resample(self, rule, how=None, axis=0, fill_method=None,
                 closed='right', label='right', convention=None,
                 kind=None, loffset=None, limit=None, base=0):
        """
        Convenience method for frequency conversion and resampling of regular
        time-series data.

        Parameters
        ----------
        rule : the offset string or object representing target conversion
        how : string, method for down- or re-sampling, defaults to 'mean' for
              downsampling
        fill_method : string, fill_method for upsampling, default None
        axis : int, optional, default 0
        closed : {'right', 'left'}, default 'right'
            Which side of bin interval is closed
        label : {'right', 'left'}, default 'right'
            Which bin edge label to label bucket with
        convention : {'start', 'end', 's', 'e'}
        loffset : timedelta
            Adjust the resampled time labels
        base : int, default 0
            For frequencies that evenly subdivide 1 day, the "origin" of the
            aggregated intervals. For example, for '5min' frequency, base could
            range from 0 through 4. Defaults to 0
        """
        from pandas.tseries.resample import TimeGrouper
        sampler = TimeGrouper(rule, label=label, closed=closed, how=how,
                              axis=axis, kind=kind, loffset=loffset,
                              fill_method=fill_method, convention=convention,
                              limit=limit, base=base)
        return sampler.resample(self)
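A small sketch of what the base argument adds in this variant, again against the legacy how= signature: for a frequency that subdivides a day, base shifts the origin of the bin edges.

import numpy as np
import pandas as pd

rng = pd.date_range('2000-01-01 00:00', periods=30, freq='min')
s = pd.Series(np.arange(30), index=rng)

# base=0 (the default): 5-minute bin edges fall on :00, :05, :10, ...
s.resample('5min', how='sum', base=0)

# base=2: the bin origin is shifted by two minutes, so edges fall on
# :02, :07, :12, ...
s.resample('5min', how='sum', base=2)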
Example #4
    def test_panel_aggregation(self):
        ind = pd.date_range("1/1/2000", periods=100)
        data = np.random.randn(2, len(ind), 4)
        wp = pd.Panel(data, items=["Item1", "Item2"], major_axis=ind, minor_axis=["A", "B", "C", "D"])

        tg = TimeGrouper("M", axis=1)
        _, grouper, _ = tg.get_grouper(wp)
        bingrouped = wp.groupby(grouper)
        binagg = bingrouped.mean()

        def f(x):
            assert isinstance(x, Panel)
            return x.mean(1)

        result = bingrouped.agg(f)
        tm.assert_panel_equal(result, binagg)
Example #5
    def test_panel_aggregation(self):
        ind = pd.date_range('1/1/2000', periods=100)
        data = np.random.randn(2, len(ind), 4)
        wp = pd.Panel(data, items=['Item1', 'Item2'], major_axis=ind,
                      minor_axis=['A', 'B', 'C', 'D'])

        tg = TimeGrouper('M', axis=1)
        grouper = tg.get_grouper(wp)
        bingrouped = wp.groupby(grouper)
        binagg = bingrouped.mean()

        def f(x):
            assert(isinstance(x, Panel))
            return x.mean(1)
        result = bingrouped.agg(f)
        tm.assert_panel_equal(result, binagg)
Example #6
    def test_apply_iteration(self):
        # #2300
        N = 1000
        ind = pd.date_range(start="2000-01-01", freq="D", periods=N)
        df = DataFrame({'open': 1, 'close': 2}, index=ind)
        tg = TimeGrouper('M')

        _, grouper, _ = tg._get_grouper(df)

        # Errors
        grouped = df.groupby(grouper, group_keys=False)
        f = lambda df: df['close'] / df['open']

        # it works!
        result = grouped.apply(f)
        self.assertTrue(result.index.equals(df.index))
Example #7
    def test_apply_iteration(self):
        # #2300
        N = 1000
        ind = pd.date_range(start="2000-01-01", freq="D", periods=N)
        df = DataFrame({'open': 1, 'close': 2}, index=ind)
        tg = TimeGrouper('M')

        grouper = tg.get_grouper(df)

        # Errors

        grouped = df.groupby(grouper, group_keys=False)
        f = lambda df: df['close'] / df['open']

        # it works!
        result = grouped.apply(f)
        self.assertTrue(result.index.equals(df.index))
Example #8
    def test_panelgroupby(self):
        def agg_func(pan):
            assert isinstance(pan, pd.Panel)
            return pan.mean()

        ind = pd.date_range('1/1/2000', periods=100)
        data = np.random.randn(2,len(ind),4)
        wp = pd.Panel(data, items=['Item1', 'Item2'], major_axis=ind, minor_axis=['A', 'B', 'C', 'D'])

        from pandas.tseries.resample import TimeGrouper
        #timegrouper
        tg = TimeGrouper('M', axis=1)
        grouper = tg.get_grouper(wp)
        bingrouped = wp.groupby(grouper)
        # Failed 12-15-12
        # https://github.com/pydata/pandas/issues/2537
        bingrouped.agg(agg_func)
Example #9
    def test_panel_aggregation(self):
        ind = pd.date_range('1/1/2000', periods=100)
        data = np.random.randn(2, len(ind), 4)
        wp = pd.Panel(data,
                      items=['Item1', 'Item2'],
                      major_axis=ind,
                      minor_axis=['A', 'B', 'C', 'D'])

        tg = TimeGrouper('M', axis=1)
        _, grouper, _ = tg._get_grouper(wp)
        bingrouped = wp.groupby(grouper)
        binagg = bingrouped.mean()

        def f(x):
            assert (isinstance(x, Panel))
            return x.mean(1)

        result = bingrouped.agg(f)
        tm.assert_panel_equal(result, binagg)
Example #10
    def test_custom_grouper(self):

        dti = DatetimeIndex(freq='Min',
                            start=datetime(2005, 1, 1),
                            end=datetime(2005, 1, 10))

        s = Series(np.array([1] * len(dti)), index=dti, dtype='int64')

        b = TimeGrouper(Minute(5))
        g = s.groupby(b)

        # check all cython functions work
        funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
        for f in funcs:
            g._cython_agg_general(f)

        b = TimeGrouper(Minute(5), closed='right', label='right')
        g = s.groupby(b)
        # check all cython functions work
        funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
        for f in funcs:
            g._cython_agg_general(f)

        self.assertEquals(g.ngroups, 2593)
        self.assert_(notnull(g.mean()).all())

        # construct expected val
        arr = [1] + [5] * 2592
        idx = dti[0:-1:5]
        idx = idx.append(dti[-1:])
        expect = Series(arr, index=idx)

        # GH2763 - return input dtype if we can
        result = g.agg(np.sum)
        assert_series_equal(result, expect)

        df = DataFrame(np.random.rand(len(dti), 10),
                       index=dti,
                       dtype='float64')
        r = df.groupby(b).agg(np.sum)

        self.assertEquals(len(r.columns), 10)
        self.assertEquals(len(r.index), 2593)
Example #11
    def test_count(self):
        self.ts[::3] = np.nan

        grouper = TimeGrouper('A', label='right', closed='right')
        result = self.ts.resample('A', how='count')

        expected = self.ts.groupby(lambda x: x.year).count()
        expected.index = result.index

        assert_series_equal(result, expected)
Example #12
def downsample(self, freq, closed=None, label=None, axis=0):
    """
        Essentially use resample logic but reutrning the groupby object
    """
        
    # default closed/label on offset
    defaults = _offset_defaults(freq)
    if closed is None:
        closed = defaults['closed']

    if label is None:
        label = defaults['label']
    tg = TimeGrouper(freq, closed=closed, label=label, axis=axis)
    grouper = tg.get_grouper(self)

    # TODO Get rid of empty bins? 
    #bins = [0] 
    #bins.extend(grouper.bins)
    #periods_in_bin = np.diff(bins)

    return self.groupby(grouper, axis=axis)
Example #13
    def test_apply(self):
        grouper = TimeGrouper('A', label='right', closed='right')

        grouped = self.ts.groupby(grouper)

        f = lambda x: x.order()[-3:]

        applied = grouped.apply(f)
        expected = self.ts.groupby(lambda x: x.year).apply(f)

        applied.index = applied.index.droplevel(0)
        expected.index = expected.index.droplevel(0)
        assert_series_equal(applied, expected)
Example #14
    def test_fails_on_no_datetime_index(self):
        index_names = ('Int64Index', 'PeriodIndex', 'Index', 'Float64Index',
                       'MultiIndex')
        index_funcs = (tm.makeIntIndex, tm.makePeriodIndex,
                       tm.makeUnicodeIndex, tm.makeFloatIndex,
                       lambda m: tm.makeCustomIndex(m, 2))
        n = 2
        for name, func in zip(index_names, index_funcs):
            index = func(n)
            df = DataFrame({'a': np.random.randn(n)}, index=index)
            with tm.assertRaisesRegexp(
                    TypeError, "axis must be a DatetimeIndex, "
                    "but got an instance of %r" % name):
                df.groupby(TimeGrouper('D'))
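The test above pins down the error raised for non-datetime axes, including PeriodIndex. One possible workaround is to convert such an index to timestamps before grouping; a hedged sketch (TimeGrouper import path as used elsewhere in this listing):

import numpy as np
import pandas as pd
from pandas.tseries.resample import TimeGrouper

pi = pd.period_range('2000-01', periods=6, freq='M')
df = pd.DataFrame({'a': np.random.randn(6)}, index=pi)

# Grouping directly on the PeriodIndex raises TypeError (see the test above);
# converting to a DatetimeIndex first gives TimeGrouper what it expects.
quarterly = df.to_timestamp().groupby(TimeGrouper('Q')).sum()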
Example #15
    def resample(self, rule, how=None, axis=0, fill_method=None,
                 closed=None, label=None, convention='start',
                 kind=None, loffset=None, limit=None, base=0):
        """
        Convenience method for frequency conversion and resampling of regular
        time-series data.

        Parameters
        ----------
        rule : the offset string or object representing target conversion
        how : string, method for down- or re-sampling, defaults to 'mean' for
              downsampling
        axis : int, optional, default 0
        fill_method : string, fill_method for upsampling, default None
        closed : {'right', 'left'}
            Which side of bin interval is closed
        label : {'right', 'left'}
            Which bin edge label to label bucket with
        convention : {'start', 'end', 's', 'e'}
        kind: "period"/"timestamp"
        loffset: timedelta
            Adjust the resampled time labels
        limit: int, default None
            Maximum size gap to when reindexing with fill_method
        base : int, default 0
            For frequencies that evenly subdivide 1 day, the "origin" of the
            aggregated intervals. For example, for '5min' frequency, base could
            range from 0 through 4. Defaults to 0
        """
        from pandas.tseries.resample import TimeGrouper
        axis = self._get_axis_number(axis)
        sampler = TimeGrouper(rule, label=label, closed=closed, how=how,
                              axis=axis, kind=kind, loffset=loffset,
                              fill_method=fill_method, convention=convention,
                              limit=limit, base=base)
        return sampler.resample(self)
Example #16
def aggregate_data(df, timescale, method):
    """Aggregate data to given timescale."""

    assert timescale in ['monthly', 'seasonal']
    assert method in ['sum', 'mean']
    timescale_dict = {'monthly': '1M', 'seasonal': '3M'}

    aggregated_data = df.groupby(
        TimeGrouper(freq=timescale_dict[timescale], closed='left'))
    if method == 'sum':
        aggregated_data = aggregated_data.sum()
    elif method == 'mean':
        aggregated_data = aggregated_data.mean()

    # drop the last aggregated period (always discarded by this function)
    aggregated_data = aggregated_data.drop(aggregated_data.index[-1])

    return aggregated_data
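A brief, hypothetical usage sketch for aggregate_data() above, assuming TimeGrouper has been imported in the same module and that df carries a DatetimeIndex (the column name is illustrative). Note that the function always drops the last aggregated period from its result.

import numpy as np
import pandas as pd

rng = pd.date_range('2000-01-01', periods=400, freq='D')
df = pd.DataFrame({'precip': np.random.rand(len(rng))}, index=rng)

monthly_totals = aggregate_data(df, 'monthly', 'sum')    # '1M' bins, summed
seasonal_means = aggregate_data(df, 'seasonal', 'mean')  # '3M' bins, averaged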
Example #17
    def test_resample_frame_basic(self):
        df = tm.makeTimeDataFrame()

        b = TimeGrouper('M')
        g = df.groupby(b)

        # check all cython functions work
        funcs = ['add', 'mean', 'prod', 'min', 'max', 'var']
        for f in funcs:
            g._cython_agg_general(f)

        result = df.resample('A')
        assert_series_equal(result['A'], df['A'].resample('A'))

        result = df.resample('M')
        assert_series_equal(result['A'], df['A'].resample('M'))

        df.resample('M', kind='period')
        df.resample('W-WED', kind='period')
Example #18
    def test_panelgroupby(self):
        def agg_func(pan):
            assert isinstance(pan, pd.Panel)
            return pan.mean()

        ind = pd.date_range('1/1/2000', periods=100)
        data = np.random.randn(2, len(ind), 4)
        wp = pd.Panel(data,
                      items=['Item1', 'Item2'],
                      major_axis=ind,
                      minor_axis=['A', 'B', 'C', 'D'])

        from pandas.tseries.resample import TimeGrouper
        #timegrouper
        tg = TimeGrouper('M', axis=1)
        bingrouped = wp.groupby(tg)
        # Failed 12-15-12
        # https://github.com/pydata/pandas/issues/2537
        bingrouped.agg(agg_func)
Example #19
    def test_resample_basic(self):
        rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min',
                         name='index')
        s = Series(np.random.randn(14), index=rng)
        result = s.resample('5min', how='mean', closed='right', label='right')
        expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()],
                          index=date_range('1/1/2000', periods=4, freq='5min'))
        assert_series_equal(result, expected)
        self.assertEqual(result.index.name, 'index')

        result = s.resample('5min', how='mean', closed='left', label='right')
        expected = Series([s[:5].mean(), s[5:10].mean(), s[10:].mean()],
                          index=date_range('1/1/2000 00:05', periods=3,
                                           freq='5min'))
        assert_series_equal(result, expected)

        s = self.series
        result = s.resample('5Min', how='last')
        grouper = TimeGrouper(Minute(5), closed='left', label='left')
        expect = s.groupby(grouper).agg(lambda x: x[-1])
        assert_series_equal(result, expect)
Example #20
    def test_resample_ohlc(self):
        s = self.series

        grouper = TimeGrouper(Minute(5))
        expect = s.groupby(grouper).agg(lambda x: x[-1])
        result = s.resample('5Min', how='ohlc')

        self.assertEquals(len(result), len(expect))
        self.assertEquals(len(result.columns), 4)

        xs = result.irow(-2)
        self.assertEquals(xs['open'], s[-6])
        self.assertEquals(xs['high'], s[-6:-1].max())
        self.assertEquals(xs['low'], s[-6:-1].min())
        self.assertEquals(xs['close'], s[-2])

        xs = result.irow(0)
        self.assertEquals(xs['open'], s[0])
        self.assertEquals(xs['high'], s[:5].max())
        self.assertEquals(xs['low'], s[:5].min())
        self.assertEquals(xs['close'], s[4])
Example #21
    def test_resample_ohlc(self):
        s = self.series

        grouper = TimeGrouper(Minute(5), closed='right', label='right')
        expect = s.groupby(grouper).agg(lambda x: x[-1])
        result = s.resample('5Min', how='ohlc')

        self.assertEquals(len(result), len(expect))
        self.assertEquals(len(result.columns), 4)

        xs = result.irow(-1)
        self.assertEquals(xs['open'], s[-5])
        self.assertEquals(xs['high'], s[-5:].max())
        self.assertEquals(xs['low'], s[-5:].min())
        self.assertEquals(xs['close'], s[-1])

        xs = result.irow(1)
        self.assertEquals(xs['open'], s[1])
        self.assertEquals(xs['high'], s[1:6].max())
        self.assertEquals(xs['low'], s[1:6].min())
        self.assertEquals(xs['close'], s[5])
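For reference, the 'ohlc' aggregation exercised by the two tests above returns one row per bucket with open/high/low/close columns, i.e. the first, maximum, minimum and last value falling in that bucket. A minimal standalone sketch, again using the legacy how= keyword:

import numpy as np
import pandas as pd

rng = pd.date_range('2000-01-01', periods=20, freq='min')
s = pd.Series(np.random.randn(20), index=rng)

# One row per 5-minute bucket, columns ['open', 'high', 'low', 'close'].
bars = s.resample('5Min', how='ohlc')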
Example #22
def sensorAggregate(oat,
                    aggregation='mean',
                    frequency='D',
                    qilist=None,
                    min_obs=None,
                    nan_data=np.nan,
                    nan_qi=0,
                    closed='left',
                    label='left',
                    column_name=None):
    """
    Aggregate OAT.sensor according specified parameters

        Args:
            oat (OAT.sensor): OAT.sensor object to be aggregated
            aggregation (str): specific aggregation options:
                               {'max', 'min', 'mean', 'count'}, default 'mean'
            qilist (list): list of quality Index values to select observations
                           used in aggregation
            min_obs (float): minumum number of non null values recorded in the
                             period to calculate the aggregation (note that
                             this percentage includes only valid
                             qualityIndexed measures)
            nan_data (float): value to assign in aggregation when no or
                              insufficient data are available
            nan_qi (int): value to assign in aggregation when no or
                          insufficient data are available,
            closed (str): which side of bin interval is closed:
                          {‘right’, ‘left’}, default 'left'
            label (str): which bin edge label to label bucket with:
                         {‘right’, ‘left’}, default 'left'
    """

    try:
        aggregations = {'num': 'count', 'data': aggregation, 'quality': 'min'}
        toat = oat.copy()

        toat.ts['num'] = 1

        if qilist:
            toat.ts = toat.ts[(toat.ts['quality'].isin(qilist)
                               & toat.ts['quality'].notnull())].groupby(
                                   TimeGrouper(freq=frequency,
                                               closed=closed,
                                               label=label)).agg(aggregations)
        else:
            toat.ts = toat.ts.dropna(how='any').groupby(
                TimeGrouper(freq=frequency, closed=closed,
                            label=label)).agg(aggregations)
        toat_values = list(toat.ts.columns.values)

        if min_obs:
            if (toat.ts['num'][0] < min_obs):
                # assign null to non satisfactory
                # toat_values[toat_values.index("num")] = 0
                # toat_values[toat_values.index("data")] = nan_data
                # toat_values[toat_values.index("quality")] = nan_qi
                # toat.ts[toat.ts['num'] < min_obs] = toat_values
                raise Exception(
                    ("The aggregation does not satisfy the minimum" +
                     " number of observations [%s]") % (min_obs))
    except Exception as e:
        raise e
    else:

        # extract only data & quality
        toat.ts = toat.ts[['data', 'quality']]
        if column_name:
            toat.ts.rename(inplace=True, columns={'data': column_name})
        toat.freq = frequency

        return toat
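The OAT.sensor object above is project-specific, but the core of sensorAggregate() is an ordinary TimeGrouper aggregation over a frame with 'data', 'quality' and 'num' columns. A stripped-down, hypothetical sketch of just that step (column names and aggregations mirror the function above):

import numpy as np
import pandas as pd
from pandas.tseries.resample import TimeGrouper

rng = pd.date_range('2017-01-01', periods=48, freq='H')
ts = pd.DataFrame({'data': np.random.rand(len(rng)),
                   'quality': 100,
                   'num': 1}, index=rng)

# Daily bins: count the observations, average the data and keep the worst
# (minimum) quality index per day, as in the aggregations dict above.
daily = ts.groupby(TimeGrouper(freq='D', closed='left', label='left')).agg(
    {'num': 'count', 'data': 'mean', 'quality': 'min'})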
Example #23
def downsample(self, freq, closed='right', label='right', axis=0):
    tg = TimeGrouper(freq, closed=closed, label=label)
    grouper = tg.get_grouper(self)
    return self.groupby(grouper, axis=axis)
Example #25
    def test_resample_basic(self):
        rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min',
                         name='index')
        s = Series(np.random.randn(14), index=rng)
        result = s.resample('5min', how='mean', closed='right', label='right')
        expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()],
                          index=date_range('1/1/2000', periods=4, freq='5min'))
        assert_series_equal(result, expected)
        self.assert_(result.index.name == 'index')

        result = s.resample('5min', how='mean', closed='left', label='right')
        expected = Series([s[:5].mean(), s[5:10].mean(), s[10:].mean()],
                          index=date_range('1/1/2000 00:05', periods=3,
                                           freq='5min'))
        assert_series_equal(result, expected)

        s = self.series
        result = s.resample('5Min', how='last')
        grouper = TimeGrouper(Minute(5), closed='right', label='right')
        expect = s.groupby(grouper).agg(lambda x: x[-1])
        assert_series_equal(result, expect)

        # from daily
        dti = DatetimeIndex(start=datetime(2005,1,1), end=datetime(2005,1,10),
                            freq='D', name='index')

        s = Series(np.random.rand(len(dti)), dti)

        # to weekly
        result = s.resample('w-sun', how='last')

        self.assertEquals(len(result), 3)
        self.assert_((result.index.dayofweek == [6,6,6]).all())
        self.assertEquals(result.irow(0), s['1/2/2005'])
        self.assertEquals(result.irow(1), s['1/9/2005'])
        self.assertEquals(result.irow(2), s.irow(-1))

        result = s.resample('W-MON', how='last')
        self.assertEquals(len(result), 2)
        self.assert_((result.index.dayofweek == [0,0]).all())
        self.assertEquals(result.irow(0), s['1/3/2005'])
        self.assertEquals(result.irow(1), s['1/10/2005'])

        result = s.resample('W-TUE', how='last')
        self.assertEquals(len(result), 2)
        self.assert_((result.index.dayofweek == [1,1]).all())
        self.assertEquals(result.irow(0), s['1/4/2005'])
        self.assertEquals(result.irow(1), s['1/10/2005'])

        result = s.resample('W-WED', how='last')
        self.assertEquals(len(result), 2)
        self.assert_((result.index.dayofweek == [2,2]).all())
        self.assertEquals(result.irow(0), s['1/5/2005'])
        self.assertEquals(result.irow(1), s['1/10/2005'])

        result = s.resample('W-THU', how='last')
        self.assertEquals(len(result), 2)
        self.assert_((result.index.dayofweek == [3,3]).all())
        self.assertEquals(result.irow(0), s['1/6/2005'])
        self.assertEquals(result.irow(1), s['1/10/2005'])

        result = s.resample('W-FRI', how='last')
        self.assertEquals(len(result), 2)
        self.assert_((result.index.dayofweek == [4,4]).all())
        self.assertEquals(result.irow(0), s['1/7/2005'])
        self.assertEquals(result.irow(1), s['1/10/2005'])

        # to biz day
        result = s.resample('B', how='last')
        self.assertEquals(len(result), 6)
        self.assert_((result.index.dayofweek == [0,1,2,3,4,0]).all())
        self.assertEquals(result.irow(0), s['1/3/2005'])
        self.assertEquals(result.irow(1), s['1/4/2005'])
        self.assertEquals(result.irow(5), s['1/10/2005'])
        self.assert_(result.index.name == 'index')
Example #26
def sensorStats(oat,
                stat='mean',
                frequency='D',
                qilist=None,
                min_obs=None,
                nan_data=np.nan,
                nan_qi=0,
                closed='left',
                label='left',
                column_name=None):
    """
    Compute a per-period statistic of OAT.sensor data using TimeGrouper.

    For stat == 'mean' the result holds the period mean and the observation
    count; for 'max'/'min' it holds the full row of the extreme observation
    together with its timestamp.
    """

    try:
        aggregations = {
            'data': [stat, 'count'],
            'quality': 'min',
        }
        toat = oat.copy()
        if stat == 'mean':
            grouped = toat.ts.dropna(how='any').groupby(
                TimeGrouper(freq=frequency, closed=closed,
                            label=label)).agg(aggregations)
            col_list = ['data', 'count']
            df1 = pd.DataFrame(data=None, columns=col_list)
            for i in grouped:
                df1['data'] = grouped[(u'data', 'mean')]
                df1['count'] = grouped[(u'data', 'count')]
        else:
            grouped = toat.ts.dropna(how='any').groupby(
                TimeGrouper(freq=frequency, closed=closed, label=label))
            col_list = list(toat.ts.columns.values)
            col_list.append(u'time')
            df1 = pd.DataFrame(data=None, columns=col_list)
            for i in grouped:
                df = i[1]
                df.loc[:, u'time'] = df.index
                if not df.empty:
                    if stat == 'max':
                        df1.loc[i[0]] = df.loc[df['data'].idxmax()]
                    else:
                        df1.loc[i[0]] = df.loc[df['data'].idxmin()]
            toat.ts = df1
    except Exception as e:
        raise e
    else:

        # extract only data & quality
        if stat == 'mean':
            toat.ts = df1[['data', 'count']]
        else:
            toat.ts = toat.ts[['data', 'quality', 'time']]
        if column_name:
            if stat == 'mean':
                toat.ts.rename(inplace=True,
                               columns={
                                   'data': column_name,
                                   'count': '{}_COUNT'.format(column_name)
                               })
            else:
                toat.ts.rename(inplace=True,
                               columns={
                                   'data': column_name,
                                   'time': 'TIME_' + column_name
                               })
        toat.freq = frequency

        return toat