Пример #1
0
    def test_daylight_savings(self):
        # 2004 daylight savings switches:
        # Sunday 2004-04-04 and Sunday 2004-10-31

        # make sure there's no weirdness around calculating the next day's
        # session's open time.

        for date in ["2004-04-05", "2004-11-01"]:
            next_day = pd.Timestamp(date, tz='UTC')
            open_date = next_day + Timedelta(days=self.calendar.open_offset)

            the_open = self.calendar.schedule.loc[next_day].market_open

            localized_open = the_open.tz_localize("UTC").tz_convert(
                self.calendar.tz
            )

            self.assertEqual(
                (open_date.year, open_date.month, open_date.day),
                (localized_open.year, localized_open.month, localized_open.day)
            )

            self.assertEqual(
                self.calendar.open_time.hour,
                localized_open.hour
            )

            self.assertEqual(
                self.calendar.open_time.minute,
                localized_open.minute
            )
Пример #2
0
    def test_bar_count_for_simple_transforms(self):
        # July 2015
        # Su Mo Tu We Th Fr Sa
        #           1  2  3  4
        #  5  6  7  8  9 10 11
        # 12 13 14 15 16 17 18
        # 19 20 21 22 23 24 25
        # 26 27 28 29 30 31

        # half an hour into july 9, getting a 4-"day" window should get us
        # all the minutes of 7/6, 7/7, 7/8, and 31 minutes of 7/9

        july_9_dt = self.trading_calendar.open_and_close_for_session(
            pd.Timestamp("2015-07-09", tz='UTC')
        )[0] + Timedelta("30 minutes")

        self.assertEqual(
            (3 * 390) + 31,
            self.data_portal._get_minute_count_for_transform(july_9_dt, 4)
        )

        #    November 2015
        # Su Mo Tu We Th Fr Sa
        #  1  2  3  4  5  6  7
        #  8  9 10 11 12 13 14
        # 15 16 17 18 19 20 21
        # 22 23 24 25 26 27 28
        # 29 30

        # nov 26th closed
        # nov 27th was an early close

        # half an hour into nov 30, getting a 4-"day" window should get us
        # all the minutes of 11/24, 11/25, 11/27 (half day!), and 31 minutes
        # of 11/30
        nov_30_dt = self.trading_calendar.open_and_close_for_session(
            pd.Timestamp("2015-11-30", tz='UTC')
        )[0] + Timedelta("30 minutes")

        self.assertEqual(
            390 + 390 + 210 + 31,
            self.data_portal._get_minute_count_for_transform(nov_30_dt, 4)
        )
Пример #3
0
def split2sessions(df):
    """
    Split an interaction log into sessions and keep those containing a
    textual search.

    Rows are consumed in the order ``df.iterrows()`` yields them.  A new
    session starts when either

    * more than ``threshold`` (currently 10 minutes) has elapsed since the
      first row of the current session, or
    * the row's ``user_id`` differs from the previous row's.

    A session is kept only if at least one of its rows is a textual search:
    ``label == 'search - search box'``, ``items`` contains
    ``'search videos. text:'`` and ``items`` is more than two characters
    longer than that prefix.  The same rule applies to the trailing session.

    Rows that raise while being processed are printed and skipped
    (best-effort parsing).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows unpack positionally into exactly seven values:
        ``(id, datetime, geo, user_id, companyid, label, items)``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per kept session, with the columns of ``df`` and rows
        labelled 0, 1, 2, ... within the session.
    """
    textual_search = 'search videos. text:'
    start = len(textual_search)
    threshold = Timedelta(minutes=10)  # TODO 30 min

    sessions = []
    session = pd.DataFrame(columns=df.columns)
    index = 0           # next row label inside the current session
    first_date = None   # timestamp of the current session's first row
    last_id = None      # user_id of the previously accepted row
    searched = False    # does the current session contain a textual search?

    for idx, (_, row) in enumerate(df.iterrows()):
        try:
            (row_id, when, _geo, user_id,
             _companyid, label, items) = row.values

            if idx % 1000 == 0:
                print('passed line #%d in split2sessions, # of search seassions is: %d, for id:%d' % (
                    idx, len(sessions), row_id))

            # Compare against None explicitly: a falsy but valid value
            # (e.g. user id 0) must still count as "there is a previous row".
            if (first_date is not None and when - first_date > threshold) or (
                    last_id is not None and last_id != user_id):
                if searched:
                    sessions.append(session)
                index = 0
                session = pd.DataFrame(columns=df.columns)
                searched = False
                first_date = None

            # Detect the search only AFTER the boundary handling above, so a
            # search on the first row of a new session is credited to that
            # new session and not to the one that has just been closed.
            if (label == 'search - search box' and textual_search in items
                    and len(items) > start + 2):
                searched = True

            session.loc[index] = row
            index += 1
            last_id = user_id
            if first_date is None:
                first_date = when

        except Exception as ex:
            print(ex)

    # Flush the trailing session under the same rule as every other one.
    if searched and len(session) > 0:
        sessions.append(session)

    print('# of sesseions' + str(len(sessions)))
    return sessions
Пример #4
0
def matplotlib_locator_formatter(timedelta, span=1):
    """
    Choose a tick locator and tick formatter for matplotlib-based
    renderers, depending on the designated time span.

    The default pair comes from ``date_ticker_factory(span)``.  The
    formatter is then replaced for short spans:

    * ``timedelta`` of at most five minutes -> ``'%H:%M:%S'``
    * otherwise, ``timedelta`` of at most one day -> ``'%H:%M'``

    Returns the tuple ``(locator, formatter)``.
    """
    from matplotlib.dates import date_ticker_factory, DateFormatter

    # Background on comparing pandas Timedelta values:
    # http://pandas.pydata.org/pandas-docs/stable/timedeltas.html
    # https://stackoverflow.com/questions/16103238/pandas-timedelta-in-days
    within_a_day = timedelta <= Timedelta(days=1)
    within_five_minutes = timedelta <= Timedelta(minutes=5)

    locator, formatter = date_ticker_factory(span)

    # The finest matching resolution wins: seconds are only shown for very
    # short spans, plain hours and minutes for anything up to a day.
    if within_five_minutes:
        formatter = DateFormatter(fmt='%H:%M:%S')
    elif within_a_day:
        formatter = DateFormatter(fmt='%H:%M')

    return locator, formatter
Пример #5
0
def test_interpolate_gps_time():
    """
    Fit the GPS time interpolation model on the rover SPP host offsets of a
    recorded log and check the timestamps it produces for the rover logs.

    Requires the HDF5 fixture under ``data/`` to be present.
    """
    filename = "data/serial-link-20150429-163230.log.json.hdf5"
    assert os.path.isfile(filename)

    with pd.HDFStore(filename) as store:
        spp_host_offset = store.rover_spp.T.host_offset

        # The model is fitted on the host offsets with their index reset.
        model = t.interpolate_gpst_model(spp_host_offset.reset_index())
        assert isinstance(model, pd.stats.ols.OLS)
        fitted = [model.beta.x, model.beta.intercept]
        assert np.allclose(fitted, [1.00000368376, -64.2579561376])

        init_offset = spp_host_offset[0]
        init_date = store.rover_spp.T.index[0]

        def to_gps_time(host_offset):
            # Host offsets are scaled by MSEC_TO_SEC before being applied.
            return t.apply_gps_time(host_offset * t.MSEC_TO_SEC,
                                    init_date, model)

        dates = store.rover_logs.T.host_offset.apply(to_gps_time)
        as_list = dates.tolist()
        start = as_list[0]
        end = as_list[-1]
        assert start == Timestamp("2015-04-29 23:32:55.272075")
        assert end == Timestamp("2015-04-29 23:57:46.457568")

        # Gap between the first SPP host offset and the first log index.
        init_secs_offset = init_offset - store.rover_logs.T.index[0]
        assert np.allclose([init_secs_offset * t.MSEC_TO_SEC], [55.859])

        assert (init_date - start) == Timedelta('0 days 00:00:55.848925')
        assert (end - init_date) == Timedelta('0 days 00:23:55.336568')
        assert pd.DatetimeIndex(dates).is_monotonic_increasing
        assert dates.shape == (2457, )
Пример #6
0
def to_offset(freqstr):
    """
    Return DateOffset object from string representation or
    Timedelta object

    Parameters
    ----------
    freqstr : str, tuple, timedelta, DateOffset or None
        * ``None`` and ``DateOffset`` instances are returned unchanged.
        * A tuple is read as ``(name, stride)``; the two are swapped when
          the second element is a string.
        * A ``timedelta`` is converted to ``Timedelta`` and every non-zero
          component is mapped to an offset and summed.
        * Anything else is parsed as a frequency string made of one or
          more ``<stride><name>`` pieces, separated only by whitespace.

    Returns
    -------
    delta : DateOffset
        None if freqstr is None

    Raises
    ------
    ValueError
        If a timedelta or string input cannot be converted, including
        strings that contain text the frequency pattern does not match.

    Examples
    --------
    >>> to_offset('5Min')
    Minute(5)
    """
    if freqstr is None:
        return None

    if isinstance(freqstr, DateOffset):
        return freqstr

    if isinstance(freqstr, tuple):
        name = freqstr[0]
        stride = freqstr[1]
        if isinstance(stride, compat.string_types):
            # accept (stride, name) as well as (name, stride)
            name, stride = stride, name
        name, _ = _base_and_stride(name)
        delta = get_offset(name) * stride

    elif isinstance(freqstr, timedelta):
        delta = None
        freqstr = Timedelta(freqstr)
        try:
            # sum one offset per non-zero Timedelta component
            for name in freqstr.components._fields:
                offset = _name_to_offset_map[name]
                stride = getattr(freqstr.components, name)
                if stride != 0:
                    offset = stride * offset
                    if delta is None:
                        delta = offset
                    else:
                        delta = delta + offset
        except Exception:
            raise ValueError(_INVALID_FREQ_ERROR.format(freqstr))

    else:
        delta = None
        stride_sign = None
        try:
            # ``findall`` would silently skip any text the pattern does not
            # match, so malformed strings were accepted.  ``split`` keeps the
            # text between matches: with the pattern's three groups the
            # result is [sep, stride, name, group3, sep, ..., tail], and
            # every ``sep`` and the ``tail`` must be blank.
            splitted = opattern.split(freqstr)
            if splitted[-1] != '' and not splitted[-1].isspace():
                # the last element must be blank
                raise ValueError('last element must be blank')
            for sep, stride, name in zip(splitted[0::4], splitted[1::4],
                                         splitted[2::4]):
                if sep != '' and not sep.isspace():
                    raise ValueError('separator must be spaces')
                offset = get_offset(name)
                # the sign of the first stride applies to the whole string
                if stride_sign is None:
                    stride_sign = -1 if stride.startswith('-') else 1
                if not stride:
                    stride = 1
                stride = int(stride)
                offset = offset * int(np.fabs(stride) * stride_sign)
                if delta is None:
                    delta = offset
                else:
                    delta = delta + offset
        except Exception:
            raise ValueError(_INVALID_FREQ_ERROR.format(freqstr))

    if delta is None:
        raise ValueError(_INVALID_FREQ_ERROR.format(freqstr))

    return delta
Пример #7
0
def to_offset(freq):
    """
    Convert a frequency specification into a DateOffset.

    Accepts a frequency string, a ``(name, stride)`` / ``(stride, name)``
    tuple, a datetime.timedelta, an existing DateOffset, or None.

    Parameters
    ----------
    freq : str, tuple, datetime.timedelta, DateOffset or None

    Returns
    -------
    delta : DateOffset
        None if freq is None

    Raises
    ------
    ValueError
        If freq is an invalid frequency

    See Also
    --------
    pandas.DateOffset

    Examples
    --------
    >>> to_offset('5min')
    <5 * Minutes>

    >>> to_offset('1D1H')
    <25 * Hours>

    >>> to_offset(('W', 2))
    <2 * Weeks: weekday=6>

    >>> to_offset((2, 'B'))
    <2 * BusinessDays>

    >>> to_offset(datetime.timedelta(days=1))
    <Day>

    >>> to_offset(Hour())
    <Hour>
    """
    # Pass-through cases.
    if freq is None:
        return None
    if isinstance(freq, DateOffset):
        return freq

    def _is_blank(text):
        # Empty or whitespace-only.
        return text == '' or text.isspace()

    def _accumulate(total, term):
        # Running sum of offsets, starting from "nothing yet".
        return term if total is None else total + term

    result = None

    if isinstance(freq, tuple):
        base, count = freq[0], freq[1]
        if isinstance(count, compat.string_types):
            # The tuple was given as (stride, name); put it the right way.
            base, count = count, base
        base, _ = _base_and_stride(base)
        result = get_offset(base) * count

    elif isinstance(freq, timedelta):
        freq = Timedelta(freq)
        try:
            components = freq.components
            for field in components._fields:
                unit = _name_to_offset_map[field]
                amount = getattr(components, field)
                if amount != 0:
                    result = _accumulate(result, amount * unit)
        except Exception:
            raise ValueError(_INVALID_FREQ_ERROR.format(freq))

    else:
        sign = None
        try:
            # Splitting on the pattern interleaves the unmatched text with
            # the captured groups: [sep, stride, name, <group 3>, sep, ...].
            pieces = re.split(opattern, freq)
            if not _is_blank(pieces[-1]):
                # the last element must be blank
                raise ValueError('last element must be blank')

            triples = zip(pieces[0::4], pieces[1::4], pieces[2::4])
            for gap, count, base in triples:
                if not _is_blank(gap):
                    raise ValueError('separator must be spaces')
                unit = get_offset(base)
                # The sign of the first stride is applied to every term.
                if sign is None:
                    sign = -1 if count.startswith('-') else 1
                count = int(count) if count else 1
                unit = unit * int(np.fabs(count) * sign)
                result = _accumulate(result, unit)
        except Exception:
            raise ValueError(_INVALID_FREQ_ERROR.format(freq))

    if result is None:
        raise ValueError(_INVALID_FREQ_ERROR.format(freq))

    return result