Example #1
    def start(
        self,
        dirs,
        output_dir,
        analysis_start_date,
        analysis_end_date,
        analysis_timespan,
        cell_execution_timeout,
        make_configs,
        backend,
    ):
        '''
        Initiate new project.
        No files will be touched!

        Parameters
        ----------
        dirs: list, optional
            List of sub-directory names that should be used in the project.
            By default, all subdirectories defined in the constructor are taken into account.
        '''

        # set output_dir
        self.output_dir = output_dir
        if self.output_dir is None:
            self.output_dir = os.path.join('.', self.project_name)

        # set backend binary format to read/write dataframes
        self.backend = backend

        # analysis timespan
        self.analysis_timespan = analysis_timespan
        if not isinstance(self.analysis_timespan, pd.Timedelta):
            try:
                self.analysis_timespan = pd.Timedelta(self.analysis_timespan)
            except Exception as e:
                logging.error(e)

        # analysis start date
        self.analysis_start_date = analysis_start_date
        if self.analysis_start_date is None:
            self.analysis_start_date = (pd.Timestamp.today()
                                        - self.analysis_timespan)

        # analysis end date
        # defaults to today
        self.analysis_end_date = analysis_end_date
        if self.analysis_end_date is None:
            self.analysis_end_date = pd.Timestamp.today()

        # re-calculate timespan as it might be wrong due to overwritten start or end date
        self.analysis_timespan = self.analysis_end_date - self.analysis_start_date

        # set the exec timeout of a single cell for notebooks execution
        self.cell_execution_timeout = cell_execution_timeout

        # set make_configs
        self.make_configs = make_configs

        # dict to store successful execution dates
        self.execution_dates_make_configs = {}

        # init working directories
        for sub_dir in dirs:
            self.__dict__[sub_dir] = RdsFs(
                os.path.join(self.output_dir, sub_dir),
                nof_processes=self.nof_processes,
                backend=self.backend,
            )

        # save project properties in defs
        self.__kwargs2defs()

        logging.info('Project "%s" created' % self.project_name)
        self._status('started')
        self.save()
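
The date-window defaulting used above (start falls back to today minus the analysis span, end falls back to today, and the span is then re-derived from the two bounds) can be sketched on its own; the 30-day span below is an arbitrary stand-in, and pd.Timestamp.today() is used in place of the removed pd.datetime:

import pandas as pd

# Minimal sketch of the defaulting logic in start(), assuming a 30-day span.
analysis_timespan = pd.Timedelta('30 days')
analysis_end_date = pd.Timestamp.today()
analysis_start_date = analysis_end_date - analysis_timespan

# re-calculate the timespan from the (possibly overridden) bounds, as start() does
analysis_timespan = analysis_end_date - analysis_start_date
print(analysis_start_date, analysis_end_date, analysis_timespan)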
Example #2
def _get_good_sections(df, sample_period):
    """
    Code copied from nilmtk[1]/nilmtk/stats/goodsections.py
    
    [1] https://github.com/nilmtk/nilmtk/
    """
    index = df.dropna().sort_index().index
    df_time_end = df.index[-1] + pd.Timedelta(seconds=sample_period)
    del df

    if len(index) < 2:
        return []

    timedeltas_sec = timedelta64_to_secs(np.diff(index.values))
    timedeltas_check = timedeltas_sec <= sample_period

    # Memory management
    del timedeltas_sec
    gc.collect()

    timedeltas_check = np.concatenate([[False], timedeltas_check])
    transitions = np.diff(timedeltas_check.astype(int))

    # Memory management
    last_timedeltas_check = timedeltas_check[-1]
    del timedeltas_check
    gc.collect()

    good_sect_starts = list(index[:-1][transitions == 1])
    good_sect_ends = list(index[:-1][transitions == -1])

    # Memory management
    last_index = index[-1]
    del index
    gc.collect()

    # Work out if this chunk ends with an open ended good section
    if len(good_sect_ends) == 0:
        ends_with_open_ended_good_section = (len(good_sect_starts) > 0)
    elif len(good_sect_starts) > 0:
        # We have good_sect_ends and good_sect_starts
        ends_with_open_ended_good_section = (good_sect_ends[-1] <
                                             good_sect_starts[-1])
    else:
        # We have good_sect_ends but no good_sect_starts
        ends_with_open_ended_good_section = False

    if ends_with_open_ended_good_section:
        good_sect_ends += [df_time_end]

    assert len(good_sect_starts) == len(good_sect_ends)

    sections = [
        TimeFrame(start, end)
        for start, end in zip(good_sect_starts, good_sect_ends)
        if not (start == end and start is not None)
    ]

    # Memory management
    del good_sect_starts
    del good_sect_ends
    gc.collect()

    return sections
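
The gap test above reduces to comparing consecutive index differences against the sample period; a self-contained sketch (sample period and timestamps invented for illustration):

import numpy as np
import pandas as pd

sample_period = 6  # seconds, assumed for illustration
index = pd.to_datetime(['2020-01-01 00:00:00', '2020-01-01 00:00:06',
                        '2020-01-01 00:00:12', '2020-01-01 00:05:00',
                        '2020-01-01 00:05:06'])
gaps_sec = np.diff(index.values).astype('timedelta64[s]').astype(float)
is_good = np.concatenate([[False], gaps_sec <= sample_period])
transitions = np.diff(is_good.astype(int))  # 1 marks a section start, -1 a section end
print(transitions)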
Example #3
def getbc_intraday(symbol,
                   start=None,
                   end=None,
                   minutes=5,
                   showUrl=False,
                   key=None):
    '''
    Note that getHistory will return the previous day's prices until 15 minutes after the
        market closes. We will generate a warning if our start or end date differs from the
        date of the response. Given today's date at 14:00, it will retrieve the previous
        business day's data. Given no start parameter, we will return data for the last
        weekday, today or earlier. We will return everything between start and end; it may
        be incomplete. It now limits yesterday's data: at 3:00, the latest I get is
        yesterday up to 12 noon.
    Retrieve candle data measured in minutes as given in the minutes parameter.
    :params start: A datetime object or time string to indicate the begin time for the data. By
        default, start will be set to the most recent weekday at market open.
    :params end: A datetime object or time string to indicate the end time for the data.
    :params minutes: An int for the candle time, 5 minute, 15 minute etc.
    :return (status, data): A tuple of (status as dictionary, data as a DataFrame). This status is
        separate from the request status_code.
    :raise: ValueError if response.status_code is not 200.
    '''
    if getLimitReached('bc'):
        msg = 'BarChart limit was reached'
        logging.info(msg)
        return {'code': 666, 'message': msg}, pd.DataFrame(), None

    logging.info(
        '======= Called Barchart -- 150 call limit, data available after market close ======='
    )
    if not end:
        tdy = dt.datetime.today()
        end = dt.datetime(tdy.year, tdy.month, tdy.day, 17, 0)
    # end

    if not start:
        tdy = dt.datetime.today()
        start = dt.datetime(tdy.year, tdy.month, tdy.day, 6, 0)
        start = getLastWorkDay(start)
    end = pd.to_datetime(end)
    start = pd.to_datetime(start)
    # startDay = start.strftime("%Y%m%d")

    # Get the maximum data in order to set the 200 MA on a 60 minute chart
    fullstart = pd.Timestamp.today()
    fullstart = fullstart - pd.Timedelta(days=40)
    fullstart = fullstart.strftime("%Y%m%d")

    params = setParams(symbol, minutes, fullstart, key=key)

    response = requests.get(BASE_URL, params=params)
    if showUrl:
        logging.info(response.url)

    if response.status_code != 200:
        raise Exception(
            f"{response.status_code}: {response.content.decode('utf-8')}")
    meta = {'code': 200}
    if (response.text and isinstance(response.text, str)
            and response.text.startswith('You have reached')):
        d = pd.Timestamp.now()
        # 3 AM tomorrow; using a Timedelta avoids overflow at month end
        dd = d.normalize() + pd.Timedelta(days=1, hours=3)
        setLimitReached('bc', dd)

        logging.warning(f'API max queries: {response.text}')
        meta['message'] = response.text
        return meta, pd.DataFrame(), None

    result = response.json()
    if not result['results']:
        logging.warning(
            'Failed to retrieve any data. Barchart sends the following greeting: %s',
            result['status'])
        return result['status'], pd.DataFrame(), None

    meta['message'] = result['status']['message']
    df = pd.DataFrame(result['results'])

    for i, row in df.iterrows():
        d = pd.Timestamp(row['timestamp'])
        newd = pd.Timestamp(d.year, d.month, d.day, d.hour, d.minute, d.second)
        df.at[i, 'timestamp'] = newd

    df.set_index(df.timestamp, inplace=True)
    df.index.rename('date', inplace=True)
    maDict = movingAverage(df.close, df, start)

    if start > df.index[0]:
        rstart = df.index[0]
        rend = df.index[-1]
        df = df.loc[df.index >= start]
        for ma in maDict:
            maDict[ma] = maDict[ma].loc[maDict[ma].index >= start]

        lendf = len(df)
        if lendf == 0:
            msg = '\nWARNING: all data has been removed.'
            msg = msg + f'\nThe requested start was ({start}).'
            msg = msg + f'\nBarchart returned data beginning {rstart} and ending {rend}. '
            msg += 'If you are seeking a chart from today, it is possible Barchart has not made '
            msg += 'the data available yet. (Should be available by 4:45PM but they are occasionally late.) '
            msg += 'You can copy the image yourself, wait, or try a different API. Open File->StockAPI'
            logging.warning(msg)
            meta['code2'] = 199
            meta['message'] = meta['message'] + msg
            return meta, df, maDict

    if end < df.index[-1]:
        df = df.loc[df.index <= end]
        for ma in maDict:
            maDict[ma] = maDict[ma].loc[maDict[ma].index <= end]

        # If we just sliced off all our data. Set warning message
        lendf = len(df)
        if lendf == 0:
            msg = '\nWARNING: all data has been removed.'
            msg = msg + f'\nThe requested end was ({end}).'
            meta['code2'] = 199
            meta['message'] = meta['message'] + msg
            logging.warning(f'{meta}')
            return meta, df, maDict

    deleteMe = list()
    for key in maDict:
        if key == 'vwap':
            continue
        if len(df) != len(maDict[key]):
            deleteMe.append(key)
    for key in deleteMe:
        del maDict[key]

    # Note we are dropping columns ['symbol', 'timestamp', 'tradingDay'] in favor of ohlcv
    df = df[['open', 'high', 'low', 'close', 'volume']].copy(deep=True)
    return meta, df, maDict
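
Stripped of the API call, the start/end trimming near the end of getbc_intraday is plain boolean slicing on a DatetimeIndex; a standalone sketch on dummy data:

import pandas as pd

idx = pd.date_range('2021-03-01 09:30', periods=6, freq='5min')
df = pd.DataFrame({'close': range(6)}, index=idx)
start = pd.to_datetime('2021-03-01 09:40')
end = pd.to_datetime('2021-03-01 09:55')
df = df.loc[(df.index >= start) & (df.index <= end)]
print(df)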
Example #4
class TestNumericArraylikeArithmeticWithTimedeltaLike(object):

    # TODO: also check name retention
    @pytest.mark.parametrize('box_cls', [np.array, pd.Index, pd.Series])
    @pytest.mark.parametrize('left', [
        pd.RangeIndex(10, 40, 10)] + [cls([10, 20, 30], dtype=dtype)
                                      for dtype in ['i1', 'i2', 'i4', 'i8',
                                                    'u1', 'u2', 'u4', 'u8',
                                                    'f2', 'f4', 'f8']
                                      for cls in [pd.Series, pd.Index]],
        ids=lambda x: type(x).__name__ + str(x.dtype))
    def test_mul_td64arr(self, left, box_cls):
        # GH#22390
        right = np.array([1, 2, 3], dtype='m8[s]')
        right = box_cls(right)

        expected = pd.TimedeltaIndex(['10s', '40s', '90s'])
        if isinstance(left, pd.Series) or box_cls is pd.Series:
            expected = pd.Series(expected)

        result = left * right
        tm.assert_equal(result, expected)

        result = right * left
        tm.assert_equal(result, expected)

    # TODO: also check name retention
    @pytest.mark.parametrize('box_cls', [np.array, pd.Index, pd.Series])
    @pytest.mark.parametrize('left', [
        pd.RangeIndex(10, 40, 10)] + [cls([10, 20, 30], dtype=dtype)
                                      for dtype in ['i1', 'i2', 'i4', 'i8',
                                                    'u1', 'u2', 'u4', 'u8',
                                                    'f2', 'f4', 'f8']
                                      for cls in [pd.Series, pd.Index]],
        ids=lambda x: type(x).__name__ + str(x.dtype))
    def test_div_td64arr(self, left, box_cls):
        # GH#22390
        right = np.array([10, 40, 90], dtype='m8[s]')
        right = box_cls(right)

        expected = pd.TimedeltaIndex(['1s', '2s', '3s'])
        if isinstance(left, pd.Series) or box_cls is pd.Series:
            expected = pd.Series(expected)

        result = right / left
        tm.assert_equal(result, expected)

        result = right // left
        tm.assert_equal(result, expected)

        with pytest.raises(TypeError):
            left / right

        with pytest.raises(TypeError):
            left // right

    # TODO: de-duplicate with test_numeric_arr_mul_tdscalar
    def test_ops_series(self):
        # regression test for GH#8813
        td = Timedelta('1 day')
        other = pd.Series([1, 2])
        expected = pd.Series(pd.to_timedelta(['1 day', '2 days']))
        tm.assert_series_equal(expected, td * other)
        tm.assert_series_equal(expected, other * td)

    # TODO: also test non-nanosecond timedelta64 and Tick objects;
    #  see test_numeric_arr_rdiv_tdscalar for note on these failing
    @pytest.mark.parametrize('scalar_td', [
        Timedelta(days=1),
        Timedelta(days=1).to_timedelta64(),
        Timedelta(days=1).to_pytimedelta()],
        ids=lambda x: type(x).__name__)
    def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box):
        # GH#19333
        index = numeric_idx

        expected = pd.timedelta_range('0 days', '4 days')

        index = tm.box_expected(index, box)
        expected = tm.box_expected(expected, box)

        result = index * scalar_td
        tm.assert_equal(result, expected)

        commute = scalar_td * index
        tm.assert_equal(commute, expected)

    def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box):

        if box is not pd.Index and isinstance(three_days, pd.offsets.Tick):
            raise pytest.xfail("Tick division not implemented")

        index = numeric_idx[1:3]

        expected = TimedeltaIndex(['3 Days', '36 Hours'])

        index = tm.box_expected(index, box)
        expected = tm.box_expected(expected, box)

        result = three_days / index
        tm.assert_equal(result, expected)

        with pytest.raises(TypeError):
            index / three_days

    @pytest.mark.parametrize('other', [
        pd.Timedelta(hours=31),
        pd.Timedelta(hours=31).to_pytimedelta(),
        pd.Timedelta(hours=31).to_timedelta64(),
        pd.Timedelta(hours=31).to_timedelta64().astype('m8[h]'),
        np.timedelta64('NaT'),
        np.timedelta64('NaT', 'D'),
        pd.offsets.Minute(3),
        pd.offsets.Second(0)])
    def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box):
        left = tm.box_expected(numeric_idx, box)
        with pytest.raises(TypeError):
            left + other
        with pytest.raises(TypeError):
            other + left
        with pytest.raises(TypeError):
            left - other
        with pytest.raises(TypeError):
            other - left
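
The behaviour these tests pin down can be shown directly (the expected values come from the tests themselves): multiplying a numeric Index by a timedelta64 array yields a TimedeltaIndex, and dividing a timedelta64 array by the Index works in the reverse direction.

import numpy as np
import pandas as pd

left = pd.Index([10, 20, 30])
print(left * np.array([1, 2, 3], dtype='m8[s]'))      # 10s, 40s, 90s
print(np.array([10, 40, 90], dtype='m8[s]') / left)   # 1s, 2s, 3s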
Example #5
    def prev(self, *arg, **kwarg):
        """Load the previous orbit into .data.

        Note
        ----
        Forms complete orbits across day boundaries. If no data loaded
        then the last orbit of data from the last day is loaded into .data.
        """

        # first, check if data exists
        if not self.sat.empty:
            # set up orbit metadata
            self._calcOrbits()
            # if not close to the first orbit, just pull the previous orbit

            if (self._current > 2) & (self._current <= self.num):
                # load orbit and put it into self.sat.data
                self._getBasicOrbit(orbit=self._current - 1)
                print('Loaded Orbit:%i' % (self._current - 1))

            # if current orbit near the first, must be careful
            elif self._current == 2:
                # first, load prev orbit data
                self._getBasicOrbit(orbit=self._current - 1)

                load_prev = True
                if self.sat._iter_type == 'date':
                    delta = self.sat.index[-1] - self.sat.date
                    if delta >= self.orbit_period:
                        # don't need to load the prev day because this orbit
                        # ends more than an orbital period from start of today's
                        # date
                        load_prev = False

                if load_prev:
                    # need to save this current orbit and load the prev day
                    temp_orbit_data = self.sat[self.sat.date:]
                    # load previous day, which clears orbit breaks info

                    try:
                        self.sat.prev()
                        # combine this next day orbit with previous last orbit
                        if not self.sat.empty:
                            self.sat.data = \
                                self.sat.concat_data([self.sat.data,
                                                      temp_orbit_data])
                            # select first orbit of combined data
                            self._getBasicOrbit(orbit=-1)
                        else:
                            self.sat.next()
                            self._getBasicOrbit(orbit=1)
                    except StopIteration:
                        # if loading the first orbit of the first day of data,
                        # you'll end up here as the attempt to make a full
                        # orbit will move the date backwards, and StopIteration
                        # is raised. Everything is already OK, just move along
                        pass

                    del temp_orbit_data

                print('Loaded Orbit:%i' % (self._current - 1))

            elif self._current == 0:
                self.load(orbit=-1)
                return

            elif self._current < 2:
                # first, load prev orbit data
                self._getBasicOrbit(orbit=1)
                # need to save this current orbit and load the prev day
                temp_orbit_data = self.sat[self.sat.date:]
                # load previous day, which clears orbit breaks info
                self.sat.prev()
                # combine this next day orbit with previous last orbit

                if not self.sat.empty:
                    load_prev = True
                    if self.sat._iter_type == 'date':
                        delta = self.sat.date - self.sat.index[-1] \
                                + pds.Timedelta('1 day')
                        if delta >= self.orbit_period:
                            # don't need to load the prev day because this
                            # orbit ends more than an orbital period from start
                            # of today's date
                            load_prev = False

                    if load_prev:
                        self.sat.data = self.sat.concat_data([self.sat.data,
                                                              temp_orbit_data])
                        # select second to last orbit of combined data
                        self._getBasicOrbit(orbit=-2)
                    else:
                        # padding from the previous is needed
                        self._getBasicOrbit(orbit=-1)
                        if self.sat._iter_type == 'date':
                            delta = self.sat.date - self.sat.index[-1] \
                                    + pds.Timedelta('1 day')
                            if delta < self.orbit_period:
                                self._current = self.num
                                self.prev()
                else:
                    while self.sat.empty:
                        self.sat.prev()
                    self._getBasicOrbit(orbit=-1)

                del temp_orbit_data
                print('Loaded Orbit:%i' % (self._current - 1))

            else:
                raise Exception(' '.join(('You ended up where nobody should',
                                          'ever be. Talk to someone about',
                                          'this fundamental failure or open',
                                          'an issue at',
                                          'www.github.com/rstonback/pysat')))
            # includes hack to appear to be zero indexed
        else:
            # no data
            while self.sat.empty:
                self.sat.prev()  # raises StopIteration at end of dataset
            self.prev()
Example #6
ty.index = pd.to_datetime(ty.index)
under_fut = data['under_fut']
#fill in the calendar of the expiry dates in Python
c = calendar.Calendar(firstweekday=calendar.SATURDAY)
s_exp_dt = pd.Series(index=under_fut.index, name='OPT_EXP_DT')
for row in under_fut.itertuples():
    contract = row.Index
    del_dt = row.FUT_DLV_DT_FIRST
    monthcal = c.monthdatescalendar(del_dt.year, del_dt.month-1)
    s_exp_dt.loc[contract] = monthcal[3][-1]  #fourth friday of the month
under_fut[s_exp_dt.name] = pd.to_datetime(s_exp_dt)

ty = ty.merge(under_fut, right_on='OPT_EXP_DT',left_on='ticker',right_index=True)

#assign the time to expiry
ty['opt_tau_act365'] = (ty['OPT_EXP_DT'] - ty.index) / pd.Timedelta('365 days')
ty.loc[ty['OPT_EXP_DT'] < ty.index, 'opt_tau_act365'] = 0

# do some data correction: remove some days where the vols are negative and unusable
ty.drop(labels=DATES_TO_KILL, errors='ignore', inplace=True)

put_ivols = ['put_10d', 'put_25d', 'put_40d', 'put_50d', 'put_60d', 'put_75d', 'put_90d', 'hist_put_ivol']
call_ivols = ['call_90d', 'call_75d', 'call_60d', 'call_50d', 'call_40d', 'call_25d', 'call_10d', 'hist_call_ivol']
dt = ty.index.intersection(DATES_CALLS_TO_PUTS)
ty.loc[dt,put_ivols] = ty.loc[dt,call_ivols].values

# summarise the volatilities:
# note that the average does not work as the in-the-money option implied
# vols are sometimes wrong....
ty['atm_ivol'] = (ty['put_50d'] + ty['call_50d']) / 2 / 100
ty['10dp_ivol'] = ty['put_10d'] / 100
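
The ACT/365 time-to-expiry above is simply a ratio of Timedeltas; in isolation, with made-up dates:

import pandas as pd

obs_date = pd.Timestamp('2021-01-04')
expiry = pd.Timestamp('2021-03-26')
tau = (expiry - obs_date) / pd.Timedelta('365 days')
print(tau)  # 81 days / 365 days = ~0.2219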
Example #7
 def wait_time(self):
     return pd.Timedelta(milliseconds=300)
Example #8
import bokeh.models as bm
import bokeh.plotting as bp
import datetime as dt
import numpy as np
import pandas as pd
from pathlib import Path

data_path = Path.cwd() / '../data'

columns = ['time', 'open', 'close', 'high', 'low', 'volume']
candles = {col: [] for col in columns}

for filename in sorted(list(data_path.glob('SSO_15*'))):
    o = pd.read_csv(filename)
    o = o.set_index(pd.DatetimeIndex(o['time']))
    freq = pd.Timedelta(hours=1)
    g = o.groupby(pd.Grouper(freq=freq))
    for name, group in g:
        if group.values.size != 0:
            open_v, close_v = group.price[-1], group.price[0]
            high_v, low_v = group.price.agg([np.max, np.min])
            vol_v = group.volume.sum()
            candles["time"].append((name).strftime('%H:%M %d/%m-%y'))
            candles["open"].append(open_v)
            candles["close"].append(close_v)
            candles["high"].append(high_v)
            candles["low"].append(low_v)
            candles["volume"].append(vol_v)
        else:
            candles["time"].append(None)
            candles["open"].append(None)
Example #9
def test_applymap_str():
    # GH 2786
    df = DataFrame(np.random.random((3, 4)))
    df2 = df.copy()
    cols = ["a", "a", "a", "a"]
    df.columns = cols

    expected = df2.applymap(str)
    expected.columns = cols
    result = df.applymap(str)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "col, val",
    [["datetime", Timestamp("20130101")], ["timedelta", pd.Timedelta("1 min")]],
)
def test_applymap_datetimelike(col, val):
    # datetime/timedelta
    df = DataFrame(np.random.random((3, 4)))
    df[col] = val
    result = df.applymap(str)
    assert result.loc[0, col] == str(df.loc[0, col])


@pytest.mark.parametrize(
    "expected",
    [
        DataFrame(),
        DataFrame(columns=list("ABC")),
        DataFrame(index=list("ABC")),
Example #10
 def str_to_timedelta(f):
     return pd.Timedelta(to_offset(f)).to_pytimedelta()
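
Assuming to_offset is the helper from pandas.tseries.frequencies, the function converts a frequency alias into a plain datetime.timedelta, e.g.:

import pandas as pd
from pandas.tseries.frequencies import to_offset

def str_to_timedelta(f):
    return pd.Timedelta(to_offset(f)).to_pytimedelta()

print(str_to_timedelta('5min'))  # 0:05:00
print(str_to_timedelta('2h'))    # 2:00:00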
Example #11
def load_data():
    data_df = pd.DataFrame([])

    for file in ['confirmed', 'deaths']:
        file_df = pd.read_csv(remote_path(
            'raw/paho/{}.timeline.csv'.format(file)),
                              index_col=[0],
                              header=[0, 1])

        try:
            file_patch_df = pd.read_csv(remote_path(
                'raw/paho/{}.timeline.daily.patch.csv'.format(file)),
                                        index_col=[0],
                                        header=[0, 1])

            file_df.update(file_patch_df)
        except pd.errors.EmptyDataError:
            pass

        file_df.columns.names = ['', '']
        file_df.index.name = ''

        file_df = file_df['BOL']

        file_df = file_df.rename(ADM1_NAME, axis=1)
        file_df.index = pd.to_datetime(file_df.index)
        file_df.index = file_df.index - pd.Timedelta(days=1)

        file_df = file_df[COLUMNS_ORDER]
        file_df.columns = pd.MultiIndex.from_product([[CASES_DATA_NAME[file]],
                                                      file_df.columns])

        # Errors in the data
        file_df = file_df.astype(np.float64)
        file_df = file_df.drop_duplicates().asfreq('D')
        file_df = file_df.interpolate('from_derivatives', limit_area='inside')

        file_df[(file_df.diff() < 0).shift(-1).fillna(False)] = np.nan
        file_df = file_df.interpolate('from_derivatives', limit_area='inside')

        file_df[file_df.diff() < 0] = np.nan
        file_df = file_df.interpolate('from_derivatives', limit_area='inside')

        file_df = file_df.round().dropna(how='all')
        data_df = pd.concat([data_df, file_df], axis=1)

    data_df = data_df.sort_index()
    data_df = data_df.fillna(method='ffill')

    # Here the definition of a recovered case is changed to all cases 14 days
    # after being diagnosed (should it be 10?)
    active_cases = data_df['confirmados'].diff().rolling(window=14).sum()
    active_cases = active_cases.fillna(data_df['confirmados'])
    active_cases.columns = pd.MultiIndex.from_product([['activos'],
                                                       active_cases.columns])
    data_df = pd.concat([data_df, active_cases], axis=1)

    recovered_cases = data_df['confirmados'].shift(periods=14)
    recovered_cases = recovered_cases - data_df['decesos']
    recovered_cases[recovered_cases < 0] = 0
    recovered_cases.columns = pd.MultiIndex.from_product(
        [['recuperados'], recovered_cases.columns])
    data_df = pd.concat([data_df, recovered_cases], axis=1)

    # Testing
    pending, discarded = load_testing_data()

    pending.columns = pd.MultiIndex.from_product([['sospechosos'],
                                                  pending.columns])
    data_df = pd.concat([data_df, pending], axis=1)

    discarded.columns = pd.MultiIndex.from_product([['descartados'],
                                                    discarded.columns])
    data_df = pd.concat([data_df, discarded], axis=1)

    data_df = data_df.rename(
        {
            'confirmados': 'cases',
            'decesos': 'death',
            'activos': 'active_cases',
            'recuperados': 'recovered',
            'sospechosos': 'pending',
            'descartados': 'discarded'
        },
        axis=1)

    data_df = data_df.loc[:data_df['cases'].last_valid_index()]

    return data_df
Example #12
def main(mytimer: func.TimerRequest) -> None:
    utc_timestamp = datetime.datetime.utcnow().replace(
        tzinfo=datetime.timezone.utc).isoformat()

    if mytimer.past_due:
        logging.info('The timer is past due!')

    params = urllib.parse.quote_plus(
        r'Driver={ODBC Driver 17 for SQL Server};Server=tcp:covid19dbserver.database.windows.net,1433;Database=covid19db;Uid=serveradmin@covid19dbserver;Pwd=pzaGuPujnkUnDqZFbWt5;Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;'
    )
    conn_str = 'mssql+pyodbc:///?odbc_connect={}'.format(params)
    engine = create_engine(conn_str, echo=False)
    key_cols_candidates = [
        'Country/Region', 'Province/State', 'District', 'federalstate', 'date'
    ]

    training_period = 21
    forecasting_days = 3
    forecast_col = 'infections'

    for table_name in ['Hopkins', 'ECDC', 'HopkinsTS', 'RKI']:
        logging.info(f"Processing table {table_name}...")
        # for table_name in ['RKI']:
        df = pd.read_sql_table(table_name, engine)
        df = df.drop([
            'Province/State', 'District', 'FIPS', 'Lat', 'Long', 'deaths',
            'recovered', 'ID'
        ],
                     axis=1,
                     errors='ignore')
        string_cols = df.dtypes[df.dtypes == 'object'].index
        string_col_replacement = {key: "None" for key in string_cols}
        df = df.fillna(string_col_replacement)
        key_cols = [col for col in key_cols_candidates if col in df.columns]
        df = df.groupby(by=key_cols).sum().reset_index()
        for col in string_cols:
            if col in df.columns:
                df.loc[df[col].str.contains("None"), col] = pd.NA

        country_col = 'Country/Region'
        if table_name == 'RKI':
            country_col = 'federalstate'
        for country in df[country_col].unique():
            logging.debug(f"Computing forecasts for country {country}")
            df_country = df[df[country_col] == country]
            y = df_country.sort_values(
                by='date')[forecast_col].values[-training_period:]
            if (y < 10).all():
                # All values < 10. No good forecast possible
                continue
            if len(y) < training_period:
                # Did not find a lot of datapoints
                continue
            x = range(len(y))
            x_forecast = range(len(y), len(y) + forecasting_days)
            try:
                (a_scipy,
                 b_scipy), _ = curve_fit(lambda t, a, b: a * np.exp(b * t), x,
                                         y)
            except Exception as e:
                logging.info('%s %s %s', table_name, country, y)
                logging.error(e)
                continue

            def exp_scipy(x):
                return a_scipy * np.exp(b_scipy * x)

            y_forecast = exp_scipy(x_forecast)
            df_result = pd.DataFrame()
            df_result['forecast_infections'] = y_forecast
            one_day_delta = pd.Timedelta(value=1, unit='d')
            df_result['date'] = pd.date_range(start=df_country.date.max() +
                                              one_day_delta,
                                              periods=forecasting_days,
                                              freq='d')
            df_result['forecast_infections'] = y_forecast
            df_result[country_col] = country
            today = datetime.datetime.now()
            df_result['forecasting_date'] = today

            dtype_dict = {}
            for str_col in string_cols:
                if (str_col in df_result.columns
                        and df_result[str_col].notnull().sum() > 0):
                    # print(col)
                    df_result.loc[df_result[str_col].notnull(),
                                  str_col] = df_result.loc[
                                      df_result[str_col].notnull(),
                                      str_col].str.slice(start=0, stop=99)
                    dtype_dict[str_col] = sqlalchemy.types.NVARCHAR(length=100)
            logging.debug("Computed forecasts.")
            logging.debug("Writing forecast to database...")

            df_result.to_sql(f"{table_name}_forecast",
                             engine,
                             if_exists='append',
                             index=False,
                             dtype=dtype_dict)
            logging.debug("Wrote forecast to database.")
            # TODO: Write merge statement to update into f{table_name}_forecast or just run once a day

    logging.info('Python timer trigger function ran at %s', utc_timestamp)
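
Stripped of the database and curve-fitting machinery, the forecast index above starts one day after the last observed date:

import pandas as pd

last_observed = pd.Timestamp('2020-04-15')  # stand-in for df_country.date.max()
forecasting_days = 3
one_day_delta = pd.Timedelta(value=1, unit='d')
forecast_dates = pd.date_range(start=last_observed + one_day_delta,
                               periods=forecasting_days, freq='d')
print(forecast_dates)  # 2020-04-16, 2020-04-17, 2020-04-18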
Example #13
            'datetime64[s]'), period=period)
        edges = (pd.DatetimeIndex([t_interval_start]).append(t_intervals_end[:-1]),
                 pd.DatetimeIndex(t_intervals_end))

    for i, probe in enumerate(probes):
        probe_name = f'{prefix}{probe:02}'  # table name in db
        l.info('Draw %s in Veusz: %d intervals...', probe_name, edges[0].size)
        # for i_interval, (t_interval_start, t_interval_end) in enumerate(zip(pd.DatetimeIndex([t_interval_start]).append(t_intervals_end[:-1]), t_intervals_end), start=1):

        cfg_vp = {'veusze': None}
        for i_interval, (t_interval_start, t_interval_end) in enumerate(zip(*edges), start=1):

            # if i_interval < 23: #<= 0:  # TEMPORARY Skip this number of intervals
            #     continue
            if period != length:
                t_interval_start = t_interval_end - pd.Timedelta(dt_custom_s, 's')

            try:  # skipping absent probes
                start_end = h5q_interval2coord(
                    db_path=str(db_path),
                    table=f'/{probe_name}',
                    t_interval=(t_interval_start, t_interval_end))
                if not len(start_end):
                    break  # no data
            except KeyError:
                break  # device name not in specified range, go to next name

            pattern_path_new = pattern_path.with_name(f"{t_interval_start:%y%m%d_%H%M}_{length}_{probe_name}.vsz")

            # Modify pattern file
            if not b_images_only:
Example #14
    def test_indexing_with_datetime_tz(self):

        # 8260
        # support datetime64 with tz

        idx = Index(date_range('20130101', periods=3, tz='US/Eastern'),
                    name='foo')
        dr = date_range('20130110', periods=3)
        df = DataFrame({'A': idx, 'B': dr})
        df['C'] = idx
        df.iloc[1, 1] = pd.NaT
        df.iloc[1, 2] = pd.NaT

        # indexing
        result = df.iloc[1]
        expected = Series([
            Timestamp('2013-01-02 00:00:00-0500', tz='US/Eastern'), np.nan,
            np.nan
        ],
                          index=list('ABC'),
                          dtype='object',
                          name=1)
        tm.assert_series_equal(result, expected)
        result = df.loc[1]
        expected = Series([
            Timestamp('2013-01-02 00:00:00-0500', tz='US/Eastern'), np.nan,
            np.nan
        ],
                          index=list('ABC'),
                          dtype='object',
                          name=1)
        tm.assert_series_equal(result, expected)

        # indexing - fast_xs
        df = DataFrame({'a': date_range('2014-01-01', periods=10, tz='UTC')})
        result = df.iloc[5]
        expected = Timestamp('2014-01-06 00:00:00+0000', tz='UTC', freq='D')
        assert result == expected

        result = df.loc[5]
        assert result == expected

        # indexing - boolean
        result = df[df.a > df.a[3]]
        expected = df.iloc[4:]
        tm.assert_frame_equal(result, expected)

        # indexing - setting an element
        df = DataFrame(data=pd.to_datetime(
            ['2015-03-30 20:12:32', '2015-03-12 00:11:11']),
                       columns=['time'])
        df['new_col'] = ['new', 'old']
        df.time = df.set_index('time').index.tz_localize('UTC')
        v = df[df.new_col == 'new'].set_index('time').index.tz_convert(
            'US/Pacific')

        # trying to set a single element on a part of a different timezone
        # this converts to object
        df2 = df.copy()
        df2.loc[df2.new_col == 'new', 'time'] = v

        expected = Series([v[0], df.loc[1, 'time']], name='time')
        tm.assert_series_equal(df2.time, expected)

        v = df.loc[df.new_col == 'new', 'time'] + pd.Timedelta('1s')
        df.loc[df.new_col == 'new', 'time'] = v
        tm.assert_series_equal(df.loc[df.new_col == 'new', 'time'], v)
Example #15
    'dataset': dataset,
    'grid_res': 2.5,
    'startyear': 1979,  # download startyear
    'endyear': 2020,  # download endyear
    'months': list(range(1, 12 + 1)),  # download months
    # for monthly means of daily means, choose 'moda' or 'mnth'
    # for daily means choose 'oper' or 'enda' (for accumulations)
    'stream': 'oper',
    'time': pd.date_range(start='00:00', end='23:00', freq=pd.Timedelta(3, unit='h')),
    'area': 'global',  # [North, West, South, East]. Default: global
    'CDO_command': 'daymean',
    'base_path': base_path,
    'path_raw': path_raw
})

if ex['dataset'] == 'ERAint' or ex['dataset'] == 'era20c':
    import download_ERA_interim_API as ECMWF
elif ex['dataset'] == 'era5':
    import download_ERA5_API as ECMWF
Example #16
def dates_to_idx(timelist):
    reference_time = pd.to_datetime('1958-03-15')
    t = (timelist - reference_time) / pd.Timedelta(1, "Y")
    return np.asarray(t)
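
A quick check of the conversion, with the caveat that recent pandas releases reject the ambiguous "Y" unit, so 365.25 days is used below as an approximate stand-in:

import numpy as np
import pandas as pd

reference_time = pd.to_datetime('1958-03-15')
timelist = pd.to_datetime(['1959-03-15', '1968-03-15'])
t = (timelist - reference_time) / pd.Timedelta(days=365.25)  # approximate year length
print(np.asarray(t))  # roughly [1.0, 10.0]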
Example #17
    def test_examples2(self):
        """ doc-string examples """

        trades = pd.DataFrame(
            {
                'time':
                pd.to_datetime([
                    '20160525 13:30:00.023', '20160525 13:30:00.038',
                    '20160525 13:30:00.048', '20160525 13:30:00.048',
                    '20160525 13:30:00.048'
                ]),
                'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'],
                'price': [51.95, 51.95, 720.77, 720.92, 98.00],
                'quantity': [75, 155, 100, 100, 100]
            },
            columns=['time', 'ticker', 'price', 'quantity'])

        quotes = pd.DataFrame(
            {
                'time':
                pd.to_datetime([
                    '20160525 13:30:00.023', '20160525 13:30:00.023',
                    '20160525 13:30:00.030', '20160525 13:30:00.041',
                    '20160525 13:30:00.048', '20160525 13:30:00.049',
                    '20160525 13:30:00.072', '20160525 13:30:00.075'
                ]),
                'ticker': [
                    'GOOG', 'MSFT', 'MSFT', 'MSFT', 'GOOG', 'AAPL', 'GOOG',
                    'MSFT'
                ],
                'bid':
                [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
                'ask':
                [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
            },
            columns=['time', 'ticker', 'bid', 'ask'])

        pd.merge_asof(trades, quotes, on='time', by='ticker')

        pd.merge_asof(trades,
                      quotes,
                      on='time',
                      by='ticker',
                      tolerance=pd.Timedelta('2ms'))

        expected = pd.DataFrame(
            {
                'time':
                pd.to_datetime([
                    '20160525 13:30:00.023', '20160525 13:30:00.038',
                    '20160525 13:30:00.048', '20160525 13:30:00.048',
                    '20160525 13:30:00.048'
                ]),
                'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'],
                'price': [51.95, 51.95, 720.77, 720.92, 98.00],
                'quantity': [75, 155, 100, 100, 100],
                'bid': [np.nan, 51.97, np.nan, np.nan, np.nan],
                'ask': [np.nan, 51.98, np.nan, np.nan, np.nan]
            },
            columns=['time', 'ticker', 'price', 'quantity', 'bid', 'ask'])

        result = pd.merge_asof(trades,
                               quotes,
                               on='time',
                               by='ticker',
                               tolerance=pd.Timedelta('10ms'),
                               allow_exact_matches=False)
        assert_frame_equal(result, expected)
Example #18

    plt.pause(2)


df_path = pd.read_csv('../spencers_data/path1.csv')
df_path['Start_time'] = pd.to_datetime(df_path['Start_time'])
df_path['Start_time'] = df_path['Start_time'].apply(
    lambda x: x.strftime('%Y-%m-13 %H:%M:%S'))
df_path['Start_time'] = pd.to_datetime(df_path['Start_time'])

count = len(df_path)
for index, row in df_path.iterrows():
    if (row['Duration (min)'] == 2):
        df_path.loc[index, 'Duration (min)'] = 1
        df_path.loc[count, :] = df_path.loc[index, :]
        df_path.loc[count, 'Start_time'] = df_path.loc[count,
                                                       'Start_time'] + pd.Timedelta('1 minute')
        count += 1

df_path.sort_values('Start_time', inplace=True)
df_path.reset_index(drop=True, inplace=True)
df_path['Start_time'] = df_path['Start_time'] + pd.Timedelta('12 hours')

# print (df_path)

f = ['micromax', 'moto', 'oneplus', 'samsung', 'yureka']
for file in f:
    count = 0
    df_loc_track = pd.read_csv(
        '../spencers_data/device_modified_logs_min/' + str(file) + '.csv')
    df_loc_track['ts'] = pd.to_datetime(df_loc_track['ts'])
    df_loc_track['ts'] = df_loc_track['ts'] + \
Example #19
import pandas as pd
from six.moves.urllib.parse import urlencode

from catalyst.data.bundles.core import register_bundle
from catalyst.data.bundles.base_pricing import BaseEquityPricingBundle
from catalyst.utils.memoize import lazyval
"""
Module for building a complete daily dataset from Quandl's WIKI dataset.
"""
from logbook import Logger

from catalyst.constants import LOG_LEVEL
from catalyst.utils.calendars import register_calendar_alias

log = Logger(__name__, level=LOG_LEVEL)
seconds_per_call = (pd.Timedelta('10 minutes') / 2000).total_seconds()


class QuandlBundle(BaseEquityPricingBundle):
    @lazyval
    def name(self):
        return 'quandl'

    @lazyval
    def exchange(self):
        return 'QUANDL'

    @lazyval
    def frequencies(self):
        return set(('daily', ))
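
The throttling constant above works out to 0.3 seconds per call (ten minutes spread over 2,000 calls):

import pandas as pd

seconds_per_call = (pd.Timedelta('10 minutes') / 2000).total_seconds()
print(seconds_per_call)  # 0.3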
Example #20

def preprocessing(gross, social):
    """
    In this function, we merge two datasets together for further visualization tasks
    :param gross: the cleaned Broadway Grosses Data Set
    :param social: the cleaned Broadway Social Stats Data Set
    :return: the merged data set
    """

    # Match the dates from two data sets
    gross_date = [x for x in gross['week_ending'].unique() if x[0:4] in ['2019', '2018', '2017']]
    temp_date = [x.date().strftime('%Y-%m-%d') for x in list(pd.to_datetime(gross_date) + pd.Timedelta(1, unit='d'))]
    gross_date = gross_date + temp_date

    names = list(social.columns)
    names.append('this_week_gross')
    df = pd.DataFrame(columns=names)

    for i in social['Date'].unique():
        temp = pd.to_datetime(i).date().strftime('%Y-%m-%d')
        if temp in gross_date:
            temp_df = social.loc[social['Date'] == temp, :]
            c = gross.loc[(gross['week_ending'] == temp), ['week_ending', 'show', 'this_week_gross']]
            for j in temp_df.Show:
                for k in c.show:
                    if j in k:
                        temp_df.loc[(temp_df['Show'] == j), 'this_week_gross'] = c.loc[
                            c['show'] == k, 'this_week_gross'].values
                    elif k in j:
                        temp_df.loc[(temp_df['Show'] == j), 'this_week_gross'] = c.loc[
                            c['show'] == k, 'this_week_gross'].values
            df = pd.concat([df, temp_df], ignore_index=True)
    df_notnull = df.dropna()
    return df_notnull
Example #21
def setup_perioddata_group(start_date_time, end_date_time=None,
                           nper=1, perlen=None, model_time_units=None, freq=None,
                           steady={0: True,
                             1: False},
                           nstp=10, tsmult=1.5,
                           oc_saverecord={0: ['save head last',
                             'save budget last']},
                           ):
    """Sets up time discretization for a model; outputs a DataFrame with
    stress period dates/times and properties. Stress periods can be established
    explicitly by specifying perlen as a list of period lengths in
    model units. Or, stress periods can be established using three of the
    start_date_time, end_date_time, nper, and freq arguments, similar to the
    pandas.date_range function.
    (see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html)

    Parameters
    ----------
    start_date_time : str or datetime-like
        Left bound for generating stress period dates. See pandas documentation.
    end_date_time : str or datetime-like, optional
        Right bound for generating stress period dates. See pandas documentation.
    nper : int, optional
        Number of stress periods. Only used if perlen is None, or in combination with freq
        if an end_date_time isn't specified.
    perlen : sequence or None, optional
        A list of stress period lengths in model time units. Or specify as None and
        specify 3 of start_date_time, end_date_time, nper and/or freq.
    model_time_units : str, optional
        'days' or 'seconds'.
    freq : str or DateOffset, default None
        For setting up uniform stress periods between a start and end date, or of length nper.
        Same as argument to pandas.date_range. Frequency strings can have multiples,
        e.g. ‘6MS’ for a 6 month interval on the start of each month.
        See the pandas documentation for a list of frequency aliases. Note: Only "start"
        frequencies (e.g. MS vs M for "month end") are supported.
    steady : dict
        Dictionary with zero-based stress periods as keys and boolean values. Similar to MODFLOW-6
        input, the information specified for a period will continue to apply until
        information for another period is specified.
    nstp : int or sequence
        Number of timesteps in a stress period. Must be an integer if perlen=None.
    tsmult : int or sequence
        Timestep multiplier for a stress period. Must be an integer if perlen=None.
    oc_saverecord : dict
        Dictionary with zero-based stress periods as keys and output control options as values.
        Similar to MODFLOW-6 input, the information specified for a period will
        continue to apply until information for another period is specified.

    Returns
    -------
    perioddata : pandas.DataFrame
        DataFrame summarizing stress period information. Data columns:

        ==================  ================  ==============================================
        **start_datetime**  pandas datetimes  start date/time of each stress period (does not include steady-state periods)
        **end_datetime**    pandas datetimes  end date/time of each stress period (does not include steady-state periods)
        **time**            float             cumulative MODFLOW time at end of period (includes steady-state periods)
        **per**             int               zero-based stress period
        **perlen**          float             stress period length in model time units
        **nstp**            int               number of timesteps in the stress period
        **tsmult**          int               timestep multiplier for stress period
        **steady**          bool              True=steady-state, False=Transient
        **oc**              dict              MODFLOW-6 output control options
        ==================  ================  ==============================================

    """
    # todo: refactor/simplify setup_perioddata_group
    freq = convert_freq_to_period_start(freq)
    oc = oc_saverecord
    if not isinstance(steady, dict):
        steady = {i: v for i, v in enumerate(steady)}

    txt = "Specify perlen as a list of lengths in model units, or\nspecify 3 " \
          "of start_date_time, end_date_time, nper and/or freq."
    # Explicitly specified stress period lengths
    if perlen is not None:
        if np.isscalar(perlen):
            perlen = [perlen]
        datetimes = [pd.Timestamp(start_date_time)]
        if len(perlen) > 1:
            for i, length in enumerate(perlen[1:]):
                datetimes.append(datetimes[i] + pd.Timedelta(length, unit=model_time_units))
        time = np.cumsum(perlen) # time in MODFLOW units
    elif nper == 1 and steady[0]:
        perlen = [1]
        time = [1]
        #datetimes = [pd.Timestamp(start_date_time)]

    # Set up datetimes based on 3 of start_date_time, end_date_time, nper and/or freq (scalar perlen)
    else:
        assert np.isscalar(nstp), "nstp: {}; nstp must be a scalar if perlen " \
                                  "is not specified explicitly as a list.\n{}".format(nstp, txt)
        assert np.isscalar(tsmult), "tsmult: {}; tsmult must be a scalar if perlen " \
                                  "is not specified explicitly as a list.\n{}".format(tsmult, txt)
        periods = None
        if end_date_time is None:
            # start_date_time, periods and freq
            # (i.e. nper periods of length perlen starting on start_date_time)
            if freq is not None:
                periods = nper
            else:
                raise ValueError("Unrecognized input for perlen: {}.\n{}".format(perlen, txt))
        else:
            # end_date_time and freq and periods
            if start_date_time is None:
                periods = nper + 1
            # start_date_time, end_date_time and (linearly spaced) periods
            # (i.e. nper periods of uniform length between start_date_time and end_date_time)
            elif freq is None:
                periods = nper #-1 if steady[0] else nper
            # start_date_time, end_date_time and frequency
            elif freq is not None:
                pass
        datetimes = pd.date_range(start_date_time, end_date_time, periods=periods, freq=freq)
        if start_date_time is None:
            start_date_time = datetimes[0]  # in case end_date_time, periods and freq were specified
        if len(datetimes) == 1:
            perlen = [(pd.Timestamp(end_date_time) - pd.Timestamp(start_date_time)).days]
            time = np.array(perlen)
        else:
            # time is at the end of each stress period
            time = getattr((datetimes - pd.Timestamp(start_date_time)), model_time_units).tolist()

            # get the last (end) time, if it wasn't included in datetimes
            if datetimes[0] == pd.Timestamp(start_date_time) and nper is None:
                if end_date_time is not None:
                    # + 1 for consistency with using date_range below
                    # e.g. to end at 2019-01-01 instead of 2018-12-31
                    last_time = getattr((pd.Timestamp(end_date_time) -
                                         pd.Timestamp(start_date_time)),
                                        model_time_units) + 1
                else:
                    end_datetimes = pd.date_range(start_date_time,
                                                  periods=len(datetimes) + 1,
                                                  freq=freq)
                    last_time = getattr((end_datetimes[-1] -
                                         pd.Timestamp(start_date_time)),
                                         model_time_units)
                if last_time != time[-1]:
                    time += [last_time]
        if time[0] != 0:
            time = [0] + list(time)
        perlen = np.diff(time)
        time = np.array(time[1:])
        assert len(perlen) == len(time)  # == len(datetimes)

        # if first period is steady-state,
        # insert it at the beginning of the generated range
        # this should only apply to cases where nper > 1
        if steady[0]:
            #datetimes = [datetimes[0]] + datetimes.tolist()  #  datetimes[:-1].tolist()
            perlen = [1] + list(perlen)
            time = [1] + (time + 1).tolist()
        else:
            pass
            #datetimes = datetimes[:-1]
            #perlen = np.diff(time).tolist()
            #time = time[1:]

    perioddata = pd.DataFrame({#'datetime': datetimes,
                               'time': time,
                               'per': range(len(time)),
                               'perlen': np.array(perlen).astype(float),
                               'nstp': nstp,
                               'tsmult': tsmult,
                               })

    # specify steady-state or transient for each period, filling empty
    # periods with previous state (same logic as MF6 input)
    issteady = [steady[0]]
    for i in range(len(perioddata)):
        issteady.append(steady.get(i, issteady[i]))
    perioddata['steady'] = issteady[1:]

    # set up output control, using previous value to fill empty periods
    # (same as MF6)
    oclist = [None]
    for i in range(len(perioddata)):
        oclist.append(oc.get(i, oclist[i]))
    perioddata['oc'] = oclist[1:]

    # create start and end datetime columns;
    # correct the datetime to only increment for transient stress periods
    start_datetime = [pd.Timestamp(start_date_time)]
    end_datetime = []
    for i, r in perioddata.iterrows():
        if r.steady:
            end_datetime.append(start_datetime[i])
        else:
            end_datetime.append(start_datetime[i] + pd.Timedelta(r.perlen, unit=model_time_units))
        start_datetime.append(end_datetime[i])

    perioddata['start_datetime'] = start_datetime[:-1]
    perioddata['end_datetime'] = end_datetime
    cols = ['start_datetime', 'end_datetime', 'time', 'per', 'perlen', 'nstp', 'tsmult', 'steady', 'oc']
    #perioddata = perioddata.drop('datetime', axis=1)[cols]

    # correct nstp and tsmult to be 1 for steady-state periods
    perioddata.loc[perioddata.steady, 'nstp'] = 1
    perioddata.loc[perioddata.steady, 'tsmult'] = 1
    return perioddata
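
A hedged usage sketch, assuming setup_perioddata_group and its helpers (e.g. convert_freq_to_period_start) are imported from the module above; the period lengths are illustrative only:

# Hypothetical call using the explicit-perlen path from the docstring:
# one 1-day steady-state period followed by two 30-day transient periods.
perioddata = setup_perioddata_group('2019-01-01',
                                    perlen=[1, 30, 30],
                                    model_time_units='days',
                                    steady={0: True, 1: False},
                                    nstp=5, tsmult=1.2)
print(perioddata[['start_datetime', 'end_datetime', 'perlen', 'steady']])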
Example #22
 def lead(n):
     m = mod.isel(Time=n)
     m['start'] = m.start + pd.Timedelta(n, 'd')
     return m.resample(offset, 'start', how='mean').to_series()
Example #23
    def next(self, *arg, **kwarg):
        """Load the next orbit into .data.

        Note
        ----
        Forms complete orbits across day boundaries. If no data loaded
        then the first orbit from the first date of data is returned.
        """

        # first, check if data exists
        if not self.sat.empty:
            # set up orbit metadata
            self._calcOrbits()

            # if current orbit near the last, must be careful
            if self._current == (self.num - 1):
                # first, load last orbit data
                self._getBasicOrbit(orbit=-1)
                # End of orbit may occur on the next day
                load_next = True
                if self.sat._iter_type == 'date':
                    delta = self.sat.date - self.sat.index[-1] \
                            + pds.Timedelta('1 day')
                    if delta >= self.orbit_period:
                        # don't need to load the next day because this orbit
                        # ends more than an orbital period from the next date
                        load_next = False

                if load_next:
                    # the end of the user's desired orbit occurs tomorrow; need
                    # to form a complete orbit: save this current orbit, load
                    # the next day, combine data, select the correct orbit
                    temp_orbit_data = self.sat.copy()
                    try:
                        # loading next day/file clears orbit breaks info
                        self.sat.next()
                        if not self.sat.empty:
                            # combine this next day's data with previous last
                            # orbit, grab the first one
                            final_val = self.sat.index[0] \
                                - pds.DateOffset(microseconds=1)
                            self.sat.data = self.sat.concat_data(
                                [temp_orbit_data[:final_val],
                                 self.sat.data])
                            self._getBasicOrbit(orbit=1)
                        else:
                            # no data, go back a day and grab the last orbit.
                            # As complete as the orbit can be
                            self.sat.prev()
                            self._getBasicOrbit(orbit=-1)
                    except StopIteration:
                        pass
                    del temp_orbit_data
                # includes hack to appear to be zero indexed
                print('Loaded Orbit:%i' % (self._current - 1))

            elif self._current == (self.num):
                # at the last orbit, need to be careful about getting the next
                # orbit save this current orbit and load the next day
                # temp_orbit_data = self.sat.data.copy()
                temp_orbit_data = self.sat.copy()
                # load next day, which clears orbit breaks info
                self.sat.next()
                # combine this next day orbit with previous last orbit to
                # ensure things are correct
                if not self.sat.empty:
                    pad_next = True
                    # check if data padding is really needed, only works when
                    # loading by date
                    if self.sat._iter_type == 'date':
                        delta = self.sat.date - temp_orbit_data.index[-1]
                        if delta >= self.orbit_period:
                            # the end of the previous orbit is more than an
                            # orbit away from today, so we don't have to worry
                            # about it
                            pad_next = False
                    if pad_next:
                        # orbit went across day break, stick old orbit onto new
                        # data and grab second orbit (first is old)
                        self.sat.data = self.sat.concat_data(
                            [temp_orbit_data[:self.sat.index[0] -
                                             pds.DateOffset(microseconds=1)],
                             self.sat.data])
                        # select second orbit of combined data
                        self._getBasicOrbit(orbit=2)
                    else:
                        # padding from the previous orbit wasn't needed, can
                        # just grab the first orbit of loaded data
                        self._getBasicOrbit(orbit=1)
                        if self.sat._iter_type == 'date':
                            delta = self.sat.date + pds.DateOffset(days=1) \
                                    - self.sat.index[0]

                            if delta < self.orbit_period:
                                # this orbit's end occurs on the next day;
                                # though we grabbed the first orbit, missing
                                # data means the first available orbit in the
                                # data is actually the last for the day.
                                # Resetting to the second-to-last orbit and
                                # then calling next() will get the last orbit,
                                # accounting for tomorrow's data as well.
                                self._current = self.num - 1
                                self.next()
                else:
                    # no data for the next day
                    # continue loading data until there is some
                    # self.sat.next() raises StopIteration when it reaches the end,
                    # leaving this function
                    while self.sat.empty:
                        self.sat.next()
                    self._getBasicOrbit(orbit=1)

                del temp_orbit_data
                # includes hack to appear to be zero indexed
                print('Loaded Orbit:%i' % (self._current - 1))

            elif self._current == 0:
                # no current orbit set, grab the first one
                # using load command to specify the first orbit, which
                # automatically loads prev day if needed to form complete orbit
                self.load(orbit=1)

            elif self._current < (self.num - 1):
                # since we aren't close to the last orbit, just pull the next
                # orbit
                self._getBasicOrbit(orbit=self._current + 1)
                # includes hack to appear to be zero indexed
                print('Loaded Orbit:%i' % (self._current - 1))

            else:
                raise Exception(' '.join(('You ended up where nobody should',
                                          'ever be. Talk to someone about',
                                          'this fundamental failure or open',
                                          'an issue at',
                                          'www.github.com/rstonback/pysat')))

        else:  # no data
            while self.sat.empty:
                # keep going until data is found
                # next raises StopIteration at end of data set, no more data
                # possible
                self.sat.next()
            # we've found data, grab the next orbit
            self.next()
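For context, a minimal sketch of how an orbit iterator like the one above is typically driven, assuming pysat's bundled test instrument (the platform/name, orbit_info values, and the loaded date are illustrative assumptions, not taken from the snippet):

import pysat

# assumed test instrument with orbits defined on magnetic local time
inst = pysat.Instrument('pysat', 'testing',
                        orbit_info={'index': 'mlt', 'kind': 'local time'})
inst.load(2009, 1)   # load one day of data first
inst.orbits.next()   # load the first complete orbit into inst.data
inst.orbits.next()   # step to the next orbit, crossing day boundaries if needed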
Example #24
0
def df_stability_metrics(
    df,
    time_axis,
    features=None,
    binning="auto",
    bin_specs=None,
    time_width=None,
    time_offset=0,
    var_dtype=None,
    reference_type="self",
    reference=None,
    window=10,
    shift=1,
    monitoring_rules=None,
    pull_rules=None,
    **kwargs,
):
    """Create a data stability monitoring html datastore for given pandas or spark dataframe.

    :param df: input pandas/spark dataframe to be profiled and monitored over time.
    :param str time_axis: name of datetime feature, used as time axis, e.g. 'date'. If True, it will be auto-guessed.
        If time_axis is set or found, and no features are provided, the features list becomes ['date:x', 'date:y', 'date:z'], etc.
    :param list features: columns to pick up from input data. (default is all features).
        For multi-dimensional histograms, separate the column names with a ':'. Example features list is:

        .. code-block:: python

            features = ['x', 'date', 'date:x', 'date:y', 'date:x:y']

    :param str binning: default binning to revert to in case bin_specs not supplied. options are:
        "unit" or "auto", default is "auto". When using "auto", semi-clever binning is automatically done.
    :param dict bin_specs: dictionaries used for rebinning numeric or timestamp features.
        An example bin_specs dictionary is:

        .. code-block:: python

            bin_specs = {'x': {'bin_width': 1, 'bin_offset': 0},
                         'y': {'num': 10, 'low': 0.0, 'high': 2.0},
                         'x:y': [{}, {'num': 5, 'low': 0.0, 'high': 1.0}]}

        In the bin specs for x:y, x is not provided (here) and reverts to the 1-dim setting.
        The 'bin_width', 'bin_offset' notation makes an open-ended histogram (for that feature) with given bin width
        and offset. The notation 'num', 'low', 'high' gives a fixed range histogram from 'low' to 'high' with 'num'
        number of bins.
    :param time_width: bin width of time axis. str or number (ns). note: bin_specs takes precedence. (optional)

        .. code-block:: text

            Examples: '1w', 3600e9 (number of ns),
                      anything understood by pd.Timedelta(time_width).value

    :param time_offset: bin offset of time axis. str or number (ns). note: bin_specs takes precedence. (optional)

        .. code-block:: text

            Examples: '1-1-2020', 0 (number of ns since 1-1-1970),
                      anything parsed by pd.Timestamp(time_offset).value

    :param dict var_dtype: dictionary with specified datatype per feature. auto-guessed when not provided.
    :param reference_type: type of reference used for comparisons. Options [self, external, rolling, expanding].
        default is 'self'.
    :param reference: reference dataframe or histograms. default is None
    :param int window: size of rolling window and/or trend detection. default is 10.
    :param int shift: shift of time-bins in rolling/expanding window. default is 1.
    :param dict monitoring_rules: monitoring rules to generate traffic light alerts.
        The default setting is:

        .. code-block:: python

            monitoring_rules = {"*_pull": [7, 4, -4, -7],
                                "*_zscore": [7, 4, -4, -7],
                                "[!p]*_unknown_labels": [0.5, 0.5, 0, 0]}

        Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern.
        For example, ``"*_pull"`` applies, for all features, to all statistics ending in "_pull".
        You can also specify rules for specific features and/or statistics by leaving out the wildcard and putting the
        feature name in front. E.g.

        .. code-block:: python

            monitoring_rules = {"featureA:*_pull": [5, 3, -3, -5],
                                "featureA:nan": [4, 1, 0, 0],
                                "*_pull": [7, 4, -4, -7],
                                "nan": [8, 1, 0, 0]}

        In case multiple rules could apply to a feature's statistic, the most specific one applies.
        So for the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule
        for all other features.
    :param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report.
        Default is:

        .. code-block:: python

            pull_rules = {"*_pull": [7, 4, -4, -7]}

        This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean,
        and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean.
        Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern.
        (The same string logic applies as for monitoring_rules.)
    :param kwargs: residual keyword arguments, passed on to stability_report()
    :return: dict with results of metrics pipeline
    """
    # basic checks on presence of time_axis
    if not (isinstance(time_axis, str) and len(time_axis) > 0) and not (
        isinstance(time_axis, bool) and time_axis
    ):
        raise TypeError("time_axis needs to be a filled string or set to True")
    if isinstance(time_axis, str) and time_axis not in df.columns:
        raise ValueError(f'time_axis "{time_axis}" not found in columns of dataframe.')
    if reference is not None and not isinstance(reference, dict):
        if isinstance(time_axis, str) and time_axis not in reference.columns:
            raise ValueError(
                f'time_axis "{time_axis}" not found in columns of reference dataframe.'
            )
    if isinstance(time_axis, bool):
        time_axes = get_time_axes(df)
        num = len(time_axes)
        if num == 1:
            time_axis = time_axes[0]
            logger.info(f'Time-axis automatically set to "{time_axis}"')
        elif num == 0:
            raise RuntimeError(
                "No obvious time-axes found. Cannot generate stability report."
            )
        else:
            raise RuntimeError(
                f"Found {num} time-axes: {time_axes}. Set *one* time_axis manually!"
            )
    if features is not None:
        # by now time_axis is defined. ensure that all histograms start with it.
        if not isinstance(features, list):
            raise TypeError(
                "features should be list of columns (or combos) to pick up from input data."
            )
        features = [
            c if c.startswith(time_axis) else f"{time_axis}:{c}" for c in features
        ]

    # interpret time_width and time_offset
    if isinstance(time_width, (str, int, float)) and isinstance(
        time_offset, (str, int, float)
    ):
        if not isinstance(bin_specs, (type(None), dict)):
            raise RuntimeError("bin_specs object is not a dictionary")
        if bin_specs is None:
            bin_specs = {}
        if time_axis in bin_specs:
            raise RuntimeError(
                f'time-axis "{time_axis}" already found in binning specifications.'
            )
        # convert time width and offset to nanoseconds
        time_specs = {
            "bin_width": float(pd.Timedelta(time_width).value),
            "bin_offset": float(pd.Timestamp(time_offset).value),
        }
        bin_specs[time_axis] = time_specs

    reference_hists = None
    if reference is not None:
        reference_type = "external"
        if isinstance(reference, dict):
            # 1. reference is dict of histograms
            # extract features and bin_specs from reference histograms
            reference_hists = reference
            features = list(reference_hists.keys())
            bin_specs = get_bin_specs(reference_hists)
        else:
            # 2. reference is pandas or spark dataframe
            # generate histograms and return updated features, bin_specs, time_axis, etc.
            (
                reference_hists,
                features,
                bin_specs,
                time_axis,
                var_dtype,
            ) = make_histograms(
                reference,
                features,
                binning,
                bin_specs,
                time_axis,
                var_dtype,
                ret_specs=True,
            )

    # use the same features, bin_specs, time_axis, etc as for reference hists
    hists = make_histograms(
        df,
        features=features,
        binning=binning,
        bin_specs=bin_specs,
        time_axis=time_axis,
        var_dtype=var_dtype,
    )

    # generate data stability report
    return stability_metrics(
        hists,
        reference_type,
        reference_hists,
        time_axis,
        window,
        shift,
        monitoring_rules,
        pull_rules,
        features,
        **kwargs,
    )
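A minimal call sketch for the function above; the column names, the weekly time_width, and the Monday time_offset are illustrative, not taken from the snippet:

import pandas as pd

# hypothetical input frame with a datetime column and two numeric features
df = pd.DataFrame({
    'date': pd.date_range('2021-01-04', periods=100, freq='D'),
    'x': range(100),
    'y': [v % 7 for v in range(100)],
})

datastore = df_stability_metrics(
    df,
    time_axis='date',        # datetime column used as the time axis
    time_width='1w',         # weekly time bins, anything pd.Timedelta understands
    time_offset='2021-1-4',  # align the weekly bins to a Monday
    features=['date:x', 'date:y'],
)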
    def test_timedelta_with_nulls(self):
        df = pd.DataFrame(
            {'test': [pd.Timedelta('1 day'), None,
                      pd.Timedelta('3 day')]})

        self._check_pandas_roundtrip(df, null_counts=[1, 1])
Example #26
0
def test_str_to_timestamp_rounds_up(timestamp):
    offset = timestamp - pd.Timedelta(minutes=45)
    as_str = offset.isoformat()
    assert str_to_timestamp(as_str) == timestamp
def merge_tti_feature(df_base, df_feature, n_shift):
    df_feature['time'] = df_feature['time'].apply(
        lambda x: x + pd.Timedelta(minutes=10 * n_shift))
    df_train_feat = pd.merge(df_base, df_feature, on='time', how='left')
    df_train_feat.interpolate(inplace=True)
    return df_train_feat
## Combine all names
names = {'ckinyaname': ckinyaname, 'ckinyaname_sx': ckinyaname_sx, 
         'ckinyaname_cor': ckinyaname_cor, 'cothername': cothername,
         'pkinyaname': pkinyaname, 'pkinyaname_sx': pkinyaname_sx, 
         'pkinyaname_cor': pkinyaname_cor, 'pothername': pothername}

names = pd.DataFrame(names)

data = pd.concat([data, names], axis=1)

# =============================================================================
# Estimate DOB
# =============================================================================

# note: pd.Timedelta(month, 'M') maps a month to a fixed average length; newer
# pandas versions reject the ambiguous 'M'/'Y' timedelta units, so an offset such
# as pd.DateOffset(months=...) may be needed here instead
months = pd.Series([pd.Timedelta(month, 'M') for month in data['age_mo']])
data = data.assign(dob_est = data['patientvisitdate'] - months)

# =============================================================================
# Centroids
# =============================================================================

coorDict = {sector.upper(): (centroids.iloc[i,7], centroids.iloc[i,8]) 
                        for i, sector in enumerate(centroids.Name)}

coorDict.update({'': (np.NaN, np.NaN)})

coords = [coorDict[sector] for sector in data.sector_clean]

coords = pd.DataFrame(coords)
coords.columns = ['sectLat', 'sectLong']
Example #29
0
    def split_sessions(self,
                       *,
                       by_event=None,
                       thresh,
                       eos_event=None,
                       session_col='session_id'):

        session_col_arg = session_col or 'session_id'

        index_col = self.retention_config['user_col']
        event_col = self.retention_config['event_col']
        time_col = self.retention_config['event_time_col']

        res = self._obj.copy()

        if by_event is None:
            res[time_col] = pd.to_datetime(res[time_col])
            if thresh is None:
                # add end_of_session event at the end of each user's event sequence
                res.sort_values(by=time_col, inplace=True, ascending=False)
                res[hash('session')] = res.groupby(index_col).cumcount()
                res_session_ends = res[(res[hash('session')] == 0)].copy()
                res_session_ends[event_col] = eos_event
                res_session_ends[time_col] = res_session_ends[
                    time_col] + pd.Timedelta(seconds=1)

                res = pd.concat([res, res_session_ends])

                res.sort_values(by=time_col, inplace=True)

            else:
                # split sessions by time thresh:
                # drop end_of_session events if already present:
                if eos_event is not None:
                    res = res[res[event_col] != eos_event].copy()

                res.sort_values(by=time_col, inplace=True)
                shift_res = res.groupby(index_col).shift(-1)

                time_delta = pd.to_datetime(
                    shift_res[time_col]) - pd.to_datetime(res[time_col])
                time_delta = time_delta.dt.total_seconds()

                # get boolean mapper for end_of_session occurrences
                eos_mask = time_delta > thresh

                # add session column:
                res[hash('session')] = eos_mask
                res[hash('session')] = res.groupby(index_col)[hash(
                    'session')].cumsum()
                res[hash('session')] = res.groupby(index_col)[hash(
                    'session')].shift(1).fillna(0).map(int).map(str)

                # add end_of_session event if specified:
                if eos_event is not None:
                    tmp = res.loc[eos_mask].copy()
                    tmp[event_col] = eos_event
                    tmp[time_col] += pd.Timedelta(seconds=1)

                    res = pd.concat([res, tmp], ignore_index=True)
                    res = res.sort_values(time_col).reset_index(drop=True)

                res[session_col_arg] = res[index_col].map(str) + '_' + res[
                    hash('session')]

        else:
            # split sessions by event:
            res[hash('session')] = res[event_col] == by_event
            res[hash('session')] = res.groupby(index_col)[hash(
                'session')].cumsum().fillna(0).map(int).map(str)
            res[session_col_arg] = res[index_col].map(str) + '_' + res[hash(
                'session')]

        res.drop(columns=[hash('session')], inplace=True)
        if session_col is None and session_col_arg in res.columns:
            res.drop(columns=[session_col_arg], inplace=True)
        return res
Example #30
0
    def test_get_loc(self):
        idx = pd.date_range('2000-01-01', periods=3)

        for method in [None, 'pad', 'backfill', 'nearest']:
            self.assertEqual(idx.get_loc(idx[1], method), 1)
            self.assertEqual(idx.get_loc(idx[1].to_pydatetime(), method), 1)
            self.assertEqual(idx.get_loc(str(idx[1]), method), 1)
            if method is not None:
                self.assertEqual(
                    idx.get_loc(idx[1],
                                method,
                                tolerance=pd.Timedelta('0 days')), 1)

        self.assertEqual(idx.get_loc('2000-01-01', method='nearest'), 0)
        self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest'), 1)

        self.assertEqual(
            idx.get_loc('2000-01-01T12', method='nearest', tolerance='1 day'),
            1)
        self.assertEqual(
            idx.get_loc('2000-01-01T12',
                        method='nearest',
                        tolerance=pd.Timedelta('1D')), 1)
        self.assertEqual(
            idx.get_loc('2000-01-01T12',
                        method='nearest',
                        tolerance=np.timedelta64(1, 'D')), 1)
        self.assertEqual(
            idx.get_loc('2000-01-01T12',
                        method='nearest',
                        tolerance=timedelta(1)), 1)
        with tm.assertRaisesRegexp(ValueError, 'must be convertible'):
            idx.get_loc('2000-01-01T12', method='nearest', tolerance='foo')
        with tm.assertRaises(KeyError):
            idx.get_loc('2000-01-01T03', method='nearest', tolerance='2 hours')

        self.assertEqual(idx.get_loc('2000', method='nearest'), slice(0, 3))
        self.assertEqual(idx.get_loc('2000-01', method='nearest'), slice(0, 3))

        self.assertEqual(idx.get_loc('1999', method='nearest'), 0)
        self.assertEqual(idx.get_loc('2001', method='nearest'), 2)

        with tm.assertRaises(KeyError):
            idx.get_loc('1999', method='pad')
        with tm.assertRaises(KeyError):
            idx.get_loc('2001', method='backfill')

        with tm.assertRaises(KeyError):
            idx.get_loc('foobar')
        with tm.assertRaises(TypeError):
            idx.get_loc(slice(2))

        idx = pd.to_datetime(['2000-01-01', '2000-01-04'])
        self.assertEqual(idx.get_loc('2000-01-02', method='nearest'), 0)
        self.assertEqual(idx.get_loc('2000-01-03', method='nearest'), 1)
        self.assertEqual(idx.get_loc('2000-01', method='nearest'), slice(0, 2))

        # time indexing
        idx = pd.date_range('2000-01-01', periods=24, freq='H')
        tm.assert_numpy_array_equal(idx.get_loc(time(12)), [12])
        tm.assert_numpy_array_equal(idx.get_loc(time(12, 30)), [])
        with tm.assertRaises(NotImplementedError):
            idx.get_loc(time(12, 30), method='pad')