def start(
    self,
    dirs,
    output_dir,
    analysis_start_date,
    analysis_end_date,
    analysis_timespan,
    cell_execution_timeout,
    make_configs,
    backend,
):
    '''
    Initiate a new project. No files will be touched!

    Parameters
    ----------
    dirs: list, optional
        List of sub-directory names that should be used in the project.
        By default all subdirectories defined in the constructor are taken
        into account.
    '''
    # set output_dir
    self.output_dir = output_dir
    if self.output_dir is None:
        self.output_dir = os.path.join('.', self.project_name)

    # set backend binary format to read/write dataframes
    self.backend = backend

    # analysis timespan
    self.analysis_timespan = analysis_timespan
    if not isinstance(self.analysis_timespan, pd.Timedelta):
        try:
            self.analysis_timespan = pd.Timedelta(self.analysis_timespan)
        except Exception as e:
            logging.error(e)

    # analysis start date
    self.analysis_start_date = analysis_start_date
    if self.analysis_start_date is None:
        self.analysis_start_date = pd.Timestamp.today() - self.analysis_timespan

    # analysis end date, defaults to today
    self.analysis_end_date = analysis_end_date
    if self.analysis_end_date is None:
        self.analysis_end_date = pd.Timestamp.today()

    # re-calculate the timespan as it might be wrong due to an overwritten start or end date
    self.analysis_timespan = self.analysis_end_date - self.analysis_start_date

    # set the execution timeout of a single cell for notebook execution
    self.cell_execution_timeout = cell_execution_timeout

    # set make_configs
    self.make_configs = make_configs

    # dict to store successful execution dates
    self.execution_dates_make_configs = {}

    # init working directories
    for sub_dir in dirs:
        self.__dict__[sub_dir] = RdsFs(
            os.path.join(self.output_dir, sub_dir),
            nof_processes=self.nof_processes,
            backend=self.backend,
        )

    # save project properties in defs
    self.__kwargs2defs()
    logging.info('Project "%s" created' % self.project_name)
    self._status('started')
    self.save()
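# Hedged, self-contained sketch (toy values, not part of the project above) of the
# date-resolution logic used in start(): a string timespan is coerced to pd.Timedelta,
# a missing start date is derived from "today minus timespan", and the timespan is
# then re-derived so it stays consistent with the resolved bounds.
import pandas as pd

analysis_timespan = pd.Timedelta('30 days')      # e.g. passed in as the string '30 days'
analysis_end_date = pd.Timestamp.today()         # defaults to today
analysis_start_date = analysis_end_date - analysis_timespan
analysis_timespan = analysis_end_date - analysis_start_date
print(analysis_start_date, analysis_end_date, analysis_timespan)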
def _get_good_sections(df, sample_period):
    """
    Code copied from nilmtk[1]/nilmtk/stats/goodsections.py
    (TimeFrame and timedelta64_to_secs are nilmtk helpers)

    [1] https://github.com/nilmtk/nilmtk/
    """
    index = df.dropna().sort_index().index
    df_time_end = df.index[-1] + pd.Timedelta(seconds=sample_period)
    del df

    if len(index) < 2:
        return []

    timedeltas_sec = timedelta64_to_secs(np.diff(index.values))
    timedeltas_check = timedeltas_sec <= sample_period

    # Memory management
    del timedeltas_sec
    gc.collect()

    timedeltas_check = np.concatenate([[False], timedeltas_check])
    transitions = np.diff(timedeltas_check.astype(int))

    # Memory management
    last_timedeltas_check = timedeltas_check[-1]
    del timedeltas_check
    gc.collect()

    good_sect_starts = list(index[:-1][transitions == 1])
    good_sect_ends = list(index[:-1][transitions == -1])

    # Memory management
    last_index = index[-1]
    del index
    gc.collect()

    # Work out if this chunk ends with an open-ended good section
    if len(good_sect_ends) == 0:
        ends_with_open_ended_good_section = (len(good_sect_starts) > 0)
    elif len(good_sect_starts) > 0:
        # We have good_sect_ends and good_sect_starts
        ends_with_open_ended_good_section = (
            good_sect_ends[-1] < good_sect_starts[-1])
    else:
        # We have good_sect_ends but no good_sect_starts
        ends_with_open_ended_good_section = False

    if ends_with_open_ended_good_section:
        good_sect_ends += [df_time_end]

    assert len(good_sect_starts) == len(good_sect_ends)

    sections = [
        TimeFrame(start, end)
        for start, end in zip(good_sect_starts, good_sect_ends)
        if not (start == end and start is not None)
    ]

    # Memory management
    del good_sect_starts
    del good_sect_ends
    gc.collect()

    return sections
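# Hedged illustration with toy data (plain pandas, no nilmtk helpers): the function
# above treats consecutive samples whose spacing is <= sample_period as "good" and
# turns the True/False transitions into section boundaries. The core check looks
# like this:
import numpy as np
import pandas as pd

idx = pd.DatetimeIndex(['2020-01-01 00:00:00', '2020-01-01 00:00:10',
                        '2020-01-01 00:00:20', '2020-01-01 00:05:00',
                        '2020-01-01 00:05:10'])
sample_period = 10  # seconds
gaps_sec = np.diff(idx.values).astype('timedelta64[s]').astype(float)
good = gaps_sec <= sample_period
print(good)  # [ True  True False  True] -> two good sections separated by the gap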
def getbc_intraday(symbol, start=None, end=None, minutes=5, showUrl=False, key=None): ''' Note that getHistory will return previous day's prices until 15 minutes after the market closes. We will generate a warning if our start or end date differ from the date of the response. Given todays date at 14:00, it will retrive the previous business days stuff. Given not start parameter, we will return data for the last weekday. Today or earlier. We will return everything we between start and end. It may be incomplete. Its now limiting yesterdays data. At 3:00, the latest I get is yesterday up to 12 noon. Retrieve candle data measured in minutes as given in the minutes parameter :params start: A datetime object or time string to indicate the begin time for the data. By default, start will be set to the most recent weekday at market open. :params end: A datetime object or time string to indicate the end time for the data :params minutes: An int for the candle time, 5 minute, 15 minute etc :return (status, data): A tuple of (status as dictionary, data as a DataFrame ) This status is seperate from request status_code. :raise: ValueError if response.status_code is not 200. ''' if getLimitReached('bc'): msg = 'BarChart limit was reached' logging.info(msg) return {'code': 666, 'message': msg}, pd.DataFrame(), None logging.info( '======= Called Barchart -- 150 call limit, data available after market close =======' ) if not end: tdy = dt.datetime.today() end = dt.datetime(tdy.year, tdy.month, tdy.day, 17, 0) # end if not start: tdy = dt.datetime.today() start = dt.datetime(tdy.year, tdy.month, tdy.day, 6, 0) start = getLastWorkDay(start) end = pd.to_datetime(end) start = pd.to_datetime(start) # startDay = start.strftime("%Y%m%d") # Get the maximum data in order to set the 200 MA on a 60 minute chart fullstart = pd.Timestamp.today() fullstart = fullstart - pd.Timedelta(days=40) fullstart = fullstart.strftime("%Y%m%d") params = setParams(symbol, minutes, fullstart, key=key) response = requests.get(BASE_URL, params=params) if showUrl: logging.info(response.url) if response.status_code != 200: raise Exception( f"{response.status_code}: {response.content.decode('utf-8')}") meta = {'code': 200} if (response.text and isinstance(response.text, str) and response.text.startswith('You have reached')): d = pd.Timestamp.now() dd = pd.Timestamp(d.year, d.month, d.day + 1, 3, 0, 0) setLimitReached('bc', dd) logging.warning(f'API max queries: {response.text}') meta['message'] = response.text return meta, pd.DataFrame(), None result = response.json() if not result['results']: logging.warning( '''Failed to retrieve any data. Barchart sends the following greeting: {result['status']}''' ) return result['status'], pd.DataFrame(), None meta['message'] = result['status']['message'] df = pd.DataFrame(result['results']) for i, row in df.iterrows(): d = pd.Timestamp(row['timestamp']) newd = pd.Timestamp(d.year, d.month, d.day, d.hour, d.minute, d.second) df.at[i, 'timestamp'] = newd df.set_index(df.timestamp, inplace=True) df.index.rename('date', inplace=True) maDict = movingAverage(df.close, df, start) if start > df.index[0]: rstart = df.index[0] rend = df.index[-1] df = df.loc[df.index >= start] for ma in maDict: maDict[ma] = maDict[ma].loc[maDict[ma].index >= start] lendf = len(df) if lendf == 0: msg = '\nWARNING: all data has been removed.' msg = msg + f'\nThe Requested start was({start}).' 
msg = msg + f'\nBarchart returned data beginning {rstart} and ending {rend}' msg += '''If you are seeking a chart from today, its possible Barchart has not made''' msg += 'the data available yet. (Should be available by 4:45PM but they are occasionally late)' msg += 'You can copy the image yourself, wait, or try a different API. Open File->StockAPI' logging.warning(msg) meta['code2'] = 199 meta['message'] = meta['message'] + msg return meta, df, maDict if end < df.index[-1]: df = df.loc[df.index <= end] for ma in maDict: maDict[ma] = maDict[ma].loc[maDict[ma].index <= end] # If we just sliced off all our data. Set warning message lendf = len(df) if lendf == 0: msg = '\nWARNING: all data has been removed.' msg = msg + f'\nThe Requested end was({end}).' meta['code2'] = 199 meta['message'] = meta['message'] + msg logging.warning(f'{meta}') return meta, df, maDict deleteMe = list() for key in maDict: if key == 'vwap': continue if len(df) != len(maDict[key]): deleteMe.append(key) for key in deleteMe: del maDict[key] # Note we are dropping columns ['symbol', 'timestamp', 'tradingDay[] in favor of ohlcv df = df[['open', 'high', 'low', 'close', 'volume']].copy(deep=True) return meta, df, maDict
class TestNumericArraylikeArithmeticWithTimedeltaLike(object): # TODO: also check name retentention @pytest.mark.parametrize('box_cls', [np.array, pd.Index, pd.Series]) @pytest.mark.parametrize('left', [ pd.RangeIndex(10, 40, 10)] + [cls([10, 20, 30], dtype=dtype) for dtype in ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8', 'f2', 'f4', 'f8'] for cls in [pd.Series, pd.Index]], ids=lambda x: type(x).__name__ + str(x.dtype)) def test_mul_td64arr(self, left, box_cls): # GH#22390 right = np.array([1, 2, 3], dtype='m8[s]') right = box_cls(right) expected = pd.TimedeltaIndex(['10s', '40s', '90s']) if isinstance(left, pd.Series) or box_cls is pd.Series: expected = pd.Series(expected) result = left * right tm.assert_equal(result, expected) result = right * left tm.assert_equal(result, expected) # TODO: also check name retentention @pytest.mark.parametrize('box_cls', [np.array, pd.Index, pd.Series]) @pytest.mark.parametrize('left', [ pd.RangeIndex(10, 40, 10)] + [cls([10, 20, 30], dtype=dtype) for dtype in ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8', 'f2', 'f4', 'f8'] for cls in [pd.Series, pd.Index]], ids=lambda x: type(x).__name__ + str(x.dtype)) def test_div_td64arr(self, left, box_cls): # GH#22390 right = np.array([10, 40, 90], dtype='m8[s]') right = box_cls(right) expected = pd.TimedeltaIndex(['1s', '2s', '3s']) if isinstance(left, pd.Series) or box_cls is pd.Series: expected = pd.Series(expected) result = right / left tm.assert_equal(result, expected) result = right // left tm.assert_equal(result, expected) with pytest.raises(TypeError): left / right with pytest.raises(TypeError): left // right # TODO: de-duplicate with test_numeric_arr_mul_tdscalar def test_ops_series(self): # regression test for G#H8813 td = Timedelta('1 day') other = pd.Series([1, 2]) expected = pd.Series(pd.to_timedelta(['1 day', '2 days'])) tm.assert_series_equal(expected, td * other) tm.assert_series_equal(expected, other * td) # TODO: also test non-nanosecond timedelta64 and Tick objects; # see test_numeric_arr_rdiv_tdscalar for note on these failing @pytest.mark.parametrize('scalar_td', [ Timedelta(days=1), Timedelta(days=1).to_timedelta64(), Timedelta(days=1).to_pytimedelta()], ids=lambda x: type(x).__name__) def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box): # GH#19333 index = numeric_idx expected = pd.timedelta_range('0 days', '4 days') index = tm.box_expected(index, box) expected = tm.box_expected(expected, box) result = index * scalar_td tm.assert_equal(result, expected) commute = scalar_td * index tm.assert_equal(commute, expected) def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box): if box is not pd.Index and isinstance(three_days, pd.offsets.Tick): raise pytest.xfail("Tick division not implemented") index = numeric_idx[1:3] expected = TimedeltaIndex(['3 Days', '36 Hours']) index = tm.box_expected(index, box) expected = tm.box_expected(expected, box) result = three_days / index tm.assert_equal(result, expected) with pytest.raises(TypeError): index / three_days @pytest.mark.parametrize('other', [ pd.Timedelta(hours=31), pd.Timedelta(hours=31).to_pytimedelta(), pd.Timedelta(hours=31).to_timedelta64(), pd.Timedelta(hours=31).to_timedelta64().astype('m8[h]'), np.timedelta64('NaT'), np.timedelta64('NaT', 'D'), pd.offsets.Minute(3), pd.offsets.Second(0)]) def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box): left = tm.box_expected(numeric_idx, box) with pytest.raises(TypeError): left + other with pytest.raises(TypeError): other + left with 
pytest.raises(TypeError): left - other with pytest.raises(TypeError): other - left
def prev(self, *arg, **kwarg): """Load the previous orbit into .data. Note ---- Forms complete orbits across day boundaries. If no data loaded then the last orbit of data from the last day is loaded into .data. """ # first, check if data exists if not self.sat.empty: # set up orbit metadata self._calcOrbits() # if not close to the first orbit,just pull the previous orbit if (self._current > 2) & (self._current <= self.num): # load orbit and put it into self.sat.data self._getBasicOrbit(orbit=self._current - 1) print('Loaded Orbit:%i' % (self._current - 1)) # if current orbit near the first, must be careful elif self._current == 2: # first, load prev orbit data self._getBasicOrbit(orbit=self._current - 1) load_prev = True if self.sat._iter_type == 'date': delta = self.sat.index[-1] - self.sat.date if delta >= self.orbit_period: # don't need to load the prev day because this orbit # ends more than a orbital period from start of today's # date load_prev = False if load_prev: # need to save this current orbit and load the prev day temp_orbit_data = self.sat[self.sat.date:] # load previous day, which clears orbit breaks info try: self.sat.prev() # combine this next day orbit with previous last orbit if not self.sat.empty: self.sat.data = \ self.sat.concat_data([self.sat.data, temp_orbit_data]) # select first orbit of combined data self._getBasicOrbit(orbit=-1) else: self.sat.next() self._getBasicOrbit(orbit=1) except StopIteration: # if loading the first orbit, of first day of data, # you'll end up here as the attempt to make a full # orbit will move the date backwards, and StopIteration # is made. everything is already ok, just move along pass del temp_orbit_data print('Loaded Orbit:%i' % (self._current - 1)) elif self._current == 0: self.load(orbit=-1) return elif self._current < 2: # first, load prev orbit data self._getBasicOrbit(orbit=1) # need to save this current orbit and load the prev day temp_orbit_data = self.sat[self.sat.date:] # load previous day, which clears orbit breaks info self.sat.prev() # combine this next day orbit with previous last orbit if not self.sat.empty: load_prev = True if self.sat._iter_type == 'date': delta = self.sat.date - self.sat.index[-1] \ + pds.Timedelta('1 day') if delta >= self.orbit_period: # don't need to load the prev day because this # orbit ends more than a orbital period from start # of today's date load_prev = False if load_prev: self.sat.data = self.sat.concat_data([self.sat.data, temp_orbit_data]) # select second to last orbit of combined data self._getBasicOrbit(orbit=-2) else: # padding from the previous is needed self._getBasicOrbit(orbit=-1) if self.sat._iter_type == 'date': delta = self.sat.date - self.sat.index[-1] \ + pds.Timedelta('1 day') if delta < self.orbit_period: self._current = self.num self.prev() else: while self.sat.empty: self.sat.prev() self._getBasicOrbit(orbit=-1) del temp_orbit_data print('Loaded Orbit:%i' % (self._current - 1)) else: raise Exception(' '.join(('You ended up where nobody should', 'ever be. Talk to someone about', 'this fundamental failure or open', 'an issue at', 'www.github.com/rstonback/pysat'))) # includes hack to appear to be zero indexed else: # no data while self.sat.empty: self.sat.prev() # raises stopIteration at end of dataset self.prev()
ty.index = pd.to_datetime(ty.index)
under_fut = data['under_fut']

# fill in the calendar of the expiry dates in Python
c = calendar.Calendar(firstweekday=calendar.SATURDAY)
s_exp_dt = pd.Series(index=under_fut.index, name='OPT_EXP_DT')
for row in under_fut.itertuples():
    contract = row.Index
    del_dt = row.FUT_DLV_DT_FIRST
    monthcal = c.monthdatescalendar(del_dt.year, del_dt.month - 1)
    s_exp_dt.loc[contract] = monthcal[3][-1]  # fourth Friday of the month
under_fut[s_exp_dt.name] = pd.to_datetime(s_exp_dt)

ty = ty.merge(under_fut, right_on='OPT_EXP_DT', left_on='ticker', right_index=True)

# assign the time to expiry
ty['opt_tau_act365'] = (ty['OPT_EXP_DT'] - ty.index) / pd.Timedelta('365 days')
ty.loc[ty['OPT_EXP_DT'] < ty.index, 'opt_tau_act365'] = 0

# do some data correction: remove some days where the vols are negative and unusable
ty.drop(labels=DATES_TO_KILL, errors='ignore', inplace=True)

put_ivols = ['put_10d', 'put_25d', 'put_40d', 'put_50d', 'put_60d',
             'put_75d', 'put_90d', 'hist_put_ivol']
call_ivols = ['call_90d', 'call_75d', 'call_60d', 'call_50d', 'call_40d',
              'call_25d', 'call_10d', 'hist_call_ivol']
dt = ty.index.intersection(DATES_CALLS_TO_PUTS)
ty.loc[dt, put_ivols] = ty.loc[dt, call_ivols].values

# summarise the volatilities:
# note that the average does not work, as the in-the-money option implied
# vols are sometimes wrong
ty['atm_ivol'] = (ty['put_50d'] + ty['call_50d']) / 2 / 100
ty['10dp_ivol'] = ty['put_10d'] / 100
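# Hedged, self-contained illustration (toy dates) of the ACT/365 year-fraction trick
# used above: dividing a Timedelta (or TimedeltaIndex) by pd.Timedelta('365 days')
# yields a plain float year fraction.
import pandas as pd

quote_dates = pd.DatetimeIndex(['2021-01-04', '2021-03-01'])
expiry = pd.Timestamp('2021-06-18')
tau = (expiry - quote_dates) / pd.Timedelta('365 days')
print(tau)  # approximately [0.452, 0.299]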
def wait_time(self):
    return pd.Timedelta(milliseconds=300)
import bokeh.models as bm import bokeh.plotting as bp import datetime as dt import numpy as np import pandas as pd from pathlib import Path data_path = Path.cwd() / '../data' columns = ['time', 'open', 'close', 'high', 'low', 'volume'] candles = {col: [] for col in columns} for filename in sorted(list(data_path.glob('SSO_15*'))): o = pd.read_csv(filename) o = o.set_index(pd.DatetimeIndex(o['time'])) freq = pd.Timedelta(hours=1) g = o.groupby(pd.Grouper(freq=freq)) for name, group in g: if group.values.size != 0: open_v, close_v = group.price[-1], group.price[0] high_v, low_v = group.price.agg([np.max, np.min]) vol_v = group.volume.sum() candles["time"].append((name).strftime('%H:%M %d/%m-%y')) candles["open"].append(open_v) candles["close"].append(close_v) candles["high"].append(high_v) candles["low"].append(low_v) candles["volume"].append(vol_v) else: candles["time"].append(None) candles["open"].append(None)
def test_applymap_str(): # GH 2786 df = DataFrame(np.random.random((3, 4))) df2 = df.copy() cols = ["a", "a", "a", "a"] df.columns = cols expected = df2.applymap(str) expected.columns = cols result = df.applymap(str) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "col, val", [["datetime", Timestamp("20130101")], ["timedelta", pd.Timedelta("1 min")]], ) def test_applymap_datetimelike(col, val): # datetime/timedelta df = DataFrame(np.random.random((3, 4))) df[col] = val result = df.applymap(str) assert result.loc[0, col] == str(df.loc[0, col]) @pytest.mark.parametrize( "expected", [ DataFrame(), DataFrame(columns=list("ABC")), DataFrame(index=list("ABC")),
def str_to_timedelta(f):
    return pd.Timedelta(to_offset(f)).to_pytimedelta()
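# Hedged usage sketch: to_offset (imported here from pandas.tseries.frequencies, as
# assumed in the snippet above) parses a frequency string such as '5min'; wrapping it
# in pd.Timedelta gives a fixed duration that converts to a datetime.timedelta.
import pandas as pd
from pandas.tseries.frequencies import to_offset

print(pd.Timedelta(to_offset('5min')).to_pytimedelta())  # 0:05:00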
def load_data():
    data_df = pd.DataFrame([])
    for file in ['confirmed', 'deaths']:
        file_df = pd.read_csv(
            remote_path('raw/paho/{}.timeline.csv'.format(file)),
            index_col=[0], header=[0, 1])
        try:
            file_patch_df = pd.read_csv(
                remote_path('raw/paho/{}.timeline.daily.patch.csv'.format(file)),
                index_col=[0], header=[0, 1])
            file_df.update(file_patch_df)
        except pd.errors.EmptyDataError:
            pass

        file_df.columns.names = ['', '']
        file_df.index.name = ''
        file_df = file_df['BOL']
        file_df = file_df.rename(ADM1_NAME, axis=1)
        file_df.index = pd.to_datetime(file_df.index)
        file_df.index = file_df.index - pd.Timedelta(days=1)
        file_df = file_df[COLUMNS_ORDER]
        file_df.columns = pd.MultiIndex.from_product(
            [[CASES_DATA_NAME[file]], file_df.columns])

        # Errors in the data
        file_df = file_df.astype(np.float64)
        file_df = file_df.drop_duplicates().asfreq('D')
        file_df = file_df.interpolate('from_derivatives', limit_area='inside')
        file_df[(file_df.diff() < 0).shift(-1).fillna(False)] = np.nan
        file_df = file_df.interpolate('from_derivatives', limit_area='inside')
        file_df[file_df.diff() < 0] = np.nan
        file_df = file_df.interpolate('from_derivatives', limit_area='inside')
        file_df = file_df.round().dropna(how='all')

        data_df = pd.concat([data_df, file_df], axis=1)

    data_df = data_df.sort_index()
    data_df = data_df.fillna(method='ffill')

    # Here the definition of a recovered case is changed to all cases 14 days
    # after being diagnosed (should it be 10?)
    active_cases = data_df['confirmados'].diff().rolling(window=14).sum()
    active_cases = active_cases.fillna(data_df['confirmados'])
    active_cases.columns = pd.MultiIndex.from_product(
        [['activos'], active_cases.columns])
    data_df = pd.concat([data_df, active_cases], axis=1)

    recovered_cases = data_df['confirmados'].shift(periods=14)
    recovered_cases = recovered_cases - data_df['decesos']
    recovered_cases[recovered_cases < 0] = 0
    recovered_cases.columns = pd.MultiIndex.from_product(
        [['recuperados'], recovered_cases.columns])
    data_df = pd.concat([data_df, recovered_cases], axis=1)

    # Testing
    pending, discarded = load_testing_data()
    pending.columns = pd.MultiIndex.from_product(
        [['sospechosos'], pending.columns])
    data_df = pd.concat([data_df, pending], axis=1)
    discarded.columns = pd.MultiIndex.from_product(
        [['descartados'], discarded.columns])
    data_df = pd.concat([data_df, discarded], axis=1)

    data_df = data_df.rename(
        {
            'confirmados': 'cases',
            'decesos': 'death',
            'activos': 'active_cases',
            'recuperados': 'recovered',
            'sospechosos': 'pending',
            'descartados': 'discarded'
        },
        axis=1)
    data_df = data_df.loc[:data_df['cases'].last_valid_index()]
    return data_df
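# Small, self-contained sketch (toy numbers, not the PAHO data above) of the "active
# cases" definition used in load_data(): a case counts as active for 14 days after
# being reported, i.e. the rolling 14-day sum of daily new cases.
import pandas as pd

cumulative = pd.Series(range(0, 200, 10),
                       index=pd.date_range('2020-06-01', periods=20, freq='D'))
active = cumulative.diff().rolling(window=14).sum()
active = active.fillna(cumulative)  # before day 14, fall back to the cumulative count
print(active.tail())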
def main(mytimer: func.TimerRequest) -> None: utc_timestamp = datetime.datetime.utcnow().replace( tzinfo=datetime.timezone.utc).isoformat() if mytimer.past_due: logging.info('The timer is past due!') params = urllib.parse.quote_plus( r'Driver={ODBC Driver 17 for SQL Server};Server=tcp:covid19dbserver.database.windows.net,1433;Database=covid19db;Uid=serveradmin@covid19dbserver;Pwd=pzaGuPujnkUnDqZFbWt5;Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;' ) conn_str = 'mssql+pyodbc:///?odbc_connect={}'.format(params) engine = create_engine(conn_str, echo=False) key_cols_candidates = [ 'Country/Region', 'Province/State', 'District', 'federalstate', 'date' ] training_period = 21 forecasting_days = 3 forecast_col = 'infections' for table_name in ['Hopkins', 'ECDC', 'HopkinsTS', 'RKI']: logging.info(f"Processing table {table_name}...") # for table_name in ['RKI']: df = pd.read_sql_table(table_name, engine) df = df.drop([ 'Province/State', 'District', 'FIPS', 'Lat', 'Long', 'deaths', 'recovered', 'ID' ], axis=1, errors='ignore') string_cols = df.dtypes[df.dtypes == 'object'].index string_col_replacement = {key: "None" for key in string_cols} df = df.fillna(string_col_replacement) key_cols = [col for col in key_cols_candidates if col in df.columns] df = df.groupby(by=key_cols).sum().reset_index() for col in string_cols: if col in df.columns: df.loc[df[col].str.contains("None"), col] = pd.NA country_col = 'Country/Region' if table_name == 'RKI': country_col = 'federalstate' for country in df[country_col].unique(): logging.debug(f"Computing forecasts for country {country}") df_country = df[df[country_col] == country] y = df_country.sort_values( by='date')[forecast_col].values[-training_period:] if (y < 10).all(): # All values < 10. No good forecast possible continue if len(y) < training_period: # Did not find a lot of datapoints continue x = range(len(y)) x_forecast = range(len(y), len(y) + forecasting_days) try: (a_scipy, b_scipy), _ = curve_fit(lambda t, a, b: a * np.exp(b * t), x, y) except Exception as e: logging.info(table_name, country, y) logging.error(e) continue def exp_scipy(x): return a_scipy * np.exp(b_scipy * x) y_forecast = exp_scipy(x_forecast) df_result = pd.DataFrame() df_result['forecast_infections'] = y_forecast one_day_delta = pd.Timedelta(value=1, unit='d') df_result['date'] = pd.date_range(start=df_country.date.max() + one_day_delta, periods=forecasting_days, freq='d') df_result['forecast_infections'] = y_forecast df_result[country_col] = country today = datetime.datetime.now() df_result['forecasting_date'] = today dtype_dict = {} for str_col in string_cols: if (str_col in df_result.columns and df_result[str_col].notnull().sum() > 0): # print(col) df_result.loc[df_result[str_col].notnull(), str_col] = df_result.loc[ df_result[str_col].notnull(), str_col].str.slice(start=0, stop=99) dtype_dict[str_col] = sqlalchemy.types.NVARCHAR(length=100) logging.debug("Computed forecasts.") logging.debug("Writing forecast to database...") df_result.to_sql(f"{table_name}_forecast", engine, if_exists='append', index=False, dtype=dtype_dict) logging.debug("Wrote forecast to database.") # TODO: Write merge statement to update into f{table_name}_forecast or just run once a day logging.info('Python timer trigger function ran at %s', utc_timestamp)
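# Hedged, standalone sketch (synthetic data, not the database tables above) of the
# forecasting step in main(): fit y = a * exp(b * t) with scipy.optimize.curve_fit,
# then build forecast dates by extending the last observed date with pd.Timedelta
# and pd.date_range.
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit

y = np.array([12, 15, 20, 27, 36, 48, 64, 85], dtype=float)  # toy infection counts
x = np.arange(len(y))
(a, b), _ = curve_fit(lambda t, a, b: a * np.exp(b * t), x, y, p0=(1.0, 0.1))

forecasting_days = 3
x_forecast = np.arange(len(y), len(y) + forecasting_days)
y_forecast = a * np.exp(b * x_forecast)

last_date = pd.Timestamp('2020-04-01')
dates = pd.date_range(start=last_date + pd.Timedelta(value=1, unit='d'),
                      periods=forecasting_days, freq='d')
print(pd.DataFrame({'date': dates, 'forecast_infections': y_forecast}))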
'datetime64[s]'), period=period) edges = (pd.DatetimeIndex([t_interval_start]).append(t_intervals_end[:-1]), pd.DatetimeIndex(t_intervals_end)) for i, probe in enumerate(probes): probe_name = f'{prefix}{probe:02}' # table name in db l.info('Draw %s in Veusz: %d intervals...', probe_name, edges[0].size) # for i_interval, (t_interval_start, t_interval_end) in enumerate(zip(pd.DatetimeIndex([t_interval_start]).append(t_intervals_end[:-1]), t_intervals_end), start=1): cfg_vp = {'veusze': None} for i_interval, (t_interval_start, t_interval_end) in enumerate(zip(*edges), start=1): # if i_interval < 23: #<= 0: # TEMPORARY Skip this number of intervals # continue if period != length: t_interval_start = t_interval_end - pd.Timedelta(dt_custom_s, 's') try: # skipping absent probes start_end = h5q_interval2coord( db_path=str(db_path), table=f'/{probe_name}', t_interval=(t_interval_start, t_interval_end)) if not len(start_end): break # no data except KeyError: break # device name not in specified range, go to next name pattern_path_new = pattern_path.with_name(f"{t_interval_start:%y%m%d_%H%M}_{length}_{probe_name}.vsz") # Modify pattern file if not b_images_only:
def test_indexing_with_datetime_tz(self): # 8260 # support datetime64 with tz idx = Index(date_range('20130101', periods=3, tz='US/Eastern'), name='foo') dr = date_range('20130110', periods=3) df = DataFrame({'A': idx, 'B': dr}) df['C'] = idx df.iloc[1, 1] = pd.NaT df.iloc[1, 2] = pd.NaT # indexing result = df.iloc[1] expected = Series([ Timestamp('2013-01-02 00:00:00-0500', tz='US/Eastern'), np.nan, np.nan ], index=list('ABC'), dtype='object', name=1) tm.assert_series_equal(result, expected) result = df.loc[1] expected = Series([ Timestamp('2013-01-02 00:00:00-0500', tz='US/Eastern'), np.nan, np.nan ], index=list('ABC'), dtype='object', name=1) tm.assert_series_equal(result, expected) # indexing - fast_xs df = DataFrame({'a': date_range('2014-01-01', periods=10, tz='UTC')}) result = df.iloc[5] expected = Timestamp('2014-01-06 00:00:00+0000', tz='UTC', freq='D') assert result == expected result = df.loc[5] assert result == expected # indexing - boolean result = df[df.a > df.a[3]] expected = df.iloc[4:] tm.assert_frame_equal(result, expected) # indexing - setting an element df = DataFrame(data=pd.to_datetime( ['2015-03-30 20:12:32', '2015-03-12 00:11:11']), columns=['time']) df['new_col'] = ['new', 'old'] df.time = df.set_index('time').index.tz_localize('UTC') v = df[df.new_col == 'new'].set_index('time').index.tz_convert( 'US/Pacific') # trying to set a single element on a part of a different timezone # this converts to object df2 = df.copy() df2.loc[df2.new_col == 'new', 'time'] = v expected = Series([v[0], df.loc[1, 'time']], name='time') tm.assert_series_equal(df2.time, expected) v = df.loc[df.new_col == 'new', 'time'] + pd.Timedelta('1s') df.loc[df.new_col == 'new', 'time'] = v tm.assert_series_equal(df.loc[df.new_col == 'new', 'time'], v)
    'dataset': dataset,
    'grid_res': 2.5,
    'startyear': 1979,  # download startyear
    'endyear': 2020,  # download endyear
    'months': list(range(1, 12 + 1)),  # download months
    # for monthly means of daily means, choose 'moda' or 'mnth'
    # for daily means choose 'oper' or 'enda' (for accumulations)
    'stream': 'oper',
    'time': pd.date_range(start='00:00', end='23:00',
                          freq=(pd.Timedelta(3, unit='h'))),
    'area': 'global',  # [North, West, South, East]. Default: global
    'CDO_command': 'daymean',
    'base_path': base_path,
    'path_raw': path_raw
})

if ex['dataset'] == 'ERAint' or ex['dataset'] == 'era20c':
    import download_ERA_interim_API as ECMWF
elif ex['dataset'] == 'era5':
    import download_ERA5_API as ECMWF
def dates_to_idx(timelist):
    reference_time = pd.to_datetime('1958-03-15')
    # express elapsed time in fractional years; modern pandas no longer accepts the
    # ambiguous 'Y' unit in pd.Timedelta, so use an explicit average year length
    t = (timelist - reference_time) / pd.Timedelta(365.2425, "D")
    return np.asarray(t)
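# Tiny usage sketch (assumed inputs): dates_to_idx converts datetimes to fractional
# years since the reference date, which is convenient as a continuous regressor.
import pandas as pd

dates = pd.to_datetime(['1959-03-15', '1968-03-15'])
print(dates_to_idx(dates))  # roughly [1.0, 10.0]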
def test_examples2(self): """ doc-string examples """ trades = pd.DataFrame( { 'time': pd.to_datetime([ '20160525 13:30:00.023', '20160525 13:30:00.038', '20160525 13:30:00.048', '20160525 13:30:00.048', '20160525 13:30:00.048' ]), 'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'], 'price': [51.95, 51.95, 720.77, 720.92, 98.00], 'quantity': [75, 155, 100, 100, 100] }, columns=['time', 'ticker', 'price', 'quantity']) quotes = pd.DataFrame( { 'time': pd.to_datetime([ '20160525 13:30:00.023', '20160525 13:30:00.023', '20160525 13:30:00.030', '20160525 13:30:00.041', '20160525 13:30:00.048', '20160525 13:30:00.049', '20160525 13:30:00.072', '20160525 13:30:00.075' ]), 'ticker': [ 'GOOG', 'MSFT', 'MSFT', 'MSFT', 'GOOG', 'AAPL', 'GOOG', 'MSFT' ], 'bid': [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], 'ask': [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03] }, columns=['time', 'ticker', 'bid', 'ask']) pd.merge_asof(trades, quotes, on='time', by='ticker') pd.merge_asof(trades, quotes, on='time', by='ticker', tolerance=pd.Timedelta('2ms')) expected = pd.DataFrame( { 'time': pd.to_datetime([ '20160525 13:30:00.023', '20160525 13:30:00.038', '20160525 13:30:00.048', '20160525 13:30:00.048', '20160525 13:30:00.048' ]), 'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'], 'price': [51.95, 51.95, 720.77, 720.92, 98.00], 'quantity': [75, 155, 100, 100, 100], 'bid': [np.nan, 51.97, np.nan, np.nan, np.nan], 'ask': [np.nan, 51.98, np.nan, np.nan, np.nan] }, columns=['time', 'ticker', 'price', 'quantity', 'bid', 'ask']) result = pd.merge_asof(trades, quotes, on='time', by='ticker', tolerance=pd.Timedelta('10ms'), allow_exact_matches=False) assert_frame_equal(result, expected)
plt.pause(2)

df_path = pd.read_csv('../spencers_data/path1.csv')
df_path['Start_time'] = pd.to_datetime(df_path['Start_time'])
df_path['Start_time'] = df_path['Start_time'].apply(
    lambda x: x.strftime('%Y-%m-13 %H:%M:%S'))
df_path['Start_time'] = pd.to_datetime(df_path['Start_time'])

count = len(df_path)
for index, row in df_path.iterrows():
    if (row['Duration (min)'] == 2):
        df_path.loc[index, 'Duration (min)'] = 1
        df_path.loc[count, :] = df_path.loc[index, :]
        df_path.loc[count, 'Start_time'] = \
            df_path.loc[count, 'Start_time'] + pd.Timedelta('1 minute')
        count += 1

df_path.sort_values('Start_time', inplace=True)
df_path.reset_index(drop=True, inplace=True)
df_path['Start_time'] = df_path['Start_time'] + pd.Timedelta('12 hours')
# print (df_path)

f = ['micromax', 'moto', 'oneplus', 'samsung', 'yureka']
for file in f:
    count = 0
    df_loc_track = pd.read_csv(
        '../spencers_data/device_modified_logs_min/' + str(file) + '.csv')
    df_loc_track['ts'] = pd.to_datetime(df_loc_track['ts'])
    df_loc_track['ts'] = df_loc_track['ts'] + \
import pandas as pd
from six.moves.urllib.parse import urlencode

from catalyst.data.bundles.core import register_bundle
from catalyst.data.bundles.base_pricing import BaseEquityPricingBundle
from catalyst.utils.memoize import lazyval

"""
Module for building a complete daily dataset from Quandl's WIKI dataset.
"""
from logbook import Logger

from catalyst.constants import LOG_LEVEL
from catalyst.utils.calendars import register_calendar_alias

log = Logger(__name__, level=LOG_LEVEL)

seconds_per_call = (pd.Timedelta('10 minutes') / 2000).total_seconds()


class QuandlBundle(BaseEquityPricingBundle):
    @lazyval
    def name(self):
        return 'quandl'

    @lazyval
    def exchange(self):
        return 'QUANDL'

    @lazyval
    def frequencies(self):
        return set(('daily', ))
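# Quick arithmetic check (assuming, as the constant above implies, a limit of 2000
# API calls per 10 minutes): spreading calls evenly gives one call every 0.3 seconds.
import pandas as pd

seconds_per_call = (pd.Timedelta('10 minutes') / 2000).total_seconds()
print(seconds_per_call)  # 0.3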
def preprocessing(gross, social):
    """
    In this function, we merge two datasets together for further visualization tasks

    :param gross: the cleaned Broadway Grosses Data Set
    :param social: the cleaned Broadway Social Stats Data Set
    :return: the merged data set
    """
    # Match the dates from the two data sets
    gross_date = [x for x in gross['week_ending'].unique()
                  if x[0:4] in ['2019', '2018', '2017']]
    temp_date = [x.date().strftime('%Y-%m-%d')
                 for x in list(pd.to_datetime(gross_date) + pd.Timedelta(1, unit='d'))]
    gross_date = gross_date + temp_date

    names = list(social.columns)
    names.append('this_week_gross')
    df = pd.DataFrame(columns=names)
    for i in social['Date'].unique():
        temp = pd.to_datetime(i).date().strftime('%Y-%m-%d')
        if temp in gross_date:
            temp_df = social.loc[social['Date'] == temp, :]
            c = gross.loc[(gross['week_ending'] == temp),
                          ['week_ending', 'show', 'this_week_gross']]
            for j in temp_df.Show:
                for k in c.show:
                    if j in k:
                        temp_df.loc[(temp_df['Show'] == j), 'this_week_gross'] = \
                            c.loc[c['show'] == k, 'this_week_gross'].values
                    elif k in j:
                        temp_df.loc[(temp_df['Show'] == j), 'this_week_gross'] = \
                            c.loc[c['show'] == k, 'this_week_gross'].values
            df = pd.concat([df, temp_df], ignore_index=True)
    df_notnull = df.dropna()
    return df_notnull
def setup_perioddata_group(start_date_time, end_date_time=None, nper=1, perlen=None, model_time_units=None, freq=None, steady={0: True, 1: False}, nstp=10, tsmult=1.5, oc_saverecord={0: ['save head last', 'save budget last']}, ): """Sets up time discretization for a model; outputs a DataFrame with stress period dates/times and properties. Stress periods can be established with an established explicitly by specifying perlen as a list of period lengths in model units. Or, stress periods can be established using three of the start_date, end_date_time, nper, and freq arguments, similar to the pandas.date_range function. (see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html) Parameters ---------- start_date_time_time : str or datetime-like Left bound for generating stress period dates. See pandas documenation. end_date_time : str or datetime-like, optional Right bound for generating stress period dates. See pandas documenation. nper : int, optional Number of stress periods. Only used if perlen is None, or in combination with freq if an end_date_time isn't specified. perlen : sequence or None, optional A list of stress period lengths in model time units. Or specify as None and specify 3 of start_date_time, end_date_time, nper and/or freq. model_time_units : str, optional 'days' or 'seconds'. freq : str or DateOffset, default None For setting up uniform stress periods between a start and end date, or of length nper. Same as argument to pandas.date_range. Frequency strings can have multiples, e.g. ‘6MS’ for a 6 month interval on the start of each month. See the pandas documentation for a list of frequency aliases. Note: Only "start" frequences (e.g. MS vs M for "month end") are supported. steady : dict Dictionary with zero-based stress periods as keys and boolean values. Similar to MODFLOW-6 input, the information specified for a period will continue to apply until information for another period is specified. nstp : int or sequence Number of timesteps in a stress period. Must be an integer if perlen=None. nstp : int or sequence Timestep multiplier for a stress period. Must be an integer if perlen=None. oc_saverecord : dict Dictionary with zero-based stress periods as keys and output control options as values. Similar to MODFLOW-6 input, the information specified for a period will continue to apply until information for another perior is specified. Returns ------- perrioddata : pandas.DataFrame DataFrame summarizing stress period information. 
Data columns: ================== ================ ============================================== **start_datetime** pandas datetimes start date/time of each stress period (does not include steady-state periods) **end_datetime** pandas datetimes end date/time of each stress period (does not include steady-state periods) **time** float cumulative MODFLOW time at end of period (includes steady-state periods) **per** int zero-based stress period **perlen** float stress period length in model time units **nstp** int number of timesteps in the stress period **tsmult** int timestep multiplier for stress period **steady** bool True=steady-state, False=Transient **oc** dict MODFLOW-6 output control options ================== ================ ============================================== """ # todo: refactor/simplify setup_perioddata_group freq = convert_freq_to_period_start(freq) oc = oc_saverecord if not isinstance(steady, dict): steady = {i: v for i, v in enumerate(steady)} txt = "Specify perlen as a list of lengths in model units, or\nspecify 3 " \ "of start_date_time, end_date_time, nper and/or freq." # Explicitly specified stress period lengths if perlen is not None: if np.isscalar(perlen): perlen = [perlen] datetimes = [pd.Timestamp(start_date_time)] if len(perlen) > 1: for i, length in enumerate(perlen[1:]): datetimes.append(datetimes[i] + pd.Timedelta(length, unit=model_time_units)) time = np.cumsum(perlen) # time in MODFLOW units elif nper == 1 and steady[0]: perlen = [1] time = [1] #datetimes = [pd.Timestamp(start_date_time)] # Set up datetimes based on 3 of start_date_time, end_date_time, nper and/or freq (scalar perlen) else: assert np.isscalar(nstp), "nstp: {}; nstp must be a scalar if perlen " \ "is not specified explicitly as a list.\n{}".format(nstp, txt) assert np.isscalar(tsmult), "tsmult: {}; tsmult must be a scalar if perlen " \ "is not specified explicitly as a list.\n{}".format(tsmult, txt) periods = None if end_date_time is None: # start_date_time, periods and freq # (i.e. nper periods of length perlen starting on stat_date) if freq is not None: periods = nper else: raise ValueError("Unrecognized input for perlen: {}.\n{}".format(perlen, txt)) else: # end_date_time and freq and periods if start_date_time is None: periods = nper + 1 # start_date_time, end_date_time and (linearly spaced) periods # (i.e. nper periods of uniform length between start_date_time and end_date_time) elif freq is None: periods = nper #-1 if steady[0] else nper # start_date_time, end_date_time and frequency elif freq is not None: pass datetimes = pd.date_range(start_date_time, end_date_time, periods=periods, freq=freq) if start_date_time is None: start_date_time = datetimes[0] # in case end_date_time, periods and freq were specified if len(datetimes) == 1: perlen = [(pd.Timestamp(end_date_time) - pd.Timestamp(start_date_time)).days] time = np.array(perlen) else: # time is at the end of each stress period time = getattr((datetimes - pd.Timestamp(start_date_time)), model_time_units).tolist() # get the last (end) time, if it wasn't included in datetimes if datetimes[0] == pd.Timestamp(start_date_time) and nper is None: if end_date_time is not None: # + 1 for consistency with using date_range below # e.g. 
to end at 2019-01-01 instead of 2018-12-31 last_time = getattr((pd.Timestamp(end_date_time) - pd.Timestamp(start_date_time)), model_time_units) + 1 else: end_datetimes = pd.date_range(start_date_time, periods=len(datetimes) + 1, freq=freq) last_time = getattr((end_datetimes[-1] - pd.Timestamp(start_date_time)), model_time_units) if last_time != time[-1]: time += [last_time] if time[0] != 0: time = [0] + list(time) perlen = np.diff(time) time = np.array(time[1:]) assert len(perlen) == len(time) # == len(datetimes) # if first period is steady-state, # insert it at the beginning of the generated range # this should only apply to cases where nper > 1 if steady[0]: #datetimes = [datetimes[0]] + datetimes.tolist() # datetimes[:-1].tolist() perlen = [1] + list(perlen) time = [1] + (time + 1).tolist() else: pass #datetimes = datetimes[:-1] #perlen = np.diff(time).tolist() #time = time[1:] perioddata = pd.DataFrame({#'datetime': datetimes, 'time': time, 'per': range(len(time)), 'perlen': np.array(perlen).astype(float), 'nstp': nstp, 'tsmult': tsmult, }) # specify steady-state or transient for each period, filling empty # periods with previous state (same logic as MF6 input) issteady = [steady[0]] for i in range(len(perioddata)): issteady.append(steady.get(i, issteady[i])) perioddata['steady'] = issteady[1:] # set up output control, using previous value to fill empty periods # (same as MF6) oclist = [None] for i in range(len(perioddata)): oclist.append(oc.get(i, oclist[i])) perioddata['oc'] = oclist[1:] # create start and end datetime columns; # correct the datetime to only increment for transient stress periods start_datetime = [pd.Timestamp(start_date_time)] end_datetime = [] for i, r in perioddata.iterrows(): if r.steady: end_datetime.append(start_datetime[i]) else: end_datetime.append(start_datetime[i] + pd.Timedelta(r.perlen, unit=model_time_units)) start_datetime.append(end_datetime[i]) perioddata['start_datetime'] = start_datetime[:-1] perioddata['end_datetime'] = end_datetime cols = ['start_datetime', 'end_datetime', 'time', 'per', 'perlen', 'nstp', 'tsmult', 'steady', 'oc'] #perioddata = perioddata.drop('datetime', axis=1)[cols] # correct nstp and tsmult to be 1 for steady-state periods perioddata.loc[perioddata.steady, 'nstp'] = 1 perioddata.loc[perioddata.steady, 'tsmult'] = 1 return perioddata
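# Minimal sketch (toy arguments, not the full setup_perioddata_group function above)
# of how uniform stress periods fall out of pandas.date_range: elapsed model time is
# taken relative to the start date, and period lengths come from np.diff. 'MS' is a
# "month start" frequency, per the docstring's note on start frequencies.
import numpy as np
import pandas as pd

start_date_time, end_date_time, freq = '2019-01-01', '2020-01-01', 'MS'
datetimes = pd.date_range(start_date_time, end_date_time, freq=freq)
time = getattr(datetimes - pd.Timestamp(start_date_time), 'days').tolist()
perlen = np.diff(time)           # stress-period lengths in days
print(len(perlen), perlen[:3])   # 12 periods: [31 28 31]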
def lead(n):
    m = mod.isel(Time=n)
    m['start'] = m.start + pd.Timedelta(n, 'd')
    return m.resample(offset, 'start', how='mean').to_series()
def next(self, *arg, **kwarg): """Load the next orbit into .data. Note ---- Forms complete orbits across day boundaries. If no data loaded then the first orbit from the first date of data is returned. """ # first, check if data exists if not self.sat.empty: # set up orbit metadata self._calcOrbits() # if current orbit near the last, must be careful if self._current == (self.num - 1): # first, load last orbit data self._getBasicOrbit(orbit=-1) # End of orbit may occur on the next day load_next = True if self.sat._iter_type == 'date': delta = self.sat.date - self.sat.index[-1] \ + pds.Timedelta('1 day') if delta >= self.orbit_period: # don't need to load the next day because this orbit # ends more than a orbital period from the next date load_next = False if load_next: # the end of the user's desired orbit occurs tomorrow, need # to form a complete orbit save this current orbit, load # the next day, combine data, select the correct orbit temp_orbit_data = self.sat.copy() try: # loading next day/file clears orbit breaks info self.sat.next() if not self.sat.empty: # combine this next day's data with previous last # orbit, grab the first one final_val = self.sat.index[0] \ - pds.DateOffset(microseconds=1) self.sat.data = self.sat.concat_data( [temp_orbit_data[:final_val], self.sat.data]) self._getBasicOrbit(orbit=1) else: # no data, go back a day and grab the last orbit. # As complete as orbit can be self.sat.prev() self._getBasicOrbit(orbit=-1) except StopIteration: pass del temp_orbit_data # includes hack to appear to be zero indexed print('Loaded Orbit:%i' % (self._current - 1)) elif self._current == (self.num): # at the last orbit, need to be careful about getting the next # orbit save this current orbit and load the next day # temp_orbit_data = self.sat.data.copy() temp_orbit_data = self.sat.copy() # load next day, which clears orbit breaks info self.sat.next() # combine this next day orbit with previous last orbit to # ensure things are correct if not self.sat.empty: pad_next = True # check if data padding is really needed, only works when # loading by date if self.sat._iter_type == 'date': delta = self.sat.date - temp_orbit_data.index[-1] if delta >= self.orbit_period: # the end of the previous orbit is more than an # orbit away from today we don't have to worry # about it pad_next = False if pad_next: # orbit went across day break, stick old orbit onto new # data and grab second orbit (first is old) self.sat.data = self.sat.concat_data( [temp_orbit_data[:self.sat.index[0] - pds.DateOffset(microseconds=1)], self.sat.data]) # select second orbit of combined data self._getBasicOrbit(orbit=2) else: # padding from the previous orbit wasn't needed, can # just grab the first orbit of loaded data self._getBasicOrbit(orbit=1) if self.sat._iter_type == 'date': delta = self.sat.date + pds.DateOffset(days=1) \ - self.sat.index[0] if delta < self.orbit_period: # this orbits end occurs on the next day, # though we grabbed the first orbit, missing # data means the first available orbit in the # datais actually the last for the day. # Resetting to the second to last orbit and t # hen callingnext() will get the last orbit, # accounting for tomorrow's data as well. 
self._current = self.num - 1 self.next() else: # no data for the next day # continue loading data until there is some # nextData raises StopIteration when it reaches the end, # leaving this function while self.sat.empty: self.sat.next() self._getBasicOrbit(orbit=1) del temp_orbit_data # includes hack to appear to be zero indexed print('Loaded Orbit:%i' % (self._current - 1)) elif self._current == 0: # no current orbit set, grab the first one # using load command to specify the first orbit, which # automatically loads prev day if needed to form complete orbit self.load(orbit=1) elif self._current < (self.num - 1): # since we aren't close to the last orbit, just pull the next # orbit self._getBasicOrbit(orbit=self._current + 1) # includes hack to appear to be zero indexed print('Loaded Orbit:%i' % (self._current - 1)) else: raise Exception(' '.join(('You ended up where nobody should', 'ever be. Talk to someone about', 'this fundamental failure or open', 'an issue at', 'www.github.com/rstonback/pysat'))) else: # no data while self.sat.empty: # keep going until data is found # next raises stopIteration at end of data set, no more data # possible self.sat.next() # we've found data, grab the next orbit self.next()
def df_stability_metrics( df, time_axis, features=None, binning="auto", bin_specs=None, time_width=None, time_offset=0, var_dtype=None, reference_type="self", reference=None, window=10, shift=1, monitoring_rules=None, pull_rules=None, **kwargs, ): """Create a data stability monitoring html datastore for given pandas or spark dataframe. :param df: input pandas/spark dataframe to be profiled and monitored over time. :param str time_axis: name of datetime feature, used as time axis, eg 'date'. if True, will be auto-guessed. If time_axis is set or found, and if no features provided, features becomes: ['date:x', 'date:y', 'date:z'] etc. :param list features: columns to pick up from input data. (default is all features). For multi-dimensional histograms, separate the column names with a ':'. Example features list is: .. code-block:: python features = ['x', 'date', 'date:x', 'date:y', 'date:x:y'] :param str binning: default binning to revert to in case bin_specs not supplied. options are: "unit" or "auto", default is "auto". When using "auto", semi-clever binning is automatically done. :param dict bin_specs: dictionaries used for rebinning numeric or timestamp features. An example bin_specs dictionary is: .. code-block:: python bin_specs = {'x': {'bin_width': 1, 'bin_offset': 0}, 'y': {'num': 10, 'low': 0.0, 'high': 2.0}, 'x:y': [{}, {'num': 5, 'low': 0.0, 'high': 1.0}]} In the bin specs for x:y, x is not provided (here) and reverts to the 1-dim setting. The 'bin_width', 'bin_offset' notation makes an open-ended histogram (for that feature) with given bin width and offset. The notation 'num', 'low', 'high' gives a fixed range histogram from 'low' to 'high' with 'num' number of bins. :param time_width: bin width of time axis. str or number (ns). note: bin_specs takes precedence. (optional) .. code-block:: text Examples: '1w', 3600e9 (number of ns), anything understood by pd.Timedelta(time_width).value :param time_offset: bin offset of time axis. str or number (ns). note: bin_specs takes precedence. (optional) .. code-block:: text Examples: '1-1-2020', 0 (number of ns since 1-1-1970), anything parsed by pd.Timestamp(time_offset).value :param dict var_dtype: dictionary with specified datatype per feature. auto-guessed when not provided. :param reference_type: type or reference used for comparisons. Options [self, external, rolling, expanding]. default is 'self'. :param reference: reference dataframe or histograms. default is None :param int window: size of rolling window and/or trend detection. default is 10. :param int shift: shift of time-bins in rolling/expanding window. default is 1. :param dict monitoring_rules: monitoring rules to generate traffic light alerts. The default setting is: .. code-block:: python monitoring_rules = {"*_pull": [7, 4, -4, -7], "*_zscore": [7, 4, -4, -7], "[!p]*_unknown_labels": [0.5, 0.5, 0, 0]} Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the feature name in front. E.g. .. code-block:: python monitoring_rules = {"featureA:*_pull": [5, 3, -3, -5], "featureA:nan": [4, 1, 0, 0], "*_pull": [7, 4, -4, -7], "nan": [8, 1, 0, 0]} In case of multiple rules could apply for a feature's statistic, the most specific one applies. 
So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule for all other features. :param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report. Default is: .. code-block:: python pull_rules = {"*_pull": [7, 4, -4, -7]} This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean, and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean. Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. (The same string logic applies as for monitoring_rules.) :param kwargs: residual keyword arguments, passed on to stability_report() :return: dict with results of metrics pipeline """ # basic checks on presence of time_axis if not (isinstance(time_axis, str) and len(time_axis) > 0) and not ( isinstance(time_axis, bool) and time_axis ): raise TypeError("time_axis needs to be a filled string or set to True") if isinstance(time_axis, str) and time_axis not in df.columns: raise ValueError(f'time_axis "{time_axis}" not found in columns of dataframe.') if reference is not None and not isinstance(reference, dict): if isinstance(time_axis, str) and time_axis not in reference.columns: raise ValueError( f'time_axis "{time_axis}" not found in columns of reference dataframe.' ) if isinstance(time_axis, bool): time_axes = get_time_axes(df) num = len(time_axes) if num == 1: time_axis = time_axes[0] logger.info(f'Time-axis automatically set to "{time_axis}"') elif num == 0: raise RuntimeError( "No obvious time-axes found. Cannot generate stability report." ) else: raise RuntimeError( f"Found {num} time-axes: {time_axes}. Set *one* time_axis manually!" ) if features is not None: # by now time_axis is defined. ensure that all histograms start with it. if not isinstance(features, list): raise TypeError( "features should be list of columns (or combos) to pick up from input data." ) features = [ c if c.startswith(time_axis) else f"{time_axis}:{c}" for c in features ] # interpret time_width and time_offset if isinstance(time_width, (str, int, float)) and isinstance( time_offset, (str, int, float) ): if not isinstance(bin_specs, (type(None), dict)): raise RuntimeError("bin_specs object is not a dictionary") if bin_specs is None: bin_specs = {} if time_axis in bin_specs: raise RuntimeError( f'time-axis "{time_axis}" already found in binning specifications.' ) # convert time width and offset to nanoseconds time_specs = { "bin_width": float(pd.Timedelta(time_width).value), "bin_offset": float(pd.Timestamp(time_offset).value), } bin_specs[time_axis] = time_specs reference_hists = None if reference is not None: reference_type = "external" if isinstance(reference, dict): # 1. reference is dict of histograms # extract features and bin_specs from reference histograms reference_hists = reference features = list(reference_hists.keys()) bin_specs = get_bin_specs(reference_hists) else: # 2. reference is pandas or spark dataframe # generate histograms and return updated features, bin_specs, time_axis, etc. 
( reference_hists, features, bin_specs, time_axis, var_dtype, ) = make_histograms( reference, features, binning, bin_specs, time_axis, var_dtype, ret_specs=True, ) # use the same features, bin_specs, time_axis, etc as for reference hists hists = make_histograms( df, features=features, binning=binning, bin_specs=bin_specs, time_axis=time_axis, var_dtype=var_dtype, ) # generate data stability report return stability_metrics( hists, reference_type, reference_hists, time_axis, window, shift, monitoring_rules, pull_rules, features, **kwargs, )
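# Hedged illustration of the time_width / time_offset conversion described in the
# df_stability_metrics docstring above: both are reduced to nanoseconds via
# pd.Timedelta(...).value and pd.Timestamp(...).value before being stored as the
# time-axis bin specification.
import pandas as pd

time_specs = {
    "bin_width": float(pd.Timedelta("1w").value),        # 604800000000000.0 ns
    "bin_offset": float(pd.Timestamp("1-1-2020").value),  # ns since 1970-01-01
}
print(time_specs)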
def test_timedelta_with_nulls(self):
    df = pd.DataFrame(
        {'test': [pd.Timedelta('1 day'), None, pd.Timedelta('3 day')]})
    self._check_pandas_roundtrip(df, null_counts=[1, 1])
def test_str_to_timestamp_rounds_up(timestamp):
    offset = timestamp - pd.Timedelta(minutes=45)
    as_str = offset.isoformat()
    assert str_to_timestamp(as_str) == timestamp
def merge_tti_feature(df_base, df_feature, n_shift):
    df_feature['time'] = df_feature['time'].apply(
        lambda x: x + pd.Timedelta(minutes=10 * n_shift))
    df_train_feat = pd.merge(df_base, df_feature, on='time', how='left')
    df_train_feat.interpolate(inplace=True)
    return df_train_feat
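# Hedged usage sketch (toy frames, assumed 10-minute sampling): shifting the feature
# timestamps by n_shift 10-minute steps lets the left merge align lagged features
# onto the base rows; a copy is passed because the function mutates df_feature.
import pandas as pd

df_base = pd.DataFrame({'time': pd.date_range('2021-01-01 08:00', periods=4, freq='10min')})
df_feature = pd.DataFrame({'time': pd.date_range('2021-01-01 08:00', periods=4, freq='10min'),
                           'tti': [1.0, 1.2, 1.1, 1.3]})
print(merge_tti_feature(df_base, df_feature.copy(), n_shift=1))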
## Combine all names
names = {'ckinyaname': ckinyaname, 'ckinyaname_sx': ckinyaname_sx,
         'ckinyaname_cor': ckinyaname_cor, 'cothername': cothername,
         'pkinyaname': pkinyaname, 'pkinyaname_sx': pkinyaname_sx,
         'pkinyaname_cor': pkinyaname_cor, 'pothername': pothername}
names = pd.DataFrame(names)
data = pd.concat([data, names], axis=1)

# =============================================================================
# Estimate DOB
# =============================================================================
# age is recorded in months; pd.Timedelta no longer accepts the ambiguous 'M' unit,
# so use an average month length in days instead
months = pd.Series([pd.Timedelta(days=30.4375 * month) for month in data['age_mo']])
data = data.assign(dob_est=data['patientvisitdate'] - months)

# =============================================================================
# Centroids
# =============================================================================
coorDict = {sector.upper(): (centroids.iloc[i, 7], centroids.iloc[i, 8])
            for i, sector in enumerate(centroids.Name)}
coorDict.update({'': (np.NaN, np.NaN)})
coords = [coorDict[sector] for sector in data.sector_clean]
coords = pd.DataFrame(coords)
coords.columns = ['sectLat', 'sectLong']
def split_sessions(self, *, by_event=None, thresh, eos_event=None, session_col='session_id'): session_col_arg = session_col or 'session_id' index_col = self.retention_config['user_col'] event_col = self.retention_config['event_col'] time_col = self.retention_config['event_time_col'] res = self._obj.copy() if by_event is None: res[time_col] = pd.to_datetime(res[time_col]) if thresh is None: # add end_of_session event at the end of each string res.sort_values(by=time_col, inplace=True, ascending=False) res[hash('session')] = res.groupby(index_col).cumcount() res_session_ends = res[(res[hash('session')] == 0)].copy() res_session_ends[event_col] = eos_event res_session_ends[time_col] = res_session_ends[ time_col] + pd.Timedelta(seconds=1) res = pd.concat([res, res_session_ends]) res.sort_values(by=time_col, inplace=True) else: # split sessions by time thresh: # drop end_of_session events if already present: if eos_event is not None: res = res[res[event_col] != eos_event].copy() res.sort_values(by=time_col, inplace=True) shift_res = res.groupby(index_col).shift(-1) time_delta = pd.to_datetime( shift_res[time_col]) - pd.to_datetime(res[time_col]) time_delta = time_delta.dt.total_seconds() # get boolean mapper for end_of_session occurrences eos_mask = time_delta > thresh # add session column: res[hash('session')] = eos_mask res[hash('session')] = res.groupby(index_col)[hash( 'session')].cumsum() res[hash('session')] = res.groupby(index_col)[hash( 'session')].shift(1).fillna(0).map(int).map(str) # add end_of_session event if specified: if eos_event is not None: tmp = res.loc[eos_mask].copy() tmp[event_col] = eos_event tmp[time_col] += pd.Timedelta(seconds=1) res = pd.concat([res, tmp], ignore_index=True) res = res.sort_values(time_col).reset_index(drop=True) res[session_col_arg] = res[index_col].map(str) + '_' + res[ hash('session')] else: # split sessions by event: res[hash('session')] = res[event_col] == by_event res[hash('session')] = res.groupby(index_col)[hash( 'session')].cumsum().fillna(0).map(int).map(str) res[session_col_arg] = res[index_col].map(str) + '_' + res[hash( 'session')] res.drop(columns=[hash('session')], inplace=True) if session_col is None and session_col_arg in res.columns: res.drop(columns=[session_col_arg], inplace=True) return res
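# Self-contained sketch (toy event log with simplified column names, not the
# retentioneering accessor above) of the time-threshold branch of split_sessions:
# a gap larger than `thresh` seconds between consecutive events of the same user
# starts a new session.
import pandas as pd

events = pd.DataFrame({
    'user_id': [1, 1, 1, 2, 2],
    'event_timestamp': pd.to_datetime(['2021-01-01 10:00:00', '2021-01-01 10:00:30',
                                       '2021-01-01 11:00:00', '2021-01-01 09:00:00',
                                       '2021-01-01 09:10:00']),
})
thresh = 1800  # seconds
events = events.sort_values('event_timestamp')
gap = (events.groupby('user_id')['event_timestamp'].shift(-1)
       - events['event_timestamp']).dt.total_seconds()
# cumulative count of over-threshold gaps, shifted so the session number changes
# on the event *after* the gap
new_session = (gap > thresh).astype(int).groupby(events['user_id']).cumsum()
session_num = new_session.groupby(events['user_id']).shift(1).fillna(0).astype(int)
events['session_id'] = events['user_id'].astype(str) + '_' + session_num.astype(str)
print(events)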
def test_get_loc(self): idx = pd.date_range('2000-01-01', periods=3) for method in [None, 'pad', 'backfill', 'nearest']: self.assertEqual(idx.get_loc(idx[1], method), 1) self.assertEqual(idx.get_loc(idx[1].to_pydatetime(), method), 1) self.assertEqual(idx.get_loc(str(idx[1]), method), 1) if method is not None: self.assertEqual( idx.get_loc(idx[1], method, tolerance=pd.Timedelta('0 days')), 1) self.assertEqual(idx.get_loc('2000-01-01', method='nearest'), 0) self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest'), 1) self.assertEqual( idx.get_loc('2000-01-01T12', method='nearest', tolerance='1 day'), 1) self.assertEqual( idx.get_loc('2000-01-01T12', method='nearest', tolerance=pd.Timedelta('1D')), 1) self.assertEqual( idx.get_loc('2000-01-01T12', method='nearest', tolerance=np.timedelta64(1, 'D')), 1) self.assertEqual( idx.get_loc('2000-01-01T12', method='nearest', tolerance=timedelta(1)), 1) with tm.assertRaisesRegexp(ValueError, 'must be convertible'): idx.get_loc('2000-01-01T12', method='nearest', tolerance='foo') with tm.assertRaises(KeyError): idx.get_loc('2000-01-01T03', method='nearest', tolerance='2 hours') self.assertEqual(idx.get_loc('2000', method='nearest'), slice(0, 3)) self.assertEqual(idx.get_loc('2000-01', method='nearest'), slice(0, 3)) self.assertEqual(idx.get_loc('1999', method='nearest'), 0) self.assertEqual(idx.get_loc('2001', method='nearest'), 2) with tm.assertRaises(KeyError): idx.get_loc('1999', method='pad') with tm.assertRaises(KeyError): idx.get_loc('2001', method='backfill') with tm.assertRaises(KeyError): idx.get_loc('foobar') with tm.assertRaises(TypeError): idx.get_loc(slice(2)) idx = pd.to_datetime(['2000-01-01', '2000-01-04']) self.assertEqual(idx.get_loc('2000-01-02', method='nearest'), 0) self.assertEqual(idx.get_loc('2000-01-03', method='nearest'), 1) self.assertEqual(idx.get_loc('2000-01', method='nearest'), slice(0, 2)) # time indexing idx = pd.date_range('2000-01-01', periods=24, freq='H') tm.assert_numpy_array_equal(idx.get_loc(time(12)), [12]) tm.assert_numpy_array_equal(idx.get_loc(time(12, 30)), []) with tm.assertRaises(NotImplementedError): idx.get_loc(time(12, 30), method='pad')