def forecast_next(index_name,table_name, value_column, index_col, interface, averaging = 'last1', ahead = 1): """ Return the florcasted value in the past at the time range t1 to t2 for the value of column_name using index_name ---------- Parameters ---------- index_name: string name of the PINDEX used to query the prediction index_name: table_name name of the time series table in the database value_column: string name of column than contain time series value index_col: string name of column that contains time series index/timestamp interface: db_class object object used to communicate with the DB. see ../database/db_class for the abstract class averaging: string, optional, (default 'average') Coefficients used when forecasting, 'average' means use the average of all sub models coeffcients. ---------- Returns ---------- prediction array, shape [(t1 - t2 +1) ] forecasted value of the time series in the range [t1,t2] using index_name """ # get coefficients coeffs = np.array(interface.get_coeff(index_name + '_c_view', averaging)) no_coeff = len(coeffs) # get parameters end_index , agg_interval, start_ts = interface.query_table( index_name+'_meta',["last_TS_seen", 'agg_interval','start_time'])[0] agg_interval = float(agg_interval) if not isinstance(start_ts, (int, np.integer)): start_ts = pd.Timestamp(start_ts) end = index_ts_inv_mapper(start_ts, agg_interval, end_index) start = index_ts_inv_mapper(start_ts, agg_interval, end_index-no_coeff ) # the forecast should always start at the last point obs = interface.get_time_series( table_name, start, end, start_ts = start_ts, value_column=value_column, index_column= index_col, Desc=False, interval = agg_interval, aggregation_method = averaging) output = np.zeros(ahead+no_coeff) output[:no_coeff] = np.array(obs)[:,0] for i in range(0, ahead): output[i + no_coeff] = np.dot(coeffs.T, output[i:i + no_coeff]) return output[-ahead:]
def update_index(self):
    """
    Ingest newly arrived datapoints and update the prediction index.

    Queries the time-series table for all points between the model's
    current position (``self.ts_model.TimeSeriesIndex``) and the latest
    timestamp present in the table, feeds them to ``update_model``, and
    persists the updated model (``write_model(False)`` = update, not create).
    """
    # newest timestamp/index currently present in the source table
    end_point = get_bound_time(self.db_interface, self.time_series_table_name, self.time_column, 'max')
    # first not-yet-ingested point; TimeSeriesIndex counts across all
    # no_ts series, hence the integer division to get a per-series index
    start_point = index_ts_inv_mapper(self.start_time, self.agg_interval, self.ts_model.TimeSeriesIndex // self.no_ts)
    # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float is the documented equivalent (float64 dtype)
    new_entries = np.array(self._get_range(start_point, end_point), dtype=float)
    if len(new_entries) > 0:
        self.update_model(new_entries)
        self.write_model(False)
def create_index(self):
    """
    Build the prediction index from scratch.

    Pulls every datapoint from the model's current position up to the
    newest timestamp in the time-series table, trains the model on them,
    writes the index tables (create mode), and reinstalls the insert
    trigger that keeps the index fresh when auto_update is enabled.
    """
    # determine the ingestion range: [current model position, newest row]
    latest_point = get_bound_time(self.db_interface, self.time_series_table_name, self.time_column, 'max')
    first_point = index_ts_inv_mapper(self.start_time, self.agg_interval, self.ts_model.TimeSeriesIndex)

    # fetch and train on whatever new data exists
    fresh_entries = self._get_range(first_point, latest_point)
    if len(fresh_entries) > 0:
        self.update_model(fresh_entries)

    # persist the model; True -> create the index tables in the DB
    self.write_model(True)

    # recreate the trigger so future inserts into the source table refresh the index
    self.db_interface.drop_trigger(self.time_series_table_name, self.index_name)
    if self.auto_update:
        self.db_interface.create_insert_trigger(self.time_series_table_name, self.index_name)
def load_pindex_u(db_interface, index_name):
    """
    Load a prediction index (pindex) and its sub-models from the database.

    ----------
    Parameters
    ----------
    db_interface: db_class object
        object used to communicate with the DB. see ../database/db_class for the abstract class

    index_name: string
        name of the pindex to load (including schema prefix)

    ----------
    Returns
    ----------
    TSPI object with its mean (and, when k_var != 0, variance) models and
    the relevant tail of the time series loaded, or False when there are
    not enough new points to warrant an update.
    """
    t = time.time()
    meta_table = index_name + "_meta"
    # pull all model hyper-parameters and bookkeeping counters in one query
    meta_inf = db_interface.query_table(meta_table,
                                        columns_queried=['T', 'T0', 'k', 'gamma', 'var_direct_method', 'k_var',
                                                         'T_var', 'soft_thresholding', 'start_time',
                                                         'aggregation_method', 'agg_interval', 'persist_l',
                                                         'col_to_row_ratio', 'L', 'last_TS_fullSVD', 'last_TS_inc',
                                                         'last_TS_seen', 'p', 'time_series_table_name',
                                                         'indexed_column', 'time_column'])
    # last three columns (table/column names) are unpacked separately below
    T, T0, k, gamma, direct_var, k_var, T_var, SSVT, start_time, aggregation_method, agg_interval, persist_l, col_to_row_ratio, L, ReconIndex, MUpdateIndex, TimeSeriesIndex, p = meta_inf[0][:-3]
    L_m = db_interface.query_table(index_name + "_m", ['L'], 'modelno =0')[0][0]
    time_series_table_name, value_column, time_column = meta_inf[0][-3:]
    last = get_bound_time(db_interface, time_series_table_name, time_column, 'max')
    value_columns = value_column.split(',')
    # ------------------------------------------------------
    # temp fix: coerce DB-returned types to what the model expects
    gamma = float(gamma)
    if not isinstance(start_time, (int, np.integer)):
        start_time = pd.to_datetime(start_time)
    if not isinstance(last, (int, np.integer)):
        # BUG FIX: previously converted start_time here, which overwrote the
        # queried last timestamp and made last_index collapse to the start
        last = pd.to_datetime(last)
    agg_interval = float(agg_interval)
    # ------------------------------------------------------
    no_ts = len(value_columns)
    last_index = (index_ts_mapper(start_time, agg_interval, last) + 1)
    # skip the (expensive) load when fewer than 5 matrix-rows' worth of new points exist
    if last_index - MUpdateIndex // no_ts <= 5 * L_m:
        print(L, last_index, MUpdateIndex)
        print('nothing major to update')
        return False
    # p < 1 means observations are sub-sampled; do not forward-fill missing values
    if p < 1.0:
        fill_in_missing = False
    else:
        fill_in_missing = True
    TSPD = TSPI(interface=db_interface, index_name=index_name, schema=None, T=T, T0=T0, rank=k, gamma=gamma,
                direct_var=direct_var, rank_var=k_var, T_var=T_var, SSVT=SSVT, start_time=start_time,
                aggregation_method=aggregation_method, agg_interval=agg_interval,
                time_series_table_name=time_series_table_name, time_column=time_column,
                value_column=value_columns, persist_L=persist_l, col_to_row_ratio=col_to_row_ratio,
                fill_in_missing=fill_in_missing, p=p)
    # decide between a marginal update (only new points) and a big update
    # (reload the last T points) based on how much the last sub-model grew
    model_no = int(max((last_index * no_ts - 1) / (T / 2) - 1, 0))
    last_model_no = int(max((MUpdateIndex - 1) / (T / 2) - 1, 0))
    model_start = last_model_no * T / 2
    print(model_no, last_model_no, ReconIndex, model_start, last_index)
    new_points_ratio = (last_index * no_ts - ReconIndex) / (ReconIndex - model_start)
    print(new_points_ratio)
    if new_points_ratio < gamma and model_no <= last_model_no and (last_index * no_ts) % (T // 2) != 0:
        print('marginal update')
        start = (MUpdateIndex) // TSPD.no_ts
        end = (TimeSeriesIndex - 1) // TSPD.no_ts
    else:
        print('big update')
        start = max((TimeSeriesIndex - T) // TSPD.no_ts, 0)
        end = (TimeSeriesIndex - 1) // TSPD.no_ts
    # initiate mean model and restore its counters
    TSPD.ts_model = TSMM(TSPD.k, TSPD.T, TSPD.gamma, TSPD.T0, col_to_row_ratio=col_to_row_ratio,
                         model_table_name=index_name, SSVT=TSPD.SSVT, L=L, persist_L=TSPD.persist_L,
                         no_ts=TSPD.no_ts, fill_in_missing=fill_in_missing, p=p)
    TSPD.ts_model.ReconIndex, TSPD.ts_model.MUpdateIndex, TSPD.ts_model.TimeSeriesIndex = ReconIndex, MUpdateIndex, TimeSeriesIndex
    # load variance model metadata if any
    if TSPD.k_var != 0:
        col_to_row_ratio, L, ReconIndex, MUpdateIndex, TimeSeriesIndex = db_interface.query_table(
            meta_table,
            columns_queried=['col_to_row_ratio_var', 'L_var', 'last_TS_fullSVD_var', 'last_TS_inc_var',
                             'last_TS_seen_var'])[0]
        TSPD.var_model = TSMM(TSPD.k_var, TSPD.T_var, TSPD.gamma, TSPD.T0, col_to_row_ratio=col_to_row_ratio,
                              model_table_name=index_name + "_variance", SSVT=TSPD.SSVT, L=L,
                              persist_L=TSPD.persist_L, no_ts=TSPD.no_ts, fill_in_missing=fill_in_missing, p=p)
        TSPD.var_model.ReconIndex, TSPD.var_model.MUpdateIndex, TSPD.var_model.TimeSeriesIndex = ReconIndex, MUpdateIndex, TimeSeriesIndex
    print('loading meta_model time', time.time() - t)
    # LOADING SUB-MODELs Information
    TSPD._load_models_from_db(TSPD.ts_model)
    print('loading sub models time', time.time() - t)
    if end >= start:
        start_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, start)
        end_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, end)
        TSPD.ts_model.TimeSeries = TSPD._get_range(start_point, end_point)
        # ROBUSTNESS: keep this debug print inside the branch so start_point /
        # end_point are always defined when referenced
        print(start, end, start_point, end_point)
    print('loading time series time', time.time() - t)
    # query variance models table
    if TSPD.k_var != 0:
        TSPD._load_models_from_db(TSPD.var_model)
        # load last T points of variance time series (squared of observations if not direct_var)
        if TSPD.direct_var:
            end_var = (TSPD.var_model.TimeSeriesIndex - 1) // TSPD.no_ts
            start = max(start - 1, 0)
            # cap the window at the variance model's matrix length
            TT = min(end_var - start + 1, TSPD.var_model.T // TSPD.no_ts)
            if (end_var - start + 1) - TT > 0:
                start += (end_var - start + 1) - TT
            mean = np.zeros([TT, TSPD.no_ts])
            print(mean.shape, start, end_var, TSPD.var_model.T)
            start_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, start)
            end_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, end_var)
            print(start, end_var, start_point, end_point, TT)
            if end_var != start:
                # direct variance = observations minus the index's mean prediction
                for ts_n, value_column in enumerate(TSPD.value_column):
                    mean[:, ts_n] = get_prediction_range(index_name, TSPD.time_series_table_name, value_column,
                                                         db_interface, start_point, end_point, uq=False)
            # NOTE(review): when end_var == start, mean stays all-zeros, so this
            # assignment degenerates to a copy of the observations — confirm
            # against the original layout if that edge case matters
            TSPD.var_model.TimeSeries = TSPD.ts_model.TimeSeries[:len(mean), :] - mean
        else:
            # indirect variance model operates on squared observations
            TSPD.var_model.TimeSeries = (TSPD.ts_model.TimeSeries) ** 2
    print('loading time series variance time', time.time() - t)
    return TSPD
def write_model(self, create):
    """
    Write the pindex (models, metadata, and catalog rows) to the database.
    ----------
    Parameters
    ----------
    create: bool
        if True, create the index tables in the DB; else update them in place.
    """
    # remove schema name if exist (index_name is "schema.name")
    t = time.time()
    index_name = self.index_name.split('.')[1]
    # delete meta data if create, so stale catalog rows don't linger
    if create:
        delete_pindex(self.db_interface, index_name)
    # write mean and variance model tables
    self.write_tsmm_model(self.ts_model, create)
    self.write_tsmm_model(self.var_model, create)
    self.calculate_out_of_sample_error(self.ts_model)
    # if time is timestamp, convert to pd.Timestamp
    if not isinstance(self.start_time, (int, np.integer)):
        self.start_time = pd.to_datetime(self.start_time)
    # prepare meta data table: one row of all hyper-parameters and counters
    metadf = pd.DataFrame(
        data={'T': [self.ts_model.T], 'T0': [self.T0], 'gamma': [float(self.gamma)], 'k': [self.k],
              'L': [self.ts_model.L],
              'last_TS_seen': [self.ts_model.TimeSeriesIndex],
              'last_TS_inc': [self.ts_model.MUpdateIndex],
              'last_TS_fullSVD': [self.ts_model.ReconIndex],
              'time_series_table_name': [self.time_series_table_name],
              'indexed_column': [','.join(self.value_column)],
              'time_column': [self.time_column],
              'soft_thresholding': [self.SSVT],
              'no_submodels': [len(self.ts_model.models)],
              'no_submodels_var': [len(self.var_model.models)],
              'col_to_row_ratio': [self.ts_model.col_to_row_ratio],
              'col_to_row_ratio_var': [self.var_model.col_to_row_ratio],
              'T_var': [self.var_model.T], 'k_var': [self.k_var], 'L_var': [self.var_model.L],
              'last_TS_seen_var': [self.var_model.TimeSeriesIndex],
              'last_TS_inc_var': [self.var_model.MUpdateIndex],
              'aggregation_method': [self.aggregation_method],
              'agg_interval': [self.agg_interval],
              'start_time': [self.start_time],
              'last_TS_fullSVD_var': [self.var_model.ReconIndex],
              'var_direct_method': [self.direct_var],
              'persist_l': [self.persist_L],
              'p': [self.ts_model.p]})
    # ------------------------------------------------------
    # EDIT: Due to some incompatibiliy with PSQL timestamp types
    # Further investigate
    # ------------------------------------------------------
    if not isinstance(self.start_time, (int, np.integer)):
        #metadf['start_time'] = metadf['start_time'].astype(pd.Timestamp)
        metadf['start_time'] = metadf['start_time'].astype('datetime64[ns]')
    # last per-series index ingested (TimeSeriesIndex counts across all no_ts series)
    last_index = index_ts_inv_mapper(self.start_time, self.agg_interval, self.ts_model.TimeSeriesIndex//self.no_ts -1)
    if create:
        # create meta table
        self.db_interface.create_table(self.index_name + '_meta', metadf, include_index=False)
        # populate column pindices: one catalog row per indexed column
        for i,ts in enumerate(self.value_column):
            self.db_interface.insert('tspdb.pindices_columns', [index_name, ts], columns=['index_name', 'value_column'])
    else:
        # else update meta table, tspdb pindices: delete old rows before re-inserting
        self.db_interface.delete(self.index_name + '_meta', '')
        self.db_interface.insert(self.index_name + '_meta', metadf.iloc[0])
        self.db_interface.delete('tspdb.pindices', "index_name = '" + str(index_name) + "';")
        self.db_interface.delete('tspdb.pindices_stats', "index_name = '" + str(index_name) + "';")
    # UPDATE STAT TABLE: per-column imputation/forecast scores averaged over sub-models
    for i,ts in enumerate(self.value_column):
        forecast_tests_array = np.array([m.forecast_model_score_test[i] for m in self.ts_model.models.values()],'float')
        self.db_interface.insert('tspdb.pindices_stats',
                                 [index_name, ts, self.ts_model.TimeSeriesIndex//self.no_ts,
                                  len(self.ts_model.models),
                                  np.mean([ m.imputation_model_score[i] for m in self.ts_model.models.values() ]),
                                  np.mean([ m.forecast_model_score[i] for m in self.ts_model.models.values()]),
                                  np.nanmean(forecast_tests_array)],
                                 columns=['index_name', 'column_name','number_of_observations',
                                          'number_of_trained_models', 'imputation_score', 'forecast_score',
                                          'test_forecast_score'])
    # UPDATE PINDICES TABLE: integer-indexed and timestamp-indexed series go
    # into different column pairs of the catalog
    if isinstance(self.start_time, (int, np.integer)):
        self.db_interface.insert('tspdb.pindices',
                                 [index_name, self.time_series_table_name, self.time_column, self.uq,
                                  self.agg_interval, self.start_time, last_index],
                                 columns=['index_name', 'relation', 'time_column', 'uq', 'agg_interval',
                                          'initial_index', 'last_index'])
    else:
        self.db_interface.insert('tspdb.pindices',
                                 [index_name, self.time_series_table_name, self.time_column, self.uq,
                                  self.agg_interval, self.start_time, last_index],
                                 columns=['index_name', 'relation', 'time_column', 'uq', 'agg_interval',
                                          'initial_timestamp', 'last_timestamp'])
def _get_forecast_range(index_name,table_name, value_column, index_col, interface, t1, t2,MUpdateIndex,L,k,T,last_model, interval, start_ts, last_TS_seen,no_ts, value_index,direct_var = False,variance = False,averaging = 'average', projected = False,p = 1.0):
    """
    Return the forecasted values of value_column in the index range t1 to t2
    using the prediction index index_name.
    ----------
    Parameters
    ----------
    index_name: string
        name of the PINDEX used to query the prediction

    table_name: string
        name of the time series table in the database

    value_column: string
        name of column that contains the time series value

    index_col: string
        name of column that contains time series index/timestamp

    interface: db_class object
        object used to communicate with the DB. see ../database/db_class for the abstract class

    t1: (int or timestamp)
        index or timestamp indicating the start of the queried range

    t2: (int or timestamp)
        index or timestamp indicating the end of the queried range

    L: (int)
        Model parameter determining the number of rows in each matrix in a sub model.

    k: (int)
        Model parameter determining the number of retained singular values in each matrix in a sub model.

    T: (int)
        Model parameter determining the number of datapoints in each matrix in a sub model.

    last_model: (int)
        The index of the last sub model

    averaging: string, optional, (default 'average')
        Coefficients used when forecasting, 'average' means use the average of all sub models coefficients.
    ----------
    Returns
    ----------
    prediction array, shape [(t2 - t1 + 1)]
        forecasted value of the time series in the range [t1, t2] using index_name
    """
    ############### EDITS ##################
    #1- Replace last_ts with the last time stamp seen
    ########################################
    # get coefficients: the last no_ts entries of the coefficients view are
    # per-series intercepts; the rest are shared lag coefficients
    coeffs = np.array(interface.get_coeff(index_name + '_c_view', averaging))
    coeffs_ts = coeffs[-no_ts:]
    coeffs = coeffs[:-no_ts]
    no_coeff = len(coeffs)
    # mean forecast, or variance forecast computed indirectly (from squares)
    if not direct_var or not variance:
        if projected:
            # project observations onto the singular subspace of the (second
            # to) last sub-model before applying the coefficients
            if last_model != 0:
                q_model = last_model- 1
            else:
                q_model = last_model
            U = interface.get_U_row(index_name + '_u', [0, 2 * L], [q_model, q_model], k, return_modelno=False,return_weights_decom=True)[:-1,k:]
            no_coeff = U.shape[0]
            projection_matrix = np.dot(U,U.T)
        agg_interval = float(interval)
        if not isinstance(start_ts, (int, np.integer)):
            start_ts = pd.Timestamp(start_ts)
        # if the range queried is beyond what we have so far, get the last point seen
        last_TS_seen = get_bound_time(interface, table_name, index_col, 'max')
        if not isinstance(last_TS_seen, (int, np.integer)):
            last_TS_seen = index_ts_mapper(start_ts, agg_interval, last_TS_seen)
        last_TS_seen+=1
        print(t1,t2, last_TS_seen)
        # clamp the requested range to the observed data; points past
        # last_TS_seen are forecasted recursively below
        t1_ = min(t1, last_TS_seen)
        t2_ = min(t2, last_TS_seen)
        # fetch the no_coeff observations immediately preceding t1_
        end = index_ts_inv_mapper(start_ts, agg_interval, t1_ - 1 )
        start = index_ts_inv_mapper(start_ts, agg_interval, t1_ - no_coeff )
        print(start, end)
        obs = interface.get_time_series(table_name, start, end, start_ts = start_ts, value_column=value_column, index_column= index_col, Desc=False, interval = agg_interval, aggregation_method = averaging)
        output = np.zeros([t2 - t1_ + 1 ])
        obs = np.array(obs)[-no_coeff:,0]
        print(len(obs[:]), no_coeff)
        # Fill using fill_method: with sub-sampling (p<1) zero-fill and rescale
        # by 1/p; otherwise forward- then backward-fill missing observations
        if p <1:
            obs = np.array(pd.DataFrame(obs).fillna(value = 0).values[:,0])
            obs /= p
        else:
            obs = np.array(pd.DataFrame(obs).fillna(method = 'ffill').values[:,0])
            obs = np.array(pd.DataFrame(obs).fillna(method = 'bfill').values[:,0])
        if variance:
            # indirect variance: forecast on squared observations
            obs = obs **2
        # rolling buffer: seeded with observations, then extended with the
        # model's own forecasts once the observed data runs out
        observations = np.zeros([t2 - t1_ + 1 + no_coeff])
        observations[:no_coeff] = obs
        for i in range(0, t2 + 1 - t1_):
            if i < len(obs):
                if projected:
                    output[i] = np.dot(coeffs.T, np.dot(projection_matrix, observations[i:i + no_coeff]))+coeffs_ts[value_index]
                else:
                    output[i] = np.dot(coeffs.T, observations[i:i + no_coeff])+coeffs_ts[value_index]
            else:
                # past the observed range: plain (unprojected) recursion
                output[i] = np.dot(coeffs.T, observations[i:i + no_coeff])+coeffs_ts[value_index]
            if i+no_coeff >= len(obs):
                # feed the forecast back into the buffer for subsequent steps
                observations[i+no_coeff] = output[i]
        return output[-(t2 - t1 + 1):]
    # direct variance model: the forecast should always start at the last point
    # incorporated into the variance model (MUpdateIndex)
    t1_ = MUpdateIndex//no_ts
    output = np.zeros([t2 - t1_ + 1 + no_coeff])
    # seed with imputed values from the index itself, then roll forward
    output[:no_coeff] = _get_imputation_range(index_name, table_name, value_column, index_col, interface, t1_ - no_coeff, t1_ - 1, L,k,T,last_model,value_index, no_ts)
    for i in range(0, t2 + 1 - t1_):
        output[i + no_coeff] = np.dot(coeffs.T, output[i:i + no_coeff])+coeffs_ts[value_index]
    return output[-(t2 - t1 + 1):]