def load_pindex_u(db_interface, index_name):
    """Load a prediction index (pindex) and its models from the database.

    Reads the ``<index_name>_meta`` table, rebuilds the TSPI object together
    with its mean model (and variance model when ``k_var != 0``), loads the
    sub-models and the tail of the raw time series needed for the next update.

    Parameters
    ----------
    db_interface : db_class object
        Interface used to communicate with the DB.
    index_name : str
        Name of the pindex whose state is being loaded.

    Returns
    -------
    TSPI object when there is enough new data to warrant an update,
    ``False`` when fewer than ``5 * L_m`` new points have arrived.
    """
    t = time.time()
    meta_table = index_name + "_meta"
    meta_inf = db_interface.query_table(meta_table,
                                        columns_queried=['T', 'T0', 'k', 'gamma', 'var_direct_method', 'k_var',
                                                         'T_var', 'soft_thresholding', 'start_time',
                                                         'aggregation_method', 'agg_interval', 'persist_l',
                                                         'col_to_row_ratio', 'L', 'last_TS_fullSVD', 'last_TS_inc',
                                                         'last_TS_seen', 'p', 'time_series_table_name',
                                                         'indexed_column', 'time_column'])
    # Last three queried columns are table/column names; the rest are model parameters.
    T, T0, k, gamma, direct_var, k_var, T_var, SSVT, start_time, aggregation_method, agg_interval, persist_l, col_to_row_ratio, L, ReconIndex, MUpdateIndex, TimeSeriesIndex, p = meta_inf[0][:-3]
    L_m = db_interface.query_table(index_name + "_m", ['L'], 'modelno =0')[0][0]
    time_series_table_name, value_column, time_column = meta_inf[0][-3:]
    last = get_bound_time(db_interface, time_series_table_name, time_column, 'max')
    value_columns = value_column.split(',')
    # ------------------------------------------------------
    # temp fix: coerce DB-returned values to the expected Python types
    gamma = float(gamma)
    if not isinstance(start_time, (int, np.integer)):
        start_time = pd.to_datetime(start_time)
    if not isinstance(last, (int, np.integer)):
        # BUG FIX: was pd.to_datetime(start_time), which clobbered the latest
        # bound time with the series start and forced the early-exit below.
        last = pd.to_datetime(last)
    agg_interval = float(agg_interval)
    # ------------------------------------------------------
    no_ts = len(value_columns)
    last_index = (index_ts_mapper(start_time, agg_interval, last) + 1)
    # Not enough new points to justify reloading/updating the models.
    if last_index - MUpdateIndex // no_ts <= 5 * L_m:
        print(L, last_index, MUpdateIndex)
        print('nothing major to update')
        return False
    # p < 1 means some observations are missing; do not forward-fill them.
    if p < 1.0:
        fill_in_missing = False
    else:
        fill_in_missing = True
    TSPD = TSPI(interface=db_interface, index_name=index_name, schema=None, T=T, T0=T0, rank=k, gamma=gamma,
                direct_var=direct_var, rank_var=k_var, T_var=T_var, SSVT=SSVT, start_time=start_time,
                aggregation_method=aggregation_method, agg_interval=agg_interval,
                time_series_table_name=time_series_table_name, time_column=time_column,
                value_column=value_columns, persist_L=persist_l, col_to_row_ratio=col_to_row_ratio,
                fill_in_missing=fill_in_missing, p=p)
    # Decide between a marginal update (append to current sub-model) and a big
    # update (reload the last T points) based on how many new points arrived.
    model_no = int(max((last_index * no_ts - 1) / (T / 2) - 1, 0))
    last_model_no = int(max((MUpdateIndex - 1) / (T / 2) - 1, 0))
    model_start = last_model_no * T / 2
    print(model_no, last_model_no, ReconIndex, model_start, last_index)
    new_points_ratio = (last_index * no_ts - ReconIndex) / (ReconIndex - model_start)
    print(new_points_ratio)
    if new_points_ratio < gamma and model_no <= last_model_no and (last_index * no_ts) % (T // 2) != 0:
        print('marginal update')
        start = (MUpdateIndex) // TSPD.no_ts
        end = (TimeSeriesIndex - 1) // TSPD.no_ts
    else:
        print('big update')
        start = max((TimeSeriesIndex - T) // TSPD.no_ts, 0)
        end = (TimeSeriesIndex - 1) // TSPD.no_ts
    # initiate TSPI object: rebuild the mean model and restore its counters
    TSPD.ts_model = TSMM(TSPD.k, TSPD.T, TSPD.gamma, TSPD.T0, col_to_row_ratio=col_to_row_ratio,
                         model_table_name=index_name, SSVT=TSPD.SSVT, L=L, persist_L=TSPD.persist_L,
                         no_ts=TSPD.no_ts, fill_in_missing=fill_in_missing, p=p)
    TSPD.ts_model.ReconIndex, TSPD.ts_model.MUpdateIndex, TSPD.ts_model.TimeSeriesIndex = ReconIndex, MUpdateIndex, TimeSeriesIndex
    # load variance models if any
    if TSPD.k_var != 0:
        col_to_row_ratio, L, ReconIndex, MUpdateIndex, TimeSeriesIndex = db_interface.query_table(
            meta_table,
            columns_queried=['col_to_row_ratio_var', 'L_var', 'last_TS_fullSVD_var', 'last_TS_inc_var',
                             'last_TS_seen_var'])[0]
        TSPD.var_model = TSMM(TSPD.k_var, TSPD.T_var, TSPD.gamma, TSPD.T0, col_to_row_ratio=col_to_row_ratio,
                              model_table_name=index_name + "_variance", SSVT=TSPD.SSVT, L=L,
                              persist_L=TSPD.persist_L, no_ts=TSPD.no_ts, fill_in_missing=fill_in_missing, p=p)
        TSPD.var_model.ReconIndex, TSPD.var_model.MUpdateIndex, TSPD.var_model.TimeSeriesIndex = ReconIndex, MUpdateIndex, TimeSeriesIndex
    print('loading meta_model time', time.time() - t)
    # LOADING SUB-MODELs Information
    TSPD._load_models_from_db(TSPD.ts_model)
    print('loading sub models time', time.time() - t)
    if end >= start:
        start_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, start)
        end_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, end)
        TSPD.ts_model.TimeSeries = TSPD._get_range(start_point, end_point)
        # Prints moved inside the branch: start_point/end_point are only bound here.
        print('loading time series time', time.time() - t)
        print(start, end, start_point, end_point)
    # query variance models table
    if TSPD.k_var != 0:
        TSPD._load_models_from_db(TSPD.var_model)
        # load last T points of variance time series (squared of observations if not direct_var)
        if TSPD.direct_var:
            end_var = (TSPD.var_model.TimeSeriesIndex - 1) // TSPD.no_ts
            start = max(start - 1, 0)
            # TT = number of variance points to load, capped by the model window.
            TT = min(end_var - start + 1, TSPD.var_model.T // TSPD.no_ts)
            if (end_var - start + 1) - TT > 0:
                start += (end_var - start + 1) - TT
            mean = np.zeros([TT, TSPD.no_ts])
            print(mean.shape, start, end_var, TSPD.var_model.T)
            start_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, start)
            end_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, end_var)
            print(start, end_var, start_point, end_point, TT)
            if end_var != start:
                for ts_n, value_column in enumerate(TSPD.value_column):
                    mean[:, ts_n] = get_prediction_range(index_name, TSPD.time_series_table_name, value_column,
                                                         db_interface, start_point, end_point, uq=False)
            # Variance series = residuals of the observations around the predicted mean.
            TSPD.var_model.TimeSeries = TSPD.ts_model.TimeSeries[:len(mean), :] - mean
        else:
            # Second-moment model: store squared observations.
            TSPD.var_model.TimeSeries = (TSPD.ts_model.TimeSeries) ** 2
    print('loading time series variance time', time.time() - t)
    return TSPD
def _get_forecast_range(index_name, table_name, value_column, index_col, interface, t1, t2, MUpdateIndex, L, k, T,
                        last_model, interval, start_ts, last_TS_seen, no_ts, value_index, direct_var=False,
                        variance=False, averaging='average', projected=False, p=1.0):
    """
    Return the forecasted values in the time range t1 to t2 for value_column
    using the pindex index_name, by rolling the learned AR coefficients forward.
    ----------
    Parameters
    ----------
    index_name: string
        name of the PINDEX used to query the prediction
    table_name: string
        name of the time series table in the database
    value_column: string
        name of column that contains the time series value
    index_col: string
        name of column that contains the time series index/timestamp
    interface: db_class object
        object used to communicate with the DB. see ../database/db_class for the abstract class
    t1: (int or timestamp)
        index or timestamp indicating the start of the queried range
    t2: (int or timestamp)
        index or timestamp indicating the end of the queried range
    MUpdateIndex: (int)
        index of the last point included in the model
    L: (int)
        model parameter determining the number of rows in each matrix in a sub model
    k: (int)
        model parameter determining the number of retained singular values in each matrix in a sub model
    T: (int)
        model parameter determining the number of datapoints in each matrix in a sub model
    last_model: (int)
        the index of the last sub model
    interval: (float)
        aggregation interval of the index
    start_ts: (int or timestamp)
        start time of the indexed series
    last_TS_seen: (int)
        recomputed below from the table; the passed value is overwritten
    no_ts: (int)
        number of time series indexed together
    value_index: (int)
        position of value_column among the indexed columns (selects the bias coefficient)
    direct_var: bool, optional (default False)
        whether the variance model was fit directly on residuals
    variance: bool, optional (default False)
        if True, forecast the second-moment/variance series (observations squared)
    averaging: string, optional (default 'average')
        coefficients used when forecasting; 'average' means use the average of
        all sub models' coefficients
    projected: bool, optional (default False)
        if True, project observations onto the retained singular subspace before
        applying the coefficients
    p: float, optional (default 1.0)
        fraction of observed entries; p < 1 triggers zero-fill + rescaling
    ----------
    Returns
    ----------
    prediction array, shape [(t2 - t1 + 1)]
        forecasted values of the time series in the range [t1, t2] using index_name
    """
    ############### EDITS ##################
    #1- Replace last_ts with the last time stamp seen
    ########################################
    # get coefficients: last no_ts entries are per-series bias terms
    coeffs = np.array(interface.get_coeff(index_name + '_c_view', averaging))
    coeffs_ts = coeffs[-no_ts:]
    coeffs = coeffs[:-no_ts]
    no_coeff = len(coeffs)
    if not direct_var or not variance:
        if projected:
            # use the previous (complete) sub model's singular vectors when possible
            if last_model != 0:
                q_model = last_model - 1
            else:
                q_model = last_model
            U = interface.get_U_row(index_name + '_u', [0, 2 * L], [q_model, q_model], k, return_modelno=False,
                                    return_weights_decom=True)[:-1, k:]
            no_coeff = U.shape[0]
            projection_matrix = np.dot(U, U.T)
        agg_interval = float(interval)
        if not isinstance(start_ts, (int, np.integer)):
            start_ts = pd.Timestamp(start_ts)
        # if the range queried is beyond what we have so far, get the last point seen
        last_TS_seen = get_bound_time(interface, table_name, index_col, 'max')
        if not isinstance(last_TS_seen, (int, np.integer)):
            last_TS_seen = index_ts_mapper(start_ts, agg_interval, last_TS_seen)
        last_TS_seen += 1
        print(t1, t2, last_TS_seen)
        # clamp the queried range to the data actually seen
        t1_ = min(t1, last_TS_seen)
        t2_ = min(t2, last_TS_seen)  # NOTE(review): t2_ is computed but never used — confirm intent
        end = index_ts_inv_mapper(start_ts, agg_interval, t1_ - 1)
        start = index_ts_inv_mapper(start_ts, agg_interval, t1_ - no_coeff)
        print(start, end)
        # seed the forecast with the last no_coeff observed points
        obs = interface.get_time_series(table_name, start, end, start_ts=start_ts, value_column=value_column,
                                        index_column=index_col, Desc=False, interval=agg_interval,
                                        aggregation_method=averaging)
        output = np.zeros([t2 - t1_ + 1])
        obs = np.array(obs)[-no_coeff:, 0]
        print(len(obs[:]), no_coeff)
        # Fill using fill_method: zero-fill + rescale by p when entries are missing,
        # otherwise forward/backward fill
        if p < 1:
            obs = np.array(pd.DataFrame(obs).fillna(value=0).values[:, 0])
            obs /= p
        else:
            obs = np.array(pd.DataFrame(obs).fillna(method='ffill').values[:, 0])
            obs = np.array(pd.DataFrame(obs).fillna(method='bfill').values[:, 0])
        if variance:
            # second-moment model consumes squared observations
            obs = obs ** 2
        # rolling buffer: first no_coeff entries are observations, later entries
        # are filled with the model's own forecasts (autoregressive roll-forward)
        observations = np.zeros([t2 - t1_ + 1 + no_coeff])
        observations[:no_coeff] = obs
        for i in range(0, t2 + 1 - t1_):
            if i < len(obs):
                if projected:
                    output[i] = np.dot(coeffs.T, np.dot(projection_matrix, observations[i:i + no_coeff])) + coeffs_ts[value_index]
                else:
                    output[i] = np.dot(coeffs.T, observations[i:i + no_coeff]) + coeffs_ts[value_index]
            else:
                output[i] = np.dot(coeffs.T, observations[i:i + no_coeff]) + coeffs_ts[value_index]
            # once past the observed data, feed forecasts back into the buffer
            if i + no_coeff >= len(obs):
                observations[i + no_coeff] = output[i]
        return output[-(t2 - t1 + 1):]
    # direct variance model: the forecast should always start at the last point
    t1_ = MUpdateIndex // no_ts
    output = np.zeros([t2 - t1_ + 1 + no_coeff])
    # seed with imputed values from the model instead of raw observations
    output[:no_coeff] = _get_imputation_range(index_name, table_name, value_column, index_col, interface,
                                              t1_ - no_coeff, t1_ - 1, L, k, T, last_model, value_index, no_ts)
    for i in range(0, t2 + 1 - t1_):
        output[i + no_coeff] = np.dot(coeffs.T, output[i:i + no_coeff]) + coeffs_ts[value_index]
    return output[-(t2 - t1 + 1):]
def get_prediction_range(index_name, table_name, value_column, interface, t1, t2, uq=True, uq_method='Gaussian',
                         c=95., projected=False):
    """
    Return an array of N (N = t2-t1+1) predicted values, along with the
    confidence interval when uq is True, for value_column at times t1 to t2
    using index_name, by calling the forecast range and/or impute range
    functions depending on where [t1, t2] falls relative to the model.
    ----------
    Parameters
    ----------
    index_name: string
        name of the PINDEX used to query the prediction
    table_name: string
        name of the time series table in the database
    value_column: string
        name of column that contains the time series value
    interface: db_class object
        object used to communicate with the DB. see ../database/db_class for the abstract class
    t1: (int or timestamp)
        index or timestamp indicating the start of the queried range
    t2: (int or timestamp)
        index or timestamp indicating the end of the queried range
    uq: boolean optional (default=True)
        if True, return the deviation bound of the c% confidence interval
    uq_method: string optional (default='Gaussian')
        options: {'Gaussian', 'Chebyshev'}
        uncertainty quantification method used to estimate the confidence interval
    c: float optional (default 95.)
        confidence level for uncertainty quantification, 0 < c < 100
    projected: bool optional (default False)
        passed through to the forecast routine
    ----------
    Returns
    ----------
    prediction array, shape [(t2 - t1 + 1)]
        values of the predicted points of the time series in the time interval t1 to t2
    deviation array, shape [(t2 - t1 + 1)] (only when uq is True)
        the deviation from the mean achieving the desired confidence level
    """
    # query pindex parameters
    T, T_var, L, k, k_var, L_var, last_model, MUpdateIndex, var_direct, interval, start_ts, last_TS_seen, last_TS_seen_var, index_col, value_columns, MUpdateIndex_var, p = interface.query_table(
        index_name + '_meta', ['T', 'T_var', 'L', 'k', 'k_var', 'L_var', 'no_submodels', 'last_TS_inc',
                               'var_direct_method', 'agg_interval', 'start_time', "last_TS_seen",
                               "last_TS_seen_var", "time_column", "indexed_column", 'last_TS_inc_var', 'p'])[0]
    # no_submodels is a count; convert to the index of the last sub model
    last_model -= 1
    value_columns = value_columns.split(',')
    no_ts = len(value_columns)
    try:
        value_index = value_columns.index(value_column)
    except:
        raise Exception('The value column %s selected is not indexed by the chosen pindex' % (value_column))
    # map timestamps to integer indices relative to start_ts
    if not isinstance(t1, (int, np.integer)):
        t1 = pd.to_datetime(t1)
        t2 = pd.to_datetime(t2)
        start_ts = pd.to_datetime(start_ts)
    interval = float(interval)
    t1 = index_ts_mapper(start_ts, interval, t1)
    t2 = index_ts_mapper(start_ts, interval, t2)
    # model not fit yet: fall back to the series mean with zero deviation
    if MUpdateIndex == 0:
        last_TS_seen = get_bound_time(interface, table_name, index_col, 'max')
        obs = interface.get_time_series(table_name, start_ts, last_TS_seen, start_ts=start_ts,
                                        value_column=value_column, index_column=index_col, Desc=False,
                                        interval=interval, aggregation_method='average')
        if uq:
            return np.mean(obs) * np.ones(t2 - t1 + 1), np.zeros(t2 - t1 + 1)
        else:
            return np.mean(obs) * np.ones(t2 - t1 + 1)
    # check uq variables
    if uq:
        if c < 0 or c >= 100:
            raise Exception('confidence interval c must be in the range (0,100): 0 <=c< 100')
        if uq_method == 'Chebyshev':
            alpha = 1. / (np.sqrt(1 - c / 100))
        elif uq_method == 'Gaussian':
            alpha = norm.ppf(1 / 2 + c / 200)
        else:
            raise Exception('uq_method option is not recognized, available options are: "Gaussian" or "Chebyshev"')
    # if all points are in the future, use _get_forecast_range
    if t1 > (MUpdateIndex - 1) // no_ts:
        print('forecasting')
        if not uq:
            return _get_forecast_range(index_name, table_name, value_column, index_col, interface, t1, t2,
                                       MUpdateIndex, L, k, T, last_model, interval, start_ts, last_TS_seen, no_ts,
                                       value_index, projected=projected, p=p)
        else:
            prediction = _get_forecast_range(index_name, table_name, value_column, index_col, interface, t1, t2,
                                             MUpdateIndex, L, k, T, last_model, interval, start_ts, last_TS_seen,
                                             no_ts, value_index, projected=projected, p=p)
            # NOTE(review): passes L (not L_var) for the variance model here;
            # the sibling get_prediction uses L_var — confirm which is intended.
            var = _get_forecast_range(index_name + '_variance', table_name, value_column, index_col, interface, t1,
                                      t2, MUpdateIndex_var, L, k_var, T_var, last_model, interval, start_ts,
                                      last_TS_seen_var, no_ts, value_index, variance=True, direct_var=var_direct,
                                      projected=projected, p=p)
            # if the second model is used for the second moment, subtract the squared mean to estimate the variance
            if not var_direct:
                var = var - (prediction) ** 2
            # clip negative variance estimates to zero
            var *= (var > 0)
            return prediction, alpha * np.sqrt(var)
    # if all points are in the past, use get_imputation_range
    elif t2 <= (MUpdateIndex - 1) // no_ts:
        if not uq:
            return _get_imputation_range(index_name, table_name, value_column, index_col, interface, t1, t2, L, k,
                                         T, last_model, value_index, no_ts, p=p)
        else:
            prediction = _get_imputation_range(index_name, table_name, value_column, index_col, interface, t1, t2,
                                               L, k, T, last_model, value_index, no_ts, p=p)
            # the variance model may lag behind the mean model; forecast its tail if needed
            if (MUpdateIndex_var - 1) // no_ts >= t2:
                var = _get_imputation_range(index_name + '_variance', table_name, value_column, index_col,
                                            interface, t1, t2, L_var, k_var, T_var, last_model, value_index,
                                            no_ts, p=p)
            else:
                imputations_var = _get_imputation_range(index_name + '_variance', table_name, value_column,
                                                        index_col, interface, t1, (MUpdateIndex_var - 1) // no_ts,
                                                        L_var, k_var, T_var, last_model, value_index, no_ts, p=p)
                forecast_var = _get_forecast_range(index_name + '_variance', table_name, value_column, index_col,
                                                   interface, MUpdateIndex_var // no_ts, t2, MUpdateIndex_var,
                                                   L_var, k_var, T_var, last_model, interval, start_ts,
                                                   last_TS_seen, no_ts, value_index, variance=True,
                                                   direct_var=var_direct, projected=projected, p=p)
                var = np.array(list(imputations_var) + list(forecast_var))
            # if the second model is used for the second moment, subtract the squared mean to estimate the variance
            if not var_direct:
                var = var - (prediction) ** 2
            var *= (var > 0)
            return prediction, alpha * np.sqrt(var)
    # if points are in both the future and in the past, use both
    else:
        imputations = _get_imputation_range(index_name, table_name, value_column, index_col, interface, t1,
                                            (MUpdateIndex - 1) // no_ts, L, k, T, last_model, value_index, no_ts,
                                            p=p)
        forecast = _get_forecast_range(index_name, table_name, value_column, index_col, interface,
                                       (MUpdateIndex) // no_ts, t2, MUpdateIndex, L, k, T, last_model, interval,
                                       start_ts, last_TS_seen, no_ts, value_index, projected=projected, p=p)
        if not uq:
            return list(imputations) + list(forecast)
        else:
            imputations_var = _get_imputation_range(index_name + '_variance', table_name, value_column, index_col,
                                                    interface, t1, (MUpdateIndex_var - 1) // no_ts, L_var, k_var,
                                                    T_var, last_model, value_index, no_ts, p=p)
            forecast_var = _get_forecast_range(index_name + '_variance', table_name, value_column, index_col,
                                               interface, MUpdateIndex_var // no_ts, t2, MUpdateIndex_var, L_var,
                                               k_var, T_var, last_model, interval, start_ts, last_TS_seen, no_ts,
                                               value_index, variance=True, direct_var=var_direct,
                                               projected=projected, p=p)
            if not var_direct:
                forecast_var = forecast_var - (forecast) ** 2
                imputations_var = imputations_var - (imputations) ** 2
            imputations_var *= (imputations_var > 0)
            forecast_var *= (forecast_var > 0)
            return np.array(list(imputations) + list(forecast)), np.array(
                list(alpha * np.sqrt(imputations_var)) + list(alpha * np.sqrt(forecast_var)))
def get_prediction(index_name, table_name, value_column, interface, t, uq=True, uq_method='Gaussian', c=95,
                   projected=False):
    """
    Return the predicted value, along with the confidence interval when uq is
    True, for value_column at time t using index_name, by calling either the
    forecast or the imputation function depending on whether t is past the
    model's last updated point.
    ----------
    Parameters
    ----------
    index_name: string
        name of the PINDEX used to query the prediction
    table_name: string
        name of the time series table in the database
    value_column: string
        name of column that contains the time series value
    interface: db_class object
        object used to communicate with the DB. see ../database/db_class for the abstract class
    t: (int or timestamp)
        index or timestamp indicating the queried time
    uq: boolean optional (default=True)
        if True, return the deviation bound of the c% confidence interval
    uq_method: string optional (default='Gaussian')
        options: {'Gaussian', 'Chebyshev'}
        uncertainty quantification method used to estimate the confidence interval
    c: float optional (default 95)
        confidence level for uncertainty quantification, 0 < c < 100
    projected: bool optional (default False)
        passed through to the forecast routine
    ----------
    Returns
    ----------
    prediction float
        value of the time series at time t
    deviation float (only when uq is True)
        the deviation from the mean achieving the desired confidence level
    """
    # query pindex parameters
    T, T_var, L, k, k_var, L_var, last_model, MUpdateIndex, var_direct, interval, start_ts, last_TS_seen, last_TS_seen_var, index_col, value_columns, MUpdateIndex_var, p = interface.query_table(
        index_name + '_meta', ['T', 'T_var', 'L', 'k', 'k_var', 'L_var', 'no_submodels', 'last_TS_inc',
                               'var_direct_method', 'agg_interval', 'start_time', "last_TS_seen",
                               "last_TS_seen_var", "time_column", "indexed_column", 'last_TS_inc_var', 'p'])[0]
    ############ Fix queried values ####################
    # no_submodels is a count; convert to the index of the last sub model
    last_model -= 1
    value_columns = value_columns.split(',')
    no_ts = len(value_columns)
    if not isinstance(t, (int, np.integer)):
        t = pd.to_datetime(t)
        start_ts = pd.to_datetime(start_ts)
    interval = float(interval)
    ###################################################
    # Check 1: value colmn is indexed
    try:
        value_index = value_columns.index(value_column)
    except:
        raise Exception('The value column selected is not indexed by the chosen pindex')
    # if the model is not fit, return the average
    if MUpdateIndex == 0:
        last_TS_seen = get_bound_time(interface, table_name, index_col, 'max')
        obs = interface.get_time_series(table_name, start_ts, last_TS_seen, start_ts=start_ts,
                                        value_column=value_column, index_column=index_col, Desc=False,
                                        interval=interval, aggregation_method='average')
        if uq:
            return np.mean(obs), 0
        else:
            return np.mean(obs)
    # map the timestamp to an integer index relative to start_ts
    t = index_ts_mapper(start_ts, interval, t)
    if uq:
        if uq_method == 'Chebyshev':
            alpha = 1. / (np.sqrt(1 - c / 100))
        elif uq_method == 'Gaussian':
            alpha = norm.ppf(1 / 2 + c / 200)
        else:
            raise Exception('uq_method option is not recognized, available options are: "Gaussian" or "Chebyshev"')
    # t beyond the last modeled point: forecast; otherwise impute
    if t > (MUpdateIndex - 1) // no_ts:
        if not uq:
            return _get_forecast_range(index_name, table_name, value_column, index_col, interface, t, t,
                                       MUpdateIndex, L, k, T, last_model, interval, start_ts, last_TS_seen, no_ts,
                                       value_index, projected=projected, p=p)[-1]
        else:
            prediction = _get_forecast_range(index_name, table_name, value_column, index_col, interface, t, t,
                                             MUpdateIndex, L, k, T, last_model, interval, start_ts, last_TS_seen,
                                             no_ts, value_index, projected=projected, p=p)[-1]
            var = _get_forecast_range(index_name + '_variance', table_name, value_column, index_col, interface, t,
                                      t, MUpdateIndex_var, L_var, k_var, T_var, last_model, interval, start_ts,
                                      last_TS_seen_var, no_ts, value_index, projected=projected, variance=True,
                                      direct_var=var_direct, p=p)[-1]
            # if the second model is used for the second moment, subtract the squared mean
            if not var_direct:
                var = var - (prediction) ** 2
            # clip negative variance estimates to zero
            var *= (var > 0)
            return prediction, alpha * np.sqrt(var)
    else:
        if not uq:
            return _get_imputation(index_name, table_name, value_column, index_col, interface, t, L, k, T,
                                   last_model, no_ts, value_index, p=p)
        else:
            prediction = _get_imputation(index_name, table_name, value_column, index_col, interface, t, L, k, T,
                                         last_model, no_ts, value_index, p=p)
            # the variance model may lag behind the mean model; forecast it if needed
            if t > (MUpdateIndex_var - 1) // no_ts:
                var = _get_forecast_range(index_name + '_variance', table_name, value_column, index_col, interface,
                                          t, t, MUpdateIndex_var, L_var, k_var, T_var, last_model, interval,
                                          start_ts, last_TS_seen_var, no_ts, value_index, projected=projected,
                                          variance=True, direct_var=var_direct, p=p)[-1]
            else:
                var = _get_imputation(index_name + '_variance', table_name, value_column, index_col, interface, t,
                                      L_var, k_var, T_var, last_model, no_ts, value_index, p=p)
            if not var_direct:
                var = var - (prediction) ** 2
            var *= (var > 0)
            return prediction, alpha * np.sqrt(var)