Example #1
def forecast_next(index_name, table_name, value_column, index_col, interface, averaging='last1', ahead=1):
    """
    Return the next `ahead` forecasted values of value_column using the prediction index index_name
    ----------
    Parameters
    ----------
    index_name: string
        name of the PINDEX used to query the prediction

    table_name: string
        name of the time series table in the database

    value_column: string
        name of the column that contains the time series values

    index_col: string
        name of the column that contains the time series index/timestamp

    interface: db_class object
        object used to communicate with the DB. see ../database/db_class for the abstract class

    averaging: string, optional, (default 'last1')
        determines which sub-model coefficients are used when forecasting; 'average' means use the average of all sub-model coefficients.

    ahead: int, optional, (default 1)
        number of future points to forecast
    ----------
    Returns
    ----------
    prediction: array, shape [ahead]
        the next `ahead` forecasted values of the time series, starting after the last point seen by index_name
    """
    # get coefficients
    coeffs = np.array(interface.get_coeff(index_name + '_c_view', averaging))
    no_coeff = len(coeffs)
    # get parameters
    end_index , agg_interval, start_ts = interface.query_table( index_name+'_meta',["last_TS_seen", 'agg_interval','start_time'])[0]
    agg_interval = float(agg_interval)
    
    if not isinstance(start_ts, (int, np.integer)):
        start_ts = pd.Timestamp(start_ts)
     
    end = index_ts_inv_mapper(start_ts, agg_interval, end_index)
    start = index_ts_inv_mapper(start_ts, agg_interval, end_index-no_coeff )
    # the forecast should always start at the last point
    obs = interface.get_time_series( table_name, start, end, start_ts = start_ts,  value_column=value_column, index_column= index_col, Desc=False, interval = agg_interval, aggregation_method = averaging)
    # recursive roll-out: seed the buffer with the last no_coeff observations, then each
    # forecasted point is the dot product of the coefficients with the trailing window
    output = np.zeros(ahead + no_coeff)
    output[:no_coeff] = np.array(obs)[:, 0]
    for i in range(0, ahead):
        output[i + no_coeff] = np.dot(coeffs.T, output[i:i + no_coeff])
    return output[-ahead:]
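# The loop above is a plain linear roll-out: each forecasted point is the dot product of the
# learned coefficients with the trailing window of length no_coeff, and the window then slides
# forward to include the forecast itself. A self-contained sketch of the same recursion with
# made-up coefficients and history (the helper name and numbers are ours, purely for
# illustration; no database or PINDEX is required):
import numpy as np

def roll_forward(history, coeffs, ahead):
    # history: the last len(coeffs) observed values; coeffs: learned forecast coefficients
    buf = np.zeros(ahead + len(coeffs))
    buf[:len(coeffs)] = history
    for i in range(ahead):
        # each new point is a linear combination of the previous len(coeffs) points
        buf[i + len(coeffs)] = np.dot(coeffs, buf[i:i + len(coeffs)])
    return buf[-ahead:]

print(roll_forward([1.0, 1.1, 1.2], np.array([0.2, 0.3, 0.5]), ahead=2))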
Example #2
    def update_index(self):
        """
        Query the data points added to the database since the last seen index
        (self.ts_model.TimeSeriesIndex) and fold them into the model via update_model.
        """
        end_point = get_bound_time(self.db_interface, self.time_series_table_name, self.time_column, 'max')
        start_point = index_ts_inv_mapper(self.start_time, self.agg_interval, self.ts_model.TimeSeriesIndex // self.no_ts)
        new_entries = np.array(self._get_range(start_point, end_point), dtype=float)
        if len(new_entries) > 0:
            self.update_model(new_entries)
            self.write_model(False)
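# update_index (and forecast_next above) relies on index_ts_inv_mapper to turn an integer
# time-series index back into a timestamp (or plain integer) bound for the range query.
# A minimal sketch of the assumed behaviour, start_time plus index * agg_interval seconds;
# the real helper lives in the project's utilities and may differ in details:
import pandas as pd

def index_ts_inv_mapper_sketch(start_time, agg_interval, index):
    # integer time columns stay integers; timestamps are shifted by index * agg_interval seconds
    if isinstance(start_time, int):
        return start_time + int(index * agg_interval)
    return pd.Timestamp(start_time) + pd.to_timedelta(index * agg_interval, unit='s')

print(index_ts_inv_mapper_sketch(pd.Timestamp('2020-01-01'), 60.0, 10))  # 2020-01-01 00:10:00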
Example #3
    def create_index(self):
        """
        Build the index: query the data points currently in the time series table (starting at
        self.ts_model.TimeSeriesIndex), fit the model via update_model, write it to the database,
        and (re)create the insert trigger if auto_update is set.
        """
        # find starting and ending time
        end_point = get_bound_time(self.db_interface, self.time_series_table_name, self.time_column, 'max')
        start_point = index_ts_inv_mapper(self.start_time, self.agg_interval, self.ts_model.TimeSeriesIndex)

        # get new entries
        new_entries = self._get_range(start_point, end_point)
        if len(new_entries) > 0:
            self.update_model(new_entries)
            self.write_model(True)

        # drop and recreate the insert trigger
        self.db_interface.drop_trigger(self.time_series_table_name, self.index_name)
        if self.auto_update:
            self.db_interface.create_insert_trigger(self.time_series_table_name, self.index_name)
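# create_index and update_index both locate the newest point in the source table through
# get_bound_time(db_interface, table, time_column, 'max'). A self-contained sketch of the
# assumed behaviour using sqlite3 with placeholder table/column names; the real helper goes
# through the project's db_class interface rather than a raw connection:
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE mixturets (time INTEGER, ts REAL)')
conn.executemany('INSERT INTO mixturets VALUES (?, ?)', [(0, 1.0), (1, 1.5), (2, 0.7)])

def get_bound_time_sketch(conn, table_name, time_column, bound='max'):
    # bound is 'max' for the newest index/timestamp, 'min' for the oldest
    row = conn.execute('SELECT %s(%s) FROM %s' % (bound, time_column, table_name)).fetchone()
    return row[0]

print(get_bound_time_sketch(conn, 'mixturets', 'time', 'max'))  # 2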
Example #4
def load_pindex_u(db_interface, index_name):
    t = time.time()
    meta_table = index_name + "_meta"
    meta_inf = db_interface.query_table(meta_table,
                                        columns_queried=['T', 'T0', 'k', 'gamma', 'var_direct_method', 'k_var', 'T_var',
                                                         'soft_thresholding', 'start_time', 'aggregation_method',
                                                         'agg_interval', 'persist_l','col_to_row_ratio', 'L','last_TS_fullSVD','last_TS_inc',
                                                              'last_TS_seen', 'p' ,'time_series_table_name', 'indexed_column','time_column'])
    
    T, T0, k, gamma, direct_var, k_var, T_var, SSVT, start_time, aggregation_method, agg_interval, persist_l, col_to_row_ratio, L, ReconIndex, MUpdateIndex, TimeSeriesIndex , p= meta_inf[0][:-3]
    L_m = db_interface.query_table(index_name + "_m", ['L'], 'modelno =0')[0][0]
    
    time_series_table_name, value_column, time_column = meta_inf[0][-3:]
    last = get_bound_time(db_interface, time_series_table_name, time_column ,'max')
    value_columns = value_column.split(',')
    # ------------------------------------------------------
    # temp fix
    gamma = float(gamma)
    if not isinstance(start_time, (int, np.integer)):
        start_time = pd.to_datetime(start_time)
    if not isinstance(last, (int, np.integer)):
        last = pd.to_datetime(last)
    agg_interval = float(agg_interval)
    # ------------------------------------------------------
    no_ts = len(value_columns)
    last_index = (index_ts_mapper(start_time, agg_interval, last) + 1)
    if last_index - MUpdateIndex//no_ts <= 5*L_m:
        print(L, last_index, MUpdateIndex)
        print('nothing major to update')
        return False
    fill_in_missing = (p >= 1.0)
    TSPD = TSPI(interface=db_interface, index_name=index_name, schema=None, T=T, T0=T0, rank=k, gamma=gamma,
                direct_var=direct_var, rank_var=k_var, T_var=T_var, SSVT=SSVT, start_time=start_time,
                aggregation_method=aggregation_method, agg_interval=agg_interval, time_series_table_name=time_series_table_name, 
                time_column = time_column, value_column = value_columns ,persist_L = persist_l,col_to_row_ratio = col_to_row_ratio, fill_in_missing = fill_in_missing, p =p)
    
    model_no = int(max((last_index*no_ts - 1) / (T / 2) - 1, 0))
    last_model_no = int(max((MUpdateIndex - 1) / (T / 2) - 1, 0))
    model_start = last_model_no*T/2
    print(model_no, last_model_no, ReconIndex, model_start, last_index)
    
    new_points_ratio = (last_index*no_ts - ReconIndex)/(ReconIndex - model_start)
    print(new_points_ratio)
    
    if new_points_ratio < gamma and model_no <= last_model_no and (last_index*no_ts)%(T//2) != 0:
        print('marginal update')
        start = (MUpdateIndex)//TSPD.no_ts
        end = (TimeSeriesIndex - 1)//TSPD.no_ts
    else:
        print('big update')
        start = max((TimeSeriesIndex - T)//TSPD.no_ts,0)
        end = (TimeSeriesIndex - 1)//TSPD.no_ts
    # initiate the TSMM (mean) model
    TSPD.ts_model = TSMM(TSPD.k, TSPD.T, TSPD.gamma, TSPD.T0, col_to_row_ratio=col_to_row_ratio,
                         model_table_name=index_name, SSVT=TSPD.SSVT, L=L, persist_L = TSPD.persist_L, no_ts = TSPD.no_ts, fill_in_missing = fill_in_missing, p =p)
    TSPD.ts_model.ReconIndex, TSPD.ts_model.MUpdateIndex, TSPD.ts_model.TimeSeriesIndex = ReconIndex, MUpdateIndex, TimeSeriesIndex

    # load variance models if any
    if TSPD.k_var != 0:
        col_to_row_ratio, L, ReconIndex, MUpdateIndex, TimeSeriesIndex = db_interface.query_table(meta_table,
                                                                                                  columns_queried=[
                                                                                                      'col_to_row_ratio_var',
                                                                                                      'L_var',
                                                                                                      'last_TS_fullSVD_var',
                                                                                                      'last_TS_inc_var',
                                                                                                      'last_TS_seen_var'])[0]

        TSPD.var_model = TSMM(TSPD.k_var, TSPD.T_var, TSPD.gamma, TSPD.T0, col_to_row_ratio=col_to_row_ratio,
                              model_table_name=index_name + "_variance", SSVT=TSPD.SSVT, L=L, persist_L =TSPD.persist_L, no_ts = TSPD.no_ts, fill_in_missing = fill_in_missing, p =p)
        TSPD.var_model.ReconIndex, TSPD.var_model.MUpdateIndex, TSPD.var_model.TimeSeriesIndex = ReconIndex, MUpdateIndex, TimeSeriesIndex

    print('loading meta_model time', time.time()-t)
    # LOADING SUB-MODELs Information
    TSPD._load_models_from_db(TSPD.ts_model)
    print('loading sub models time', time.time()-t)
    if end >= start:
        start_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, start)
        end_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, end)
        TSPD.ts_model.TimeSeries = TSPD._get_range(start_point, end_point)
        print('loading time series time', time.time()-t)
        print(start, end, start_point,end_point)
    # query variance models table
    if TSPD.k_var != 0:
        TSPD._load_models_from_db(TSPD.var_model)

        # load last T points of  variance time series (squared of observations if not direct_var)
        if TSPD.direct_var:
            end_var = (TSPD.var_model.TimeSeriesIndex - 1)//TSPD.no_ts
            start = max(start -1,0)
            TT = min(end_var-start+1, TSPD.var_model.T//TSPD.no_ts)
            if (end_var-start+1) - TT >0:
                start +=  (end_var-start+1) - TT 
            mean = np.zeros([TT,TSPD.no_ts])
            print(mean.shape, start, end_var, TSPD.var_model.T )
            start_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, start)
            end_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, end_var)
            print(start, end_var, start_point,end_point,TT)
            if end_var != start:
                for ts_n, value_column in enumerate(TSPD.value_column):
                    mean[:,ts_n] = get_prediction_range(index_name, TSPD.time_series_table_name, value_column,db_interface, start_point, end_point, uq=False)
                TSPD.var_model.TimeSeries = TSPD.ts_model.TimeSeries[:len(mean),:] - mean
        else:
            TSPD.var_model.TimeSeries = (TSPD.ts_model.TimeSeries) ** 2
    print('loading time series variance time', time.time()-t)
    return TSPD
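# The sub-model bookkeeping above maps a flattened time-series index onto a sub-model number:
# each sub-model holds T points and consecutive sub-models start T/2 points apart (hence the
# T/2 arithmetic), so the model number of the newest point is int(max((index - 1) / (T / 2) - 1, 0)).
# A small worked example with T = 1000 and a single time series (indices chosen purely for
# illustration):
T_example = 1000
for last_flat_index in (400, 1000, 1500, 2500):
    model_no = int(max((last_flat_index - 1) / (T_example / 2) - 1, 0))
    print(last_flat_index, '->', model_no)  # 400 -> 0, 1000 -> 0, 1500 -> 1, 2500 -> 3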
Example #5
    def write_model(self, create):
        """
        write the pindex to db
        ----------
        Parameters
        ----------
        create: bol 
            if Ture, create the index in DB, else update it.
        """

        # remove the schema name if present
        t = time.time()
        index_name = self.index_name.split('.')[-1]

        # delete meta data if create
        if create:
            delete_pindex(self.db_interface, index_name)
    
        # write mean and variance tables
        self.write_tsmm_model(self.ts_model, create)
        self.write_tsmm_model(self.var_model, create)
        self.calculate_out_of_sample_error(self.ts_model)
        # if start_time is a timestamp, convert it to pd.Timestamp
        if not isinstance(self.start_time, (int, np.integer)):
            self.start_time = pd.to_datetime(self.start_time)

        # prepare meta data table
        metadf = pd.DataFrame(
            data={'T': [self.ts_model.T], 'T0': [self.T0], 'gamma': [float(self.gamma)], 'k': [self.k],
                  'L': [self.ts_model.L],
                  'last_TS_seen': [self.ts_model.TimeSeriesIndex], 'last_TS_inc': [self.ts_model.MUpdateIndex],
                  'last_TS_fullSVD': [self.ts_model.ReconIndex],
                  'time_series_table_name': [self.time_series_table_name], 'indexed_column': [','.join(self.value_column)],
                  'time_column': [self.time_column],
                  'soft_thresholding': [self.SSVT], 'no_submodels': [len(self.ts_model.models)],
                  'no_submodels_var': [len(self.var_model.models)],
                  'col_to_row_ratio': [self.ts_model.col_to_row_ratio],
                  'col_to_row_ratio_var': [self.var_model.col_to_row_ratio], 'T_var': [self.var_model.T],
                  'k_var': [self.k_var], 'L_var': [self.var_model.L],
                  'last_TS_seen_var': [self.var_model.TimeSeriesIndex],
                  'last_TS_inc_var': [self.var_model.MUpdateIndex], 'aggregation_method': [self.aggregation_method],
                  'agg_interval': [self.agg_interval],
                  'start_time': [self.start_time], 'last_TS_fullSVD_var': [self.var_model.ReconIndex],
                  'var_direct_method': [self.direct_var], 'persist_l': [self.persist_L], 'p': [self.ts_model.p]})
        
        # ------------------------------------------------------
        # EDIT: due to an incompatibility with PSQL timestamp types.
        # Further investigation needed.
        # ------------------------------------------------------
        if not isinstance(self.start_time, (int, np.integer)):
            #metadf['start_time'] = metadf['start_time'].astype(pd.Timestamp)
            metadf['start_time'] = metadf['start_time'].astype('datetime64[ns]')
        last_index = index_ts_inv_mapper(self.start_time, self.agg_interval, self.ts_model.TimeSeriesIndex//self.no_ts -1)
        if create:
            # create meta table
            self.db_interface.create_table(self.index_name + '_meta', metadf, include_index=False)
            
            # populate column pindices
            for i,ts in enumerate(self.value_column):
                self.db_interface.insert('tspdb.pindices_columns', [index_name, ts],
                                     columns=['index_name', 'value_column'])

        else:
            # else update meta table, tspdb pindices 
            self.db_interface.delete(self.index_name + '_meta', '')
            self.db_interface.insert(self.index_name + '_meta', metadf.iloc[0])
            self.db_interface.delete('tspdb.pindices', "index_name = '" + str(index_name) + "';")
            self.db_interface.delete('tspdb.pindices_stats', "index_name = '" + str(index_name) + "';")
            
        # UPDATE STAT TABLE
        for i,ts in enumerate(self.value_column):
            forecast_tests_array = np.array([m.forecast_model_score_test[i] for m in self.ts_model.models.values()],'float')
            self.db_interface.insert('tspdb.pindices_stats',
                                     [index_name, ts, self.ts_model.TimeSeriesIndex//self.no_ts, len(self.ts_model.models),np.mean([ m.imputation_model_score[i] for m in self.ts_model.models.values() ]), np.mean([ m.forecast_model_score[i] for m in self.ts_model.models.values()]),np.nanmean(forecast_tests_array)],
                                     columns=['index_name', 'column_name','number_of_observations', 'number_of_trained_models', 'imputation_score', 'forecast_score','test_forecast_score'])
            
        # UPDATE PINDICES TABLE
        if isinstance(self.start_time, (int, np.integer)):
            self.db_interface.insert('tspdb.pindices',
                                     [index_name, self.time_series_table_name, self.time_column, self.uq,
                                      self.agg_interval, self.start_time, last_index],
                                     columns=['index_name', 'relation', 'time_column', 'uq', 'agg_interval',
                                              'initial_index', 'last_index'])
        else:
            self.db_interface.insert('tspdb.pindices',
                                     [index_name, self.time_series_table_name, self.time_column, self.uq,
                                      self.agg_interval, self.start_time, last_index],
                                     columns=['index_name', 'relation', 'time_column', 'uq', 'agg_interval',
                                              'initial_timestamp', 'last_timestamp'])
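# The stats rows above average per-sub-model scores; np.nanmean is used for the test forecast
# score, presumably because sub-models without a test score report NaN. A tiny self-contained
# illustration with made-up scores showing that NaN entries are simply ignored:
import numpy as np

forecast_tests_array = np.array([0.91, np.nan, 0.88], 'float')
print(np.mean(forecast_tests_array))     # nan
print(np.nanmean(forecast_tests_array))  # 0.895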
Example #6
def _get_forecast_range(index_name, table_name, value_column, index_col, interface, t1, t2, MUpdateIndex, L, k, T, last_model, interval, start_ts, last_TS_seen, no_ts, value_index, direct_var=False, variance=False, averaging='average', projected=False, p=1.0):
    """
    Return the forecasted values of value_column in the time range t1 to t2 using index_name
    ----------
    Parameters
    ----------
    index_name: string
        name of the PINDEX used to query the prediction

    table_name: string
        name of the time series table in the database

    value_column: string
        name of the column that contains the time series values

    index_col: string
        name of the column that contains the time series index/timestamp

    interface: db_class object
        object used to communicate with the DB. see ../database/db_class for the abstract class

    t1: (int or timestamp)
        index or timestamp indicating the start of the queried range

    t2: (int or timestamp)
        index or timestamp indicating the end of the queried range

    L: (int)
        model parameter determining the number of rows in each matrix of a sub-model

    k: (int)
        model parameter determining the number of retained singular values in each matrix of a sub-model

    T: (int)
        model parameter determining the number of data points in each matrix of a sub-model

    last_model: (int)
        the index of the last sub-model

    averaging: string, optional, (default 'average')
        determines which sub-model coefficients are used when forecasting; 'average' means use the average of all sub-model coefficients.
    ----------
    Returns
    ----------
    prediction: array, shape [t2 - t1 + 1]
        forecasted values of the time series in the range [t1, t2] using index_name
    """
    ############### EDITS ##################
    # 1- Replace last_ts with the last timestamp seen
    ########################################
    # get coefficients
    coeffs = np.array(interface.get_coeff(index_name + '_c_view', averaging))
    coeffs_ts = coeffs[-no_ts:]
    coeffs = coeffs[:-no_ts]
    no_coeff = len(coeffs)
 
    if not direct_var or not variance:
            if projected:
                if last_model != 0:
                    q_model = last_model- 1
                else:
                    q_model = last_model
                U = interface.get_U_row(index_name + '_u', [0, 2 * L], [q_model, q_model], k,
                                             return_modelno=False,return_weights_decom=True)[:-1,k:]
                no_coeff = U.shape[0]
                projection_matrix = np.dot(U,U.T)
            
            agg_interval = float(interval)
            if not isinstance(start_ts, (int, np.integer)):
                start_ts = pd.Timestamp(start_ts)
            # if the queried range extends beyond what we have seen so far, use the last point seen
            last_TS_seen = get_bound_time(interface, table_name, index_col, 'max')
            if not isinstance(last_TS_seen, (int, np.integer)):
                last_TS_seen = index_ts_mapper(start_ts, agg_interval, last_TS_seen)
            last_TS_seen+=1
            print(t1,t2, last_TS_seen)
            
            t1_ = min(t1, last_TS_seen)
            t2_ = min(t2, last_TS_seen)
            end = index_ts_inv_mapper(start_ts, agg_interval, t1_ - 1 )
            start = index_ts_inv_mapper(start_ts, agg_interval, t1_ - no_coeff  )
            print(start, end)
            obs = interface.get_time_series(table_name, start, end, start_ts = start_ts,  value_column=value_column, index_column= index_col, Desc=False, interval = agg_interval, aggregation_method =  averaging)
            output = np.zeros([t2 - t1_ + 1 ])
            obs = np.array(obs)[-no_coeff:,0]
            print(len(obs[:]), no_coeff)
            # fill missing values: if the series is subsampled (p < 1), fill with 0 and rescale by p;
            # otherwise forward/backward fill
            if p <1:
                obs = np.array(pd.DataFrame(obs).fillna(value = 0).values[:,0])
                obs /= p
            else:
                obs = np.array(pd.DataFrame(obs).fillna(method = 'ffill').values[:,0])
                obs = np.array(pd.DataFrame(obs).fillna(method = 'bfill').values[:,0])
            if variance:
                obs = obs **2
            observations = np.zeros([t2 - t1_ + 1 + no_coeff])
            observations[:no_coeff] = obs
            
            for i in range(0, t2 + 1 - t1_): 
                    if i  < len(obs):
                        if projected:
                            output[i] = np.dot(coeffs.T, np.dot(projection_matrix, observations[i:i + no_coeff]))+coeffs_ts[value_index]
                        else:
                            output[i] = np.dot(coeffs.T,  observations[i:i + no_coeff])+coeffs_ts[value_index]
                    else:
                        output[i] = np.dot(coeffs.T,  observations[i:i + no_coeff])+coeffs_ts[value_index]
                    if i+no_coeff >= len(obs):
                        observations[i+no_coeff] = output[i]

            return output[-(t2 - t1 + 1):]
            
    # the forecast should always start at the last point
    t1_ = MUpdateIndex//no_ts 
    output = np.zeros([t2 - t1_ + 1 + no_coeff])
    output[:no_coeff] = _get_imputation_range(index_name, table_name, value_column, index_col, interface, t1_ - no_coeff, t1_ - 1, L,k,T,last_model,value_index, no_ts)
    for i in range(0, t2 + 1 - t1_):
        output[i + no_coeff] = np.dot(coeffs.T, output[i:i + no_coeff])+coeffs_ts[value_index]
    return output[-(t2 - t1 + 1):]
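# In the "projected" branch above, the trailing window is first projected onto the column space
# spanned by U (projection_matrix = np.dot(U, U.T)) before the forecast coefficients are applied,
# and a per-series bias coeffs_ts[value_index] is added in every branch. A self-contained sketch
# of that single step with made-up values; U is assumed to have orthonormal columns, as columns
# taken from an SVD factor would:
import numpy as np

rng = np.random.default_rng(0)
window = rng.normal(size=5)                    # stands in for the trailing no_coeff observations
coeffs = rng.normal(size=5)                    # stands in for the learned forecast coefficients
bias = 0.1                                     # stands in for coeffs_ts[value_index]
U = np.linalg.qr(rng.normal(size=(5, 2)))[0]   # 5 x 2 matrix with orthonormal columns
projection_matrix = U @ U.T                    # projector onto the column space of U
prediction = coeffs @ (projection_matrix @ window) + bias
print(prediction)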