Пример #1
0
    def core(self, input_ds):
        """Fit MSSA on a sliding window and forecast every input feature.

        A window of ``2 * window_size`` consecutive rows is moved through the
        mean-centred dataset one row at a time. An MSSA model is fitted on
        each window and two rows are collected per window: the forecast
        ``forward_ticks`` steps ahead, and the sum of the fitted components
        at row ``2 * window_size - 1`` of the window (the "denoised" value).

        Args:
            input_ds: NumPy array of shape ``(rows, features)`` or ``(rows,)``;
                a 1-D input is treated as a single feature.

        Returns:
            2-D array with one row of forecasts per window; it is also stored
            in ``self.output_ds``. The R2 score of the first feature is
            stored in ``self.error``.

        Raises:
            SystemExit: if ``window_size`` exceeds one fifth of the rows.
            ValueError: if the dataset is too short to produce any window.
        """
        # a 1-D input is a single feature: promote it to a one-column matrix
        if input_ds.ndim == 1:
            input_ds = input_ds.reshape(-1, 1)
        self.rows_d, self.cols_d = input_ds.shape

        window = self.conf.window_size
        if window > self.rows_d // 5:
            print("The window_size must be at maximum 1/5th of the rows of the input dataset")
            sys.exit()
        horizon = self.conf.forward_ticks
        span = 2 * window

        # number of sliding windows that still leave room for the forecast
        segments = self.rows_d - (span + horizon)
        if segments <= 0:
            # previously an uninitialised np.empty() buffer was scored and returned
            raise ValueError(
                "input_ds is too short: rows must exceed 2 * window_size + forward_ticks")

        # center the input_ds before fitting; metrics and plots below use the centred data
        in_means = np.nanmean(input_ds, axis=0)
        input_ds = input_ds - in_means

        # 0 means "select the rank with SVHT on the first window and reuse it afterwards"
        rank = self.conf.num_components
        forecast_rows = []
        denoised_rows = []
        for i in range(segments):
            # NOTE(review): the last window is stretched to the end of the dataset, so it
            # is longer than the others while the denoised row is still read at index
            # span - 1 -- confirm this is intended for a sliding window.
            last = i + span if i != segments - 1 else self.rows_d
            s_data_w = input_ds[i:last, :]

            if i == 0 and rank == 0:
                mssa = MSSA(n_components='svht', window_size=window, verbose=False)
                mssa.fit(s_data_w)
                print("Automatically Selected Rank (number of components)= ",str(mssa.rank_))
                rank = int(mssa.rank_)
            else:
                mssa = MSSA(n_components=rank, window_size=window, verbose=False)
                mssa.fit(s_data_w)

            # TODO: use the components to build the prediction and the plots for each feature
            fc = mssa.forecast(horizon, timeseries_indices=None)

            # keep only the tick that lies forward_ticks ahead, one value per series
            forecast_rows.append(fc[:, horizon - 1])
            # sum of all components at one tick of the window, one value per series
            denoised_rows.append(mssa.components_[:, span - 1, :].sum(axis=1))
            # TODO: calculate error per feature

        # stack once instead of re-allocating the whole array on every iteration
        self.output_ds = np.vstack(forecast_rows)
        denoised = np.vstack(denoised_rows)
        rows_o = self.output_ds.shape[0]

        # error of the first feature over the last rows_d // 2 rows
        half = self.rows_d // 2
        in_stop = self.rows_d - horizon - 1
        out_stop = rows_o - horizon
        y_true = input_ds[in_stop - half:in_stop, 0]
        y_pred = self.output_ds[out_stop - half:out_stop, 0]
        r2 = r2_score(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        self.error = r2

        # plots the original data, predicted data and denoised data, one figure per feature
        if self.conf.plot_prefix is not None:
            for feature in range(self.cols_d):
                fig, ax = plt.subplots(figsize=(18, 7))
                ax.plot(self.output_ds[:rows_o - horizon, feature], lw=3, c='steelblue', alpha=0.8, label='predicted')
                ax.plot(denoised[horizon:, feature], lw=3, c='darkgoldenrod', alpha=0.6, label='denoised')
                ax.plot(input_ds[span + horizon - 1:self.rows_d - horizon - 1, feature], lw=3, alpha=0.2, c='k', label='original')
                # the scores in the title were computed on feature 0 only
                ax.set_title('Forecast R2 = {:.3f}   MSE = {:.3f}   MAE = {:.3f}'.format(r2,mse,mae))
                ax.legend()
                fig.savefig(self.conf.plot_prefix + str(feature) + '.png', dpi=600)
                # release the figure so that many features do not pile up in memory
                plt.close(fig)

        # shows error (the same feature-0 score is reported for every feature)
        if self.conf.show_error == True:
            for feature in range(self.cols_d):
                print("Feature = ", str(feature), "R2 score = ", str(r2))
        return self.output_ds
    def perform_mssa(self, fVectors, n_components):
        """Fit MSSA on ``fVectors`` and save diagnostic plots under results/test3.

        For the last three columns of ``fVectors`` one figure is saved per
        component, showing the original series, the running sum of the
        components so far and the current component. A heatmap of the
        absolute w-correlations of the components of series 0 is saved too.

        Args:
            fVectors: 2-D array; its columns are indexed as individual series.
            n_components: Unused; kept so existing callers keep working. The
                model is built with a 0.99 explained-variance threshold.

        Returns:
            ``mssa.component_ranks_`` transposed.
        """
        L = 150  # Length of the time window
        mssa = MSSA(n_components='variance_threshold',
                    variance_explained_threshold=0.99,
                    window_size=L,
                    verbose=True)
        mssa.fit(fVectors)
        indexes = np.arange(mssa.components_.shape[1])

        base_dir = "./results/test3"
        self.create_directory(base_dir)

        for idx in [-1, -2, -3]:
            self.create_directory(base_dir + "/program{}".format(idx))
            cumulative_recon = np.zeros_like(fVectors[:, idx])
            for comp in range(mssa.components_.shape[2]):
                fig, ax = plt.subplots(figsize=(18, 7))
                current_component = mssa.components_[idx, :, comp]
                cumulative_recon = cumulative_recon + current_component

                # NOTE(review): the legend always says "program 3" although idx
                # runs over -1, -2, -3 -- confirm the intended label.
                ax.plot(indexes,
                        fVectors[:, idx],
                        lw=3,
                        alpha=0.2,
                        c='k',
                        label="program 3")
                ax.plot(indexes,
                        cumulative_recon,
                        lw=3,
                        c='darkgoldenrod',
                        alpha=0.6,
                        label='cumulative')
                ax.plot(indexes,
                        current_component,
                        lw=3,
                        c='steelblue',
                        alpha=0.8,
                        label='component={}'.format(comp))

                ax.legend()
                fig.savefig(
                    "results/test3/program{}/cumulation_of_{}_components_for_index{}_2"
                    .format(idx, comp, idx))
                plt.show()
                # one figure per component: close it so they do not accumulate
                plt.close(fig)

        print(mssa.component_ranks_[0:10])
        print(mssa.component_ranks_explained_variance_[0:10])

        total_comps = mssa.components_[0, :, :]
        print(total_comps.shape)

        total_wcorr = mssa.w_correlation(total_comps)
        total_wcorr_abs = np.abs(total_wcorr)
        fig, ax = plt.subplots(figsize=(12, 9))
        sns.heatmap(total_wcorr_abs, cmap='coolwarm', ax=ax)
        ax.set_title('component w-correlations')

        # save before show: once show() returns the figure may already be gone,
        # and saving afterwards can write an empty image
        fig.savefig("results/test3/correlation_matrix")
        plt.show()
        plt.close(fig)
        print(mssa.component_ranks_.shape)
        return mssa.component_ranks_.T
Пример #3
0
    # number of non-overlapping segments of 2 * p_window_size ticks
    segments = (num_ticks // (2 * p_window_size))

    for i in range(0, segments):
        # the last segment also takes the ticks left over by the integer division
        first = i * (2 * p_window_size)
        if (i != segments - 1):
            last = (i + 1) * (2 * p_window_size)
        else:
            last = num_ticks
        # slice the data in 2*p_window_size ticks segments
        s_data_w = s_data[first:last, :]

        # the first segment is fitted with SVHT only to report the selected rank;
        # every later segment is fitted with the configured p_n_components
        # NOTE(review): if the SVHT rank differs from p_n_components the component
        # axis of the first block presumably will not match the later ones and the
        # concatenation below fails -- confirm.
        if i == 0:
            mssa = MSSA(n_components='svht',
                        window_size=p_window_size,
                        verbose=True)
            mssa.fit(s_data_w)
            print("Selected Rank = ", str(mssa.rank_))
            #rank = int(mssa.rank_)
            rank = int(p_n_components)
        else:
            mssa = MSSA(n_components=rank,
                        window_size=p_window_size,
                        verbose=True)
            mssa.fit(s_data_w)
        # append the components of this segment along the tick axis
        if i == 0:
            output = copy.deepcopy(mssa.components_)
        else:
            # np.concatenate returns a new array; without rebinding, output
            # kept only the components of the first segment
            output = np.concatenate((output, mssa.components_), axis=1)
Пример #4
0
    def core(self, input_ds):
        """Decompose ``input_ds`` with MSSA, one fit per block of rows.

        The dataset is cut into consecutive segments of ``2 * window_size``
        rows (the last one also takes the remaining rows). MSSA is fitted on
        every segment and the per-segment results are joined along the tick
        axis. When ``conf.group_file`` is set, the components of every
        feature are additionally grouped with the groups read from that JSON
        file.

        Args:
            input_ds: 2-D NumPy array of shape ``(rows, features)``.

        Returns:
            Without a group file: the concatenated ``components_`` array.
            With a group file: a list with one grouped-components array per
            feature. The value is also stored in ``self.output_ds``.

        Raises:
            ValueError: if there are fewer than ``2 * window_size`` rows.
        """
        self.rows_d, self.cols_d = input_ds.shape
        window = self.conf.window_size
        span = 2 * window
        segments = self.rows_d // span
        if segments == 0:
            # previously this fell through to a NameError on the loop variable
            raise ValueError(
                "input_ds needs at least 2 * window_size rows to form one segment")

        # use the same groups for all the features; read them once, not per segment
        use_groups = self.conf.group_file is not None
        ts0_groups = None
        if use_groups:
            try:
                with open(self.conf.group_file) as json_file:
                    ts0_groups = json.load(json_file)
            except FileNotFoundError:
                # The example grouping used to overwrite the file on every segment,
                # destroying user-provided groups; it now only seeds a missing file.
                ts0_groups = [[0],[1],[2],[3],[4,5],[6],[7],[8],[9,10],[11],[12]]
                with open(self.conf.group_file, 'w') as f:
                    json.dump(ts0_groups, f)

        # 0 means "select the rank with SVHT on the first segment and reuse it afterwards"
        rank = self.conf.num_components
        component_blocks = []
        grouped_output = []
        for i in range(segments):
            first = i * span
            # the last segment also takes the rows left over by the integer division
            last = (i + 1) * span if i != segments - 1 else self.rows_d
            s_data_w = input_ds[first:last, :]

            if i == 0 and rank == 0:
                mssa = MSSA(n_components='svht', window_size=window, verbose=True)
                mssa.fit(s_data_w)
                print("Automatically Selected Rank (number of components)= ",str(mssa.rank_))
                rank = int(mssa.rank_)
            else:
                mssa = MSSA(n_components=rank, window_size=window, verbose=True)
                mssa.fit(s_data_w)

            # keep the components of this segment; they are joined after the loop
            # (the concatenation result used to be thrown away)
            component_blocks.append(mssa.components_)

            if use_groups:
                print("Grouping correlated components (manually set list)")
                for j in range(self.cols_d):
                    mssa.set_ts_component_groups(j, ts0_groups)
                    ts0_grouped = mssa.grouped_components_[j]
                    # grouped_output lives across segments, so later segments can be
                    # appended to the entry created by the first one
                    if i == 0:
                        grouped_output.append(copy.deepcopy(ts0_grouped))
                    else:
                        grouped_output[j] = np.concatenate((grouped_output[j], ts0_grouped), axis = 0)
                    # save the correlation matrix only for the first segment
                    if i == 0 and self.conf.plot_correlations is not None:
                        ts0_grouped_wcor = mssa.w_correlation(ts0_grouped)
                        fig, ax = plt.subplots(figsize=(12,9))
                        sns.heatmap(np.abs(ts0_grouped_wcor), cmap='coolwarm', ax=ax)
                        ax.set_title('grouped component w-correlations')
                        fig.savefig(self.conf.plot_correlations + str(j) + 'grouped.png', dpi=200)
                        plt.close(fig)

            # show progress
            progress = i*100/segments
            print("Segment: ",i,"/",segments, "     Progress: ", progress," %" )

        if use_groups:
            self.output_ds = grouped_output
        else:
            self.output_ds = np.concatenate(component_blocks, axis=1)
            grouped_output = self.output_ds

        if self.conf.plot_prefix is not None:
            # For the first data column, save one plot per (grouped) component with the
            # original series, the running sum of components and the current component.
            # The original series is taken from input_ds; the name s_data used here
            # before is not defined in this method.
            original = input_ds[:, 0]
            cumulative_recon = np.zeros_like(original)
            for comp in range(len(grouped_output[0][0])):
                fig, ax = plt.subplots(figsize=(18, 7))
                current_component = grouped_output[0][:, comp]
                cumulative_recon = cumulative_recon + current_component
                ax.plot(original, lw=3, alpha=0.2, c='k', label='original')
                ax.plot(cumulative_recon, lw=3, c='darkgoldenrod', alpha=0.6, label='cumulative')
                ax.plot(current_component, lw=3, c='steelblue', alpha=0.8, label='component={}'.format(comp))
                ax.legend()
                fig.savefig(self.conf.plot_prefix + '_' + str(comp) + '.png', dpi=600)
                plt.close(fig)

        return self.output_ds
Пример #5
0
    def core(self, input_ds):
        """Decompose the mean-centred ``input_ds`` with MSSA, segment by segment.

        The dataset is cut into consecutive segments of ``2 * window_size``
        rows (the last one also takes the remaining rows). MSSA is fitted on
        every segment. Without ``conf.group_file`` the raw components are
        joined along the tick axis; with it, the components of every feature
        are grouped using the groups read from that JSON file. The result is
        finally flattened from ``(features, ticks, channels)`` to
        ``(ticks, features * channels)``.

        Args:
            input_ds: NumPy array of shape ``(rows, features)`` or ``(rows,)``;
                a 1-D input is treated as a single feature.

        Returns:
            2-D array of shape ``(ticks, features * channels)``, also stored
            in ``self.output_ds``.

        Raises:
            ValueError: if there are fewer than ``2 * window_size`` rows.
        """
        # a 1-D input is a single feature: promote it to a one-column matrix
        if input_ds.ndim == 1:
            input_ds = input_ds.reshape(-1, 1)
        self.rows_d, self.cols_d = input_ds.shape

        window = self.conf.window_size
        span = 2 * window
        segments = self.rows_d // span
        if segments == 0:
            # previously this fell through to a NameError on the loop variable
            raise ValueError(
                "input_ds needs at least 2 * window_size rows to form one segment")

        # center the input_ds before fitting
        in_means = np.nanmean(input_ds, axis=0)
        input_ds = input_ds - in_means

        group_file = self.conf.group_file
        w_prefix = self.conf.w_prefix
        # the same groups are used for all features and segments: read them once
        ts0_groups = None
        if group_file is not None:
            with open(group_file) as json_file:
                ts0_groups = json.load(json_file)

        # 0 means "select the rank with SVHT on the first segment and reuse it afterwards"
        rank = self.conf.num_components
        component_blocks = []
        grouped_output = []
        for i in range(segments):
            first = i * span
            # the last segment also takes the rows left over by the integer division
            last = (i + 1) * span if i != segments - 1 else self.rows_d
            s_data_w = input_ds[first:last, :]

            if i == 0 and rank == 0:
                mssa = MSSA(n_components='svht',
                            window_size=window,
                            verbose=True)
                mssa.fit(s_data_w)
                print(
                    "Automatically Selected Rank (number of components)= ",
                    str(mssa.rank_))
                rank = int(mssa.rank_)
            else:
                mssa = MSSA(n_components=rank,
                            window_size=window,
                            verbose=True)
                mssa.fit(s_data_w)

            if group_file is None:
                # raw components of this segment; joined along the tick axis after the loop
                component_blocks.append(np.array(mssa.components_))
                # save the correlation matrix only for the first segment
                if i == 0 and w_prefix is not None:
                    for j in range(self.cols_d):
                        total_comps = mssa.components_[j, :, :]
                        ts0_wcor = mssa.w_correlation(total_comps)
                        fig, ax = plt.subplots(figsize=(12, 9))
                        sns.heatmap(np.abs(ts0_wcor), cmap='coolwarm', ax=ax)
                        ax.set_title('component w-correlations')
                        fig.savefig(w_prefix + str(j) + '.png', dpi=200)
                        plt.close(fig)
            else:
                print("Grouping correlated components (manually set list)")
                for j in range(self.cols_d):
                    mssa.set_ts_component_groups(j, ts0_groups)
                    ts0_grouped = mssa.grouped_components_[j]
                    if i == 0:
                        grouped_output.append(copy.deepcopy(ts0_grouped))
                    else:
                        grouped_output[j] = np.concatenate(
                            (grouped_output[j], ts0_grouped), axis=0)
                    # save the correlation matrix only for the first segment
                    if i == 0 and w_prefix is not None:
                        ts0_grouped_wcor = mssa.w_correlation(ts0_grouped)
                        fig, ax = plt.subplots(figsize=(12, 9))
                        sns.heatmap(np.abs(ts0_grouped_wcor),
                                    cmap='coolwarm',
                                    ax=ax)
                        ax.set_title('grouped component w-correlations')
                        fig.savefig(w_prefix + str(j) + '_grouped.png',
                                    dpi=200)
                        plt.close(fig)

            # show progress
            progress = i * 100 / segments
            print("Segment: ", i, "/", segments, "     Progress: ", progress, " %")

        if group_file is None:
            self.output_ds = np.concatenate(component_blocks, axis=1)
        else:
            self.output_ds = np.array(grouped_output)

        # Legacy extra plot, kept for output compatibility: with a single segment and a
        # group file the grouped correlation matrix of the last feature is saved once
        # more under the un-suffixed name. It is skipped without a group file, where it
        # used to fail on the undefined ts0_grouped.
        if segments == 1 and w_prefix is not None and group_file is not None:
            ts0_grouped_wcor = mssa.w_correlation(ts0_grouped)
            fig, ax = plt.subplots(figsize=(12, 9))
            sns.heatmap(np.abs(ts0_grouped_wcor), cmap='coolwarm', ax=ax)
            ax.set_title('grouped component w-correlations')
            fig.savefig(w_prefix + str(j) + '.png', dpi=200)
            plt.close(fig)

        if self.conf.plot_prefix is not None:
            # For the first data column, save one plot per channel with the original
            # series, the running sum of channels and the current channel.
            original = input_ds[:, 0]
            cumulative_recon = np.zeros_like(original)
            for comp in range(self.output_ds.shape[2]):
                fig, ax = plt.subplots(figsize=(18, 7))
                current_component = self.output_ds[0, :, comp]
                cumulative_recon = cumulative_recon + current_component
                ax.plot(original,
                        lw=3,
                        alpha=0.2,
                        c='k',
                        label='original')
                ax.plot(cumulative_recon,
                        lw=3,
                        c='darkgoldenrod',
                        alpha=0.6,
                        label='cumulative')
                ax.plot(current_component,
                        lw=3,
                        c='steelblue',
                        alpha=0.8,
                        label='component={}'.format(comp))
                ax.legend()
                fig.savefig(self.conf.plot_prefix + '_' + str(comp) + '.png',
                            dpi=600)
                plt.close(fig)
        print("pre self.output_ds.shape = ", self.output_ds.shape)

        # transforms the dimensions from (features, ticks, channels) to
        # (ticks, feats*channels); column order is feature-major, channel-minor
        n_feats, n_ticks, n_channels = self.output_ds.shape
        self.output_ds = self.output_ds.transpose(1, 0, 2).reshape(
            n_ticks, n_feats * n_channels)
        print("new self.output_ds.shape = ", self.output_ds.shape)
        return self.output_ds
Пример #6
0
# Exploratory script: PCA on df_small, then SSA / MSSA decompositions.
# pca, df_small, _df, np, pd and plt are defined outside the visible part of the file.
# NOTE(review): the model is fitted three times in a row on the same data (here, on the
# next line and inside fit_transform) -- a single fit_transform is probably enough; confirm.
pca.fit(df_small)
fit = pca.fit(df_small)
trans = pca.fit_transform(df_small)
#_df.adjclose.plot()
# plot the first column of the transformed data
trans.iloc[:, 0].plot()
plt.show()
print(fit.column_correlations(df_small))




# NOTE(review): these imports sit in the middle of the script; by convention they
# belong at the top of the file.
from pyts.decomposition import SingularSpectrumAnalysis
from pymssa import MSSA

window_size = 20
# four index groups of five consecutive indices each: 0-4, 5-9, 10-14, 15-19
# NOTE(review): groups (and X_ssa below) are not used in the visible part of the script.
groups = [np.arange(i, i+5) for i in range(0, 20, 5)]

ssa = SingularSpectrumAnalysis(window_size= window_size)
X_ssa = ssa.fit_transform(df_small)

# NOTE(review): MSSA uses a hard-coded window of 21 while window_size above is 20 --
# confirm whether the difference is intended.
mssa = MSSA(n_components=5,
            window_size=21,
            verbose=True)
mssa.fit(_df.adjclose)

# plot the components of series 0, indexed like _df
pd.DataFrame(mssa.components_[0,:,:], index=_df.index).plot()
plt.show()

# plot the original adjclose series
_df.adjclose.plot()
plt.show()
    def get_ssa(self, ncomp: int = None, wsize: int = 60):
        """Fit an MSSA model on ``self.data`` and keep it in ``self.model_ssa``.

        Args:
            ncomp: Passed to ``MSSA`` as ``n_components``; ``None`` by default.
            wsize: Passed to ``MSSA`` as ``window_size``.
        """
        model = MSSA(n_components=ncomp, window_size=wsize, verbose=True)
        model.fit(self.data)

        # keep the fitted model on the instance
        self.model_ssa = model