Exemplo n.º 1
0
 def set_pystarma_time_series(self, file_path):
     """Load the STARMA time-series matrix from a CSV with a 'date' column.

     The 'date' column is parsed with format '%Y-%m-%d %H:%M:%S'.
     Fixes relative to the original:
     - the `Dataset()` assignment was dead code (immediately overwritten by
       the DataFrame returned by `pd.read_csv`) and has been removed;
     - `pd.datetime` was removed from pandas; the stdlib `datetime` is used.
     """
     from datetime import datetime  # pd.datetime no longer exists in pandas >= 1.0
     self.ts_matrix = pd.read_csv(
         file_path,
         header=0,
         lineterminator='\n',
         parse_dates=['date'],
         date_parser=lambda dates: datetime.strptime(dates, '%Y-%m-%d %H:%M:%S'))
Exemplo n.º 2
0
	def __init__(self):
		"""Initialize an empty average-week model content container."""
		super(Average_Week_Content,self).__init__()
		self.source_average_dataset_object = Dataset() # contains all sensors
		self.prediction_dataset_object = Dataset() # filled by make_prediction_dataset_object()
		self.subset_avg_week_array = None # subset of sensors specified by target_sensor_idxs_list
		self.target_begin_weekday_int = None # weekday int of the first target timestamp
		self.num_target_rows = None # number of rows the prediction must contain
		self.num_target_sensors = None # len of the maker's target sensor idx list
		self.num_target_offsets = None # len of the maker's target time offset list
		self.input_target_maker = None # set later via set_input_target_maker()
		self.weekday_int_source = None # optional override of the source's starting weekday
Exemplo n.º 3
0
 def __init__(self):
     """Initialize an empty STARMA model content container."""
     super(STARMA_Content, self).__init__()
     self.pystarma_model = None  # fitted STARMA/STARIMA model, created later
     self.ts_matrix = Dataset()  # contains all sensors
     self.wa_matrices = None  # contains spatial weights
     self.ar = 0  # autoregressive order
     self.ma = 0  # moving-average order
     self.lags = ''  # differencing lags; '' selects STARMA instead of STARIMA
     self.max_t_lag = 25  # maximum temporal lag for ACF/PACF diagnostics
     self.sample_size = 0.1  # fraction of series sampled during preprocessing
     self.iterations = 2  # fitting iterations for the pystarma model
     self.prediction_dataset_object = Dataset()
     self.num_target_rows = None  # set from the input/target maker
     self.num_target_sensors = None  # len of the maker's target sensor idx list
     self.num_target_offsets = None  # len of the maker's target time offset list
     self.input_target_maker = None  # set via set_input_target_maker()
class Source_Maker_With_Single_Input_File(Source_Maker):
    """Source maker that loads all sensor data from one CSV file."""

    def __init__(self):
        super(Source_Maker_With_Single_Input_File, self).__init__()
        # Path of the single CSV holding every sensor's data; set by the caller.
        self.file_path_all_data = None

    def read_data_from_csv(self):
        """Read the CSV at `file_path_all_data` into `self.all_data` (index in column 0)."""
        self.all_data = Dataset()
        self.all_data.read_csv(self.file_path_all_data, index_col=0)

    def apply_parameters(self):
        """Smooth the loaded data with a rolling average when a window is configured."""
        window = self.moving_average_window
        if window is None:
            return
        self.all_data.rolling_average_with_window(window)

    def get_all_sensors(self):
        """Return the sensor (column) names of the loaded data."""
        return self.all_data.get_column_names()
Exemplo n.º 5
0
 def read_source_data(self, file_path):
     """Load source data from `file_path`.

     SQL-export files are read and pivoted through Dataset_From_SQL; plain
     CSVs are read into a regular Dataset with the index in column 0.
     """
     if self.is_sql_output:
         dataset = Dataset_From_SQL()
         dataset.read_csv(file_path)
         dataset.pivot()
         return dataset
     dataset = Dataset()
     dataset.read_csv(file_path, index_col=0)
     return dataset
Exemplo n.º 6
0
class Source_Maker_With_K_Fold_Validation(Source_Maker):
    """Source maker that splits one CSV into a validation slice and train/test rows."""

    def __init__(self):
        super(Source_Maker_With_K_Fold_Validation, self).__init__()
        self.file_path_all_data = None      # path of the CSV with all sensor data
        self.validation_percentage = None   # percentage of rows reserved for validation

    def read_data_from_csv(self):
        """Load the CSV at `file_path_all_data` into `self.all_data` (index in column 0)."""
        self.all_data = Dataset()
        self.all_data.read_csv(self.file_path_all_data, index_col=0)

    def apply_parameters(self):
        """Apply the optional rolling-average smoothing to the loaded data."""
        window = self.moving_average_window
        if window is not None:
            self.all_data.rolling_average_with_window(window)

    def remove_validation_data(self):
        """Split the rows into a leading validation slice and the remaining train/test rows."""
        self.calculate_train_test_validation_sizes()
        n_validation = self._sizes['validation']
        n_all = self._sizes['all']
        # Validation takes the first n_validation rows; train/test gets the rest.
        self.validation = self.create_df_subset_with_idxs(range(n_validation))
        self.train_test = self.create_df_subset_with_idxs(list(range(n_validation, n_all)))

    def calculate_train_test_validation_sizes(self):
        """Record the total row count and validation row count in `self._sizes`."""
        total = self.all_data.get_number_rows()
        n_validation = int(total * (self.validation_percentage / 100))
        self._sizes = {'validation': n_validation, 'all': total}

    def create_df_subset_with_idxs(self, row_range):
        """Return a new Dataset containing only the rows selected by `row_range`."""
        subset = Dataset()
        subset.df = pd.DataFrame(self.all_data.df.iloc[row_range, :])
        return subset

    def get_all_sensors(self):
        """Return the sensor (column) names of the loaded data."""
        return self.all_data.get_column_names()
Exemplo n.º 7
0
class Model_Output:
    """Holds a model's predictions alongside the matching target data.

    Both sides are Dataset objects; the underlying pandas DataFrame of each
    is reached through its `.df` attribute.
    """

    def __init__(self):
        self.prediction_dataset_object = Dataset()
        self.target_dataset_object = Dataset()

    def set_prediction_dataset_object_with_numpy_array(self, numpy_array):
        """Store predictions from a numpy array, reusing the target's column names."""
        self.prediction_dataset_object.set_numpy_array(numpy_array)
        self.prediction_dataset_object.df.columns = self.target_dataset_object.df.columns.values

    def set_target_dataset_object(self, dataset_object):
        """Replace the target Dataset."""
        self.target_dataset_object = dataset_object

    def get_prediction_df(self):
        """Return the prediction DataFrame, re-indexed with the target's timestamps."""
        self.prediction_dataset_object.df.index = self.target_dataset_object.df.index.values[
            0:self.prediction_dataset_object.df.shape[0]]
        return self.prediction_dataset_object.df

    def calc_mae(self):
        """Mean absolute error between predictions and targets."""
        return mae(self.prediction_dataset_object.df,
                   self.target_dataset_object.df)

    def calc_mape(self):
        """Mean absolute percentage error between predictions and targets."""
        return mape(self.prediction_dataset_object.df,
                    self.target_dataset_object.df)

    def make_target_and_predictions_df(self):
        """Return one DataFrame with target columns followed by prediction columns.

        Columns are renamed "target_<n>" / "predict_<n>" from the numeric
        column labels; rows keep the target's index.

        Bug fix: the original contained a debug print of
        `self.target_dataset_object.index.values`, but a Dataset has no
        `index` attribute (the index lives on `.df`), so this method always
        raised AttributeError. The stray debug prints were removed.
        """
        array = np.concatenate((self.target_dataset_object.df.values,
                                self.prediction_dataset_object.df.values),
                               axis=1)
        new_df = pd.DataFrame(array)
        new_df.index = self.target_dataset_object.df.index.values
        names_target = [
            "target_%d" % x
            for x in self.target_dataset_object.df.columns.values
        ]
        names_predict = [
            "predict_%d" % x
            for x in self.prediction_dataset_object.df.columns.values
        ]
        new_df.columns = names_target + names_predict
        return new_df

    def fill_time_gaps_in_target_and_predictions_using_time_format(
            self, time_format):
        """Fill missing timestamps in both the target and the prediction data."""
        self.target_dataset_object.fill_time_gaps(time_format)
        self.fill_time_gaps_in_predictions_using_time_format(time_format)

    def fill_time_gaps_in_predictions_using_time_format(self, time_format):
        """Align prediction rows to the target's timestamps, then fill time gaps."""
        # Debug shape prints removed.
        self.prediction_dataset_object.df.index = self.target_dataset_object.df.index.values[
            0:self.prediction_dataset_object.df.shape[0]]
        self.prediction_dataset_object.fill_time_gaps(time_format)
Exemplo n.º 8
0
class STARMA_Content(Model_Content):
    """Model content wrapper around a (py)STARMA space-time model.

    Holds the time-series matrix, the spatial weight matrices and the model
    hyper-parameters, and provides preprocessing, identification and fitting
    helpers built on the pystarma toolbox.
    """

    def __init__(self):
        super(STARMA_Content, self).__init__()
        self.pystarma_model = None  # fitted STARMA/STARIMA model, created later
        self.ts_matrix = Dataset()  # contains all sensors
        self.wa_matrices = None  # contains spatial weights
        self.ar = 0  # autoregressive order
        self.ma = 0  # moving-average order
        self.lags = ''  # differencing lags; '' selects STARMA instead of STARIMA
        self.max_t_lag = 25  # maximum temporal lag for ACF/PACF diagnostics
        self.sample_size = 0.1  # fraction of series sampled during preprocessing
        self.iterations = 2  # fitting iterations for the pystarma model
        self.prediction_dataset_object = Dataset()
        self.num_target_rows = None
        self.num_target_sensors = None
        self.num_target_offsets = None
        self.input_target_maker = None

    def set_input_target_maker(self, input_target_maker, test=False):
        """Attach the input/target maker and derive target dimensions from it.

        When `test` is True the target-row extraction is skipped.
        """
        self.input_target_maker = input_target_maker
        self.num_target_sensors = len(
            input_target_maker.get_target_sensor_idxs_list())
        self.num_target_offsets = len(
            input_target_maker.get_target_time_offsets_list())
        if test is False:
            self._set_parameters_from_input_target_maker()
        else:
            logging.debug("Testing")

    def _set_parameters_from_input_target_maker(self):
        """Cache the number of target rows from the target dataset."""
        self.num_target_rows = self.input_target_maker.get_target_dataset_object(
        ).get_number_rows()

    def set_pystarma_time_series(self, file_path):
        """Load the time-series matrix from a CSV file with a 'date' column.

        Bug fixes: the dead `Dataset()` assignment (immediately overwritten)
        was removed, and `pd.datetime` — removed from pandas — was replaced
        with the stdlib `datetime`.
        """
        from datetime import datetime  # pd.datetime no longer exists in pandas >= 1.0
        self.ts_matrix = pd.read_csv(
            file_path,
            header=0,
            lineterminator='\n',
            parse_dates=['date'],
            date_parser=lambda dates: datetime.strptime(
                dates, '%Y-%m-%d %H:%M:%S'))

    def set_pystarma_weight_matrices(self, file_path, file_names):
        """Read one CSV per entry of `file_names` and collect the weight matrices.

        `file_path` is unused and kept only for interface compatibility
        (TODO: confirm whether names should be joined onto it).

        Bug fixes relative to the original:
        - it iterated `range(file_names)` (a TypeError for a list of names)
          and then passed the loop variable to `read_csv`;
        - it appended to `self.wa_matrices` while it was still None;
        - `DataFrame.as_matrix()` and `pd.datetime` were removed from pandas.
        """
        from datetime import datetime
        if self.wa_matrices is None:
            self.wa_matrices = []
        for name in file_names:
            wa_order = pd.read_csv(
                name,
                header=0,
                lineterminator='\n',
                parse_dates=['time'],
                date_parser=lambda dates: datetime.strptime(
                    dates, '%Y-%m-%d %H:%M:%S'))
            self.wa_matrices.append(wa_order.values)  # .values replaces removed .as_matrix()

    def preprocess_pystarma_model_with_source_maker(self, source_maker):
        """Run stationarity diagnostics, difference the series, re-check, identify.

        `source_maker` is accepted for interface parity with the other model
        contents but is not used here.
        """
        # np.asarray replaces the removed DataFrame.as_matrix() and also
        # accepts the ndarray produced after differencing below.
        self._preprocessing(np.asarray(self.ts_matrix), self.wa_matrices,
                            self.max_t_lag, self.sample_size)

        # Make the time series stationary: log transform plus differencing at
        # lag 1 and lag 288 (presumably one day at 5-minute resolution — TODO confirm).
        diff = (1, 288)
        self.ts_matrix = np.log1p(self.ts_matrix)
        self.ts_matrix = set_stationary(self.ts_matrix, diff)

        # Re-run the diagnostics on the stationary series.
        self._preprocessing(np.asarray(self.ts_matrix), self.wa_matrices,
                            self.max_t_lag, self.sample_size)

        self._model_identification(self.ts_matrix,
                                   self.wa_matrices,
                                   max_lag=25)

    def _preprocessing(self,
                       ts_matrix,
                       wa_matrices=0,
                       max_lag=25,
                       sample_size=0.1):
        """Print an ADF stationarity summary and plot the (ST)ACF diagnostics."""
        stationary_test = self._preprocess(ts_matrix, max_lag, sample_size)

        adf_test = stationary_test['test_stat']
        print('Test Statistic:\t%s' % adf_test['Test_Statistic'])
        print('p-Value:\t%s' % adf_test['p-value'])
        print('#Lags_Used:\t%s' % adf_test['#Lags_Used'])
        print('Num_of_Obs:\t%s' % adf_test['Num_of_Obs'])
        print('Crit_Value_1:\t%s' % adf_test['Crit_Value_1%'])
        print('Crit_Value_5:\t%s' % adf_test['Crit_Value_5%'])
        print('Crit_Value_10:\t%s' % adf_test['Crit_Value_10%'])

        plt.subplot(121)
        plt.title('Autocorrelation Function')
        self._plot_acf(stationary_test['acf_mean'], max_lag, len(ts_matrix))

        plt.subplot(122)
        plt.title('Partial Autocorrelation Function')
        self._plot_acf(stationary_test['pacf_mean'], max_lag, len(ts_matrix))
        plt.show()

        if wa_matrices != 0:
            # Space-time ACF when spatial weights are available. (The unused
            # critical-value lists of the original were removed; _plot_stacf
            # computes its own bands.)
            stacf = Stacf(ts_matrix, wa_matrices, max_lag).estimate()
            self._plot_stacf(stacf, max_lag, len(ts_matrix))
            plt.title('Space-Time Autocorrelation Function')
            plt.show()

    def _preprocess(self, ts_matrix, max_lags=10, sample_size=0.1):
        """Sample a fraction of the series and average their ADF/ACF/PACF stats.

        Returns a dict with the 1/factor-weighted ADF statistics plus the mean
        and standard deviation of the ACF and PACF over the sampled series.

        Bug fixes relative to the original (written for Python 2):
        - the `range` step and the matrix column index used `/`, which under
          Python 3 produces floats (TypeError); integer math is used now;
        - the column index `i / factor` could exceed the allocated columns;
          the sampled series are now enumerated directly and bounded.
        """
        n_series = len(ts_matrix.T)
        factor = max(1, int(n_series * sample_size))  # number of sampled series
        acf_matrix = np.zeros((max_lags + 1, factor))
        pacf_matrix = np.zeros((max_lags + 1, factor))

        test_stat = {
            'Test_Statistic': 0.,
            'p-value': 0.,
            '#Lags_Used': 0.,
            'Num_of_Obs': 0.,
            'Crit_Value_1%': 0.,
            'Crit_Value_5%': 0.,
            'Crit_Value_10%': 0.
        }

        step = max(1, (n_series + 1) // factor)
        for col, i in enumerate(range(0, n_series, step)):
            if col >= factor:
                break  # never write past the pre-allocated columns
            timeserie = ts_matrix[:, i]
            acf_matrix[:, col], pacf_matrix[:, col] = estimate_acf_pacf(
                timeserie, max_lags)
            dfoutput = test_stationarity(timeserie, max_lags)
            # Accumulate a 1/factor-weighted average of every ADF statistic.
            for key in test_stat:
                test_stat[key] += dfoutput[key] * (1. / factor)

        return {
            "test_stat": test_stat,
            'acf_std': acf_matrix.std(1),
            'acf_mean': acf_matrix.mean(1),
            'pacf_std': pacf_matrix.std(1),
            'pacf_mean': pacf_matrix.mean(1)
        }

    def _model_identification(self, ts_matrix, wa_matrices, max_lag=25):
        """Plot the ST-ACF and ST-PACF used to identify the model orders.

        Bug fix: the original called `self.__plot_stacf`, which name-mangles
        to `_STARMA_Content__plot_stacf` — a method that does not exist; the
        helper is `_plot_stacf`. The unused critical-value lists were removed
        (the plot helper computes its own bands).
        """
        stacf = Stacf(ts_matrix, wa_matrices, max_lag).estimate()
        plt.subplot(211)
        self._plot_stacf(stacf, max_lag, len(ts_matrix))
        plt.title('Space-Time Autocorrelation Function')

        stpacf = Stpacf(ts_matrix, wa_matrices, max_lag).estimate()
        plt.subplot(212)
        self._plot_stacf(stpacf, max_lag, len(ts_matrix))
        plt.title('Space-Time Partial Autocorrelation Function')
        plt.show()

    def _plot_acf(self, acf, max_lag, ts_len):
        """Plot a (P)ACF curve with +/-1.96/sqrt(N - lag) confidence bands."""
        cv = [1.96 / np.sqrt(ts_len - lag) for lag in range(max_lag)]
        cvm = [-c for c in cv]

        plt.plot(acf)
        plt.plot(cv, linestyle='--', color='red')
        plt.plot(cvm, linestyle='--', color='red')
        plt.axhline(y=0, linestyle='--', color='gray')

    def _plot_stacf(self, stacf, max_lag, ts_len):
        """Plot every spatial order of a space-time (P)ACF with confidence bands."""
        cv = [1.96 / np.sqrt(ts_len - lag) for lag in range(max_lag)]
        cvm = [-c for c in cv]

        for order, series in enumerate(stacf.T):
            plt.plot(series, label=str(order) + '. Ordnung')

        plt.plot(stacf)
        plt.plot(cv, linestyle='--', color='red')
        plt.plot(cvm, linestyle='--', color='red')
        plt.axhline(y=0, linestyle='--', color='gray')
        plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    def create_pystarma_model_from_source_maker(self, source_maker):
        """Instantiate the pystarma model from the stored series and parameters.

        `source_maker` is accepted for interface parity but not used.
        """
        self.pystarma_model = self._create_pystarma_model(
            self.ts_matrix, self.wa_matrices, self.ar, self.ma, self.lags,
            self.iterations)

    def _create_pystarma_model(self,
                               ts_matrix,
                               wa_matrices,
                               ar=0,
                               ma=0,
                               lags='',
                               iterations=2):
        """Build a STARMA model, or a STARIMA model when differencing lags are given."""
        if lags == '':
            return STARMA(ar, ma, ts_matrix.copy(), wa_matrices, iterations)
        return STARIMA(ar, ma, lags, ts_matrix.copy(), wa_matrices, iterations)

    def fit_model(self):
        """Fit the previously created pystarma model.

        Bug fix: the original accidentally nested three helper methods inside
        this function body, making them unreachable as instance methods even
        though `make_prediction_dataset_object` calls one via `self`; they
        are now proper class-level methods below.
        """
        self.pystarma_model.fit()

    def _extract_sensors_currently_being_used_as_output_from_source_data(self):
        """Select the target sensors' columns from the time-series data.

        NOTE(review): the original read `self._ts_matrix`, an attribute that
        is never assigned in this class — the stored attribute is `ts_matrix`.
        It is a Dataset at construction time, hence the `.df` access; confirm
        against how `ts_matrix` is populated by the calling code.
        """
        self.subset_starma_array = self.ts_matrix.df[
            self.input_target_maker.get_target_sensor_idxs_list()].values

    def _target_time_offset_at_index(self, index):
        """Return the target time offset configured at `index`."""
        return self.input_target_maker.get_target_time_offsets_list()[index]

    def _max_input_time_offset(self):
        """Return the largest configured input time offset."""
        return max(self.input_target_maker.get_input_time_offsets_list())

    def make_prediction_dataset_object(self):
        """Build and return the prediction Dataset (rows x sensors*offsets)."""
        size = [
            self.num_target_rows,
            self.num_target_offsets * self.num_target_sensors
        ]
        array = np.zeros(size)
        self._extract_sensors_currently_being_used_as_output_from_source_data()
        # NOTE(review): this helper is not defined anywhere in this class —
        # presumably it lives on Model_Content or was lost; confirm.
        self._iterate_over_target_time_offsets_replicating_pystarma_model(
            array)
        self.prediction_dataset_object.set_numpy_array(array)
        return self.prediction_dataset_object
 def read_data_from_csv(self):
     """Load the CSV at `self.file_path_all_data` into `self.all_data` (index in column 0)."""
     self.all_data = Dataset()
     self.all_data.read_csv(self.file_path_all_data, index_col=0)
Exemplo n.º 10
0
	def set_average_data_from_csv_file_path(self,file_path):
		"""Load a precomputed average week from `file_path` into a fresh Dataset."""
		self.source_average_dataset_object = Dataset()
		self.source_average_dataset_object.read_csv(file_path,index_col=0)
Exemplo n.º 11
0
class Average_Week_Content(Model_Content):
	"""Baseline model that predicts each target timestamp from the historical average week."""

	def __init__(self):
		super(Average_Week_Content,self).__init__()
		self.source_average_dataset_object = Dataset() # contains all sensors
		self.prediction_dataset_object = Dataset()
		self.subset_avg_week_array = None # subset of sensors specified by target_sensor_idxs_list
		self.target_begin_weekday_int = None # weekday int of the first target timestamp
		self.num_target_rows = None
		self.num_target_sensors = None
		self.num_target_offsets = None
		self.input_target_maker = None
		self.weekday_int_source = None # optional override of the source's starting weekday

	def set_input_target_maker(self, input_target_maker, test = False):
		"""Attach the input/target maker and derive target dimensions from it.

		When `test` is True the parameter extraction is skipped.
		"""
		self.input_target_maker = input_target_maker
		self.num_target_sensors = len(input_target_maker.get_target_sensor_idxs_list())
		self.num_target_offsets = len(input_target_maker.get_target_time_offsets_list())
		if test is False:
			self._set_parameters_from_input_target_maker()
		else:
			logging.debug("Testing")

	def _set_parameters_from_input_target_maker(self):
		"""Derive the starting weekday/time ints and the target row count."""
		self.set_source_weekday_begin_int()
		self.num_target_rows = self.input_target_maker.get_target_dataset_object().get_number_rows()

	def set_source_weekday_begin_int(self):
		"""Record the weekday and time-of-day ints of the first target timestamp."""
		first_day_timestamp_string = self.input_target_maker.target_maker.dataset_object.df.index.values[0]
		self.target_begin_weekday_int = get_weekday_int_from_timestamp_string_with_format(first_day_timestamp_string, self.input_target_maker.time_format)
		self.target_begin_time_int = get_time_int_from_timestamp_string_with_format(first_day_timestamp_string, self.input_target_maker.time_format, time_interval_in_minutes = TIME_INTERVAL_IN_MINUTES)

	def set_average_data_from_csv_file_path(self, file_path):
		"""Load a precomputed average week from a CSV file."""
		self.source_average_dataset_object = Dataset()
		self.source_average_dataset_object.read_csv(file_path, index_col=0)

	def create_average_week_with_source_maker(self, source_maker):
		"""Compute the average week from source data, normalized to start on Monday.

		Averaging starts at the first midnight so day-of-week buckets line up;
		the weekday used for the Monday rearrangement therefore refers to that
		same starting row (the original inconsistently used row 0).
		"""
		source = source_maker.all_data
		idx = self._get_idx_first_midnight(source_maker)
		average_data = calculate_average_week_from_numpy_array(source.df.iloc[idx:source.df.shape[0], :].values)
		current_weekday_start_int = get_weekday_int_from_timestamp_string_with_format(source.df.index.values[idx], source_maker.time_format_train) if self.weekday_int_source is None else self.weekday_int_source
		average_data = rearrange_week_starting_to_start_on_monday_with_current_day_start_int(average_data, current_weekday_start_int)
		self.source_average_dataset_object.set_numpy_array(average_data)
		self.source_average_dataset_object.set_row_names(make_week_starting_on_monday_timestamps(weekday_begin_int = 0, time_interval_in_seconds=TIME_INTERVAL_IN_SECONDS))
		self.source_average_dataset_object.set_column_names(source.get_column_names())

	def _get_idx_first_midnight(self, source_maker):
		"""Return the row index of the first timestamp whose hour is 0.

		Bug fix: the original computed `idx` in the loop but then returned the
		constant 0, silently disabling the midnight alignment it was written for.
		"""
		idx = 0
		the_datetime = convert_string_to_datetime(source_maker.all_data.df.index.values[idx], source_maker.time_format_train)
		while the_datetime.hour != 0:
			idx = idx + 1
			the_datetime = convert_string_to_datetime(source_maker.all_data.df.index.values[idx], source_maker.time_format_train)
		return idx

	def make_prediction_dataset_object(self):
		"""Build and return the prediction Dataset (rows x sensors*offsets)."""
		size = [self.num_target_rows, self.num_target_offsets * self.num_target_sensors]
		array = np.zeros(size)
		self._extract_sensors_currently_being_used_as_output_from_source_data()
		self._iterate_over_target_time_offsets_replicating_average_week(array)
		self.prediction_dataset_object.set_numpy_array(array)
		return self.prediction_dataset_object

	def _extract_sensors_currently_being_used_as_output_from_source_data(self):
		"""Select the target sensors' columns from the averaged week."""
		self.subset_avg_week_array = self.source_average_dataset_object.df[self.input_target_maker.get_target_sensor_idxs_list()].values

	def _iterate_over_target_time_offsets_replicating_average_week(self, array):
		'''
			For each time offset:
				rearrange the average week to start at the desired time offset,
				then copy it repeatedly until the target number of rows is filled.
		'''
		for i in range(self.num_target_offsets):
			cur_time_offset = self._calculate_time_offset_in_min_relative_to_input_time_offsets(i)
			avg_week_starting_at_time = rearrange_week_to_start_at_time(self.subset_avg_week_array, cur_time_offset)
			self._fill_prediction_with_copies_of_avg_week_at_timeoffset_index(avg_week_starting_at_time, i, array)

	def _calculate_time_offset_in_min_relative_to_input_time_offsets(self, idx_current_time_offset):
		'''
			All time offsets are relative to time 0 of the input time offsets.
			So if the input time offsets are [0,5,10] and the target time offsets
			are [5,10,20], the effective target time offsets are [15,20,30].
			Average weeks are standardized to start on Monday and then rearranged
			to fit the target's starting day.
		'''
		current_time_offset = self._target_time_offset_at_index(idx_current_time_offset)
		weekday_begin_in_minutes = self.target_begin_weekday_int * LEN_DAY + self.target_begin_time_int
		return current_time_offset + weekday_begin_in_minutes + self._max_input_time_offset()

	def _fill_prediction_with_copies_of_avg_week_at_timeoffset_index(self, avg_week_starting_at_time, idx_current_time_offset, array):
		'''
			Iterate down the rows of the prediction array, filling the columns
			belonging to the current time offset with copies of the average week
			(already adjusted to start at that time offset).
		'''
		s_y = self.num_target_sensors * idx_current_time_offset # only the current offset's columns
		e_y = s_y + self.num_target_sensors
		n_copies_avg_week_to_make = int(self.num_target_rows / LEN_WEEK)
		for i in range(0, n_copies_avg_week_to_make):
			array[i * LEN_WEEK:(i + 1) * LEN_WEEK, s_y:e_y] = avg_week_starting_at_time
		# fill the final, partial week row by row
		i = 0
		ind = n_copies_avg_week_to_make * LEN_WEEK
		while ind < self.num_target_rows:
			array[ind, s_y:e_y] = avg_week_starting_at_time[i, :]
			ind = ind + 1
			i = i + 1

	def _target_time_offset_at_index(self, index):
		"""Return the target time offset configured at `index`."""
		return self.input_target_maker.get_target_time_offsets_list()[index]

	def _max_input_time_offset(self):
		"""Return the largest configured input time offset."""
		return max(self.input_target_maker.get_input_time_offsets_list())
Exemplo n.º 12
0
 def __init__(self):
     """Create empty Dataset holders for the predictions and the targets."""
     self.prediction_dataset_object = Dataset()
     self.target_dataset_object = Dataset()
Exemplo n.º 13
0
 def create_df_subset_with_idxs(self, row_range):
     """Return a new Dataset whose df contains only the rows selected by `row_range`."""
     d = Dataset()
     d.df = pd.DataFrame(self.all_data.df.iloc[row_range, :])
     return d