import configparser
import copy
import datetime

import numpy as np
import pandas as pd

# Project-internal helpers (PreparationHelper, TrainHelper, MixedHelper, FeatureAdder,
# StatisticalFeatures) are assumed to be imported from the surrounding package.


def load_datasets(config: configparser.ConfigParser, target_column: str) -> list:
    """
    Load datasets according to info specified in config file
    :param config: config with dataset specific info
    :param target_column: target_column for prediction
    :return: list of datasets to use for optimization
    """
    datasets_lst = list()
    # load and name raw dataset
    dataset_raw = pd.read_csv(config['General']['base_dir'] + 'Data/' + config[target_column]['dataset_raw']
                              + '.csv', sep=';', decimal=',', index_col=0)
    # parse index dates, trying the German format first and the ISO format as fallback
    try:
        dataset_raw.index = pd.to_datetime(dataset_raw.index, format='%d.%m.%Y')
    except ValueError:
        dataset_raw.index = pd.to_datetime(dataset_raw.index, format='%Y-%m-%d')
    dataset_raw = dataset_raw.asfreq('D')
    dataset_raw.name = config[target_column]['dataset_raw']
    datasets_lst.append(dataset_raw)
    # split dataset at before_break_date
    if 'before_break_date' in config[target_column]:
        dataset_before_break = dataset_raw.copy()
        dataset_before_break.name = dataset_raw.name + '_before_break'
        before_break_date = datetime.datetime.strptime(config[target_column]['before_break_date'],
                                                       '%Y-%m-%d').date()
        PreparationHelper.drop_rows_by_dates(df=dataset_before_break, start=before_break_date,
                                             end=dataset_before_break.index[-1])
        datasets_lst.append(dataset_before_break)
    return datasets_lst
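# Hypothetical usage sketch for load_datasets(): the section and key names mirror
# what the function reads, but base_dir, the CSV name and the section name
# 'TargetSales' are made-up assumptions, and 'before_break_date' is optional.
config = configparser.ConfigParser()
config['General'] = {'base_dir': '/path/to/project/'}
config['TargetSales'] = {'dataset_raw': 'sales_daily',          # expects Data/sales_daily.csv
                         'before_break_date': '2020-03-01'}     # adds a *_before_break split
datasets = load_datasets(config=config, target_column='TargetSales')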
def insample(self, train: pd.DataFrame) -> pd.DataFrame:
    """
    Deliver (back-transformed) insample predictions
    :param train: train set
    :return: DataFrame with insample predictions
    """
    train_exog = None
    if self.use_exog:
        train_exog = train.drop(labels=[self.target_column], axis=1)
        PreparationHelper.drop_columns(train_exog, self.exog_cols_dropped)
        train_exog = train_exog.to_numpy(dtype=float)
    insample = pd.DataFrame(data=self.model.predict_in_sample(exogenous=train_exog),
                            index=train.index, columns=['Insample'])
    if self.power_transformer is not None:
        insample = pd.DataFrame(data=self.power_transformer.inverse_transform(
            insample['Insample'].values.reshape(-1, 1)),
            index=insample.index, columns=['Insample'])
    if self.log:
        if 0 in train[self.target_column].values:
            # train set contains zeros, so log(x+1) was used: invert with exp(y)-1
            self.contains_zeros = True
            insample = np.exp(insample) - 1
        else:
            insample = np.exp(insample)
    return insample
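# Hypothetical usage sketch: `model` is an already fitted instance of the
# (S)ARIMA(X) wrapper these methods belong to, and `train_df` is its train set.
insample_preds = model.insample(train=train_df)
residuals = train_df[model.target_column] - insample_preds['Insample']
print(residuals.abs().mean())  # in-sample MAE as a quick sanity check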
def train(self, train: pd.DataFrame, cross_val_call: bool = False) -> dict:
    """
    Train (S)ARIMA(X) model
    :param train: train set
    :param cross_val_call: called to perform cross validation
    :return: dictionary with cross validated scores (if specified)
    """
    cross_val_score_dict = {}
    if cross_val_call:
        cross_val_score_dict, self.model = self.get_cross_val_score(train=train)
    train_exog = None
    if (self.power_transformer is not None) or self.log:
        train = TrainHelper.get_transformed_set(dataset=train, target_column=self.target_column,
                                                power_transformer=self.power_transformer, log=self.log)
    if self.use_exog:
        train_exog = train.drop(labels=[self.target_column], axis=1)
        # remember exogenous columns containing NaNs so predict() can drop them as well
        self.exog_cols_dropped = train_exog.columns[train_exog.isna().any()].tolist()
        PreparationHelper.drop_columns(train_exog, self.exog_cols_dropped)
        train_exog = train_exog.to_numpy(dtype=float)
    self.model.fit(y=train[self.target_column], exogenous=train_exog, trend=self.trend)
    return cross_val_score_dict
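# Hypothetical usage sketch: fit the wrapper and request cross-validated scores
# in the same call; the dictionary keys depend on get_cross_val_score().
scores = model.train(train=train_df, cross_val_call=True)
print(scores)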
def add_public_holiday_counters(dataset: pd.DataFrame, event_lags: list, special_days: list):
    """
    Function adding counters for upcoming or past public holidays (according to event_lags)
    with own counters for those specified in special_days
    :param dataset: dataset for adding features
    :param event_lags: lags before and after holiday to add
    :param special_days: list of days with their own counter as feature
    """
    for index, row in dataset.iterrows():
        holiday = row['public_holiday']
        if holiday != 'no':
            for lag in event_lags:
                if (index + pd.Timedelta(days=lag)) in dataset.index:
                    dataset.at[index + pd.Timedelta(days=lag), 'cal_PublicHoliday_Counter'] = -lag
                    if holiday in special_days:
                        dataset.at[index + pd.Timedelta(days=lag), 'cal_' + holiday + '_Counter'] = -lag
    PreparationHelper.drop_columns(df=dataset, columns=['public_holiday'])
    # fill counter columns with the dummy value 99 where no holiday is in reach
    counter_cols = [col for col in dataset.columns if 'Counter' in col]
    dataset[counter_cols] = dataset[counter_cols].fillna(value=99)
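# Minimal sketch of the counter semantics (dates and holiday name are made up):
# for a holiday on 2021-05-01 and event_lags=[-2, -1, 0, 1], the surrounding rows
# get cal_PublicHoliday_Counter = 2, 1, 0, -1, i.e. positive values count down to
# the holiday and negative values count the days since it; rows without any
# holiday in reach end up with the fill value 99.
idx = pd.date_range('2021-04-28', periods=5, freq='D')
df = pd.DataFrame({'public_holiday': ['no', 'no', 'no', 'Tag der Arbeit', 'no']}, index=idx)
add_public_holiday_counters(dataset=df, event_lags=[-2, -1, 0, 1], special_days=[])
print(df)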
def get_ready_train_test_lst(dataset: pd.DataFrame, config: configparser.ConfigParser, init_train_len: int,
                             test_len: int, split_perc: float, imputation: str, target_column: str,
                             reset_index: bool = False, dimensionality_reduction: str = None,
                             featureset: str = 'full') -> list:
    """
    Function preparing train and test sets for training based on raw dataset:
    - Missing value imputation
    - Feature extraction
    (- Resampling if specified)
    - Deletion of non-target sales columns
    - Split into train and test set(s)
    :param dataset: dataset with raw samples
    :param config: config with dataset specific info
    :param init_train_len: length of first train set
    :param test_len: usual length of test set (could be shorter for last test set)
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method to use
    :param target_column: target_column used for predictions
    :param reset_index: reset_index of dataset (relevant for Exponential Smoothing)
    :param dimensionality_reduction: perform dimensionality reduction
    :param featureset: featureset to use ('full', 'cal', 'stat', 'none')
    :return: list with train and test set(s)
    """
    print('##### Preparing Train and Test Sets #####')
    # get dataset specific parameters
    seasonal_periods = config[target_column].getint('seasonal_periods')
    features_for_stats = config[target_column]['features_for_stats'].replace(" ", "").split(',')
    resample_weekly = config[target_column].getboolean('resample_weekly')
    possible_target_cols = config[target_column]['possible_target_cols'].replace(" ", "").split(',')
    cols_to_condense, condensed_col_name = None, None
    # use stat and cal features according to specified featureset
    stat_features = True
    cal_features = True
    if featureset == 'none':
        stat_features = False
        cal_features = False
    elif featureset == 'cal':
        stat_features = False
    elif featureset == 'stat':
        cal_features = False
    if 'cols_to_condense' in config[target_column]:
        cols_to_condense = config[target_column]['cols_to_condense'].replace(" ", "").split(',')
        condensed_col_name = config[target_column]['condensed_col_name']
    # load train and test set with missing values
    train_test_list_mv = get_train_test_lst(dataset=dataset, init_train_len=init_train_len,
                                            test_len=test_len, split_perc=split_perc)
    train_test_list = list()
    counter_list_tuple = 0
    for train_mv, test_mv in train_test_list_mv:
        # impute dataset according to fitting on train set with missing values
        if imputation is not None:
            dataset_imputed, _, _ = impute_dataset_train_test(imputation=imputation, dataset=dataset,
                                                              train=train_mv, test=test_mv)
        else:
            dataset_imputed = dataset.copy()
        MixedHelper.set_dtypes(df=dataset_imputed, cols_to_str=['public_holiday', 'school_holiday'])
        # feature extraction on imputed dataset
        if resample_weekly:
            # stat features after resampling, if resampling is done, to avoid information leak due to resampling
            FeatureAdder.add_features(dataset=dataset_imputed, cols_to_condense=cols_to_condense,
                                      condensed_col_name=condensed_col_name,
                                      use_stat_features=False, use_calendar_features=cal_features)
        else:
            FeatureAdder.add_features(dataset=dataset_imputed, cols_to_condense=cols_to_condense,
                                      condensed_col_name=condensed_col_name, seasonal_periods=seasonal_periods,
                                      features_for_stats=features_for_stats,
                                      use_stat_features=stat_features, use_calendar_features=cal_features)
        dataset_feat = PreparationHelper.get_one_hot_encoded_df(
            df=dataset_imputed,
            columns_to_encode=list(set(dataset_imputed.columns).intersection(
                ['public_holiday', 'school_holiday'])))
        dataset_feat.dropna(subset=[target_column], inplace=True)
        # resample if specified
        if resample_weekly:
            dataset_feat = dataset_feat.resample('W').apply(
                lambda x: PreparationHelper.custom_resampler(arraylike=x, summation_cols=possible_target_cols))
            if 'cal_date_weekday' in dataset_feat.columns:
                PreparationHelper.drop_columns(df=dataset_feat, columns=['cal_date_weekday'])
            # drop rows added due to resampling of quarter dataset
            dataset_feat.dropna(inplace=True)
            init_train_len = int(train_test_list_mv[0][0].shape[0] / 7)
            test_len = int(train_test_list_mv[0][1].shape[0] / 7)
            seasonal_periods = int(seasonal_periods / 7)
            if stat_features:
                FeatureAdder.add_features(dataset=dataset_feat, seasonal_periods=seasonal_periods,
                                          features_for_stats=features_for_stats, use_calendar_features=False,
                                          with_weekday_stats=False, lags=[1, 4],
                                          windowsize_rolling=4, windowsize_rolling_seas=4)
                StatisticalFeatures.add_rolling_statistics_features(dataset=dataset_feat, windowsize=2,
                                                                    features=features_for_stats)
                StatisticalFeatures.add_rolling_seasonal_statistics_features(dataset=dataset_feat, windowsize=2,
                                                                             features=features_for_stats,
                                                                             seasonal_periods=seasonal_periods)
        # drop non-target columns
        cols_to_drop = possible_target_cols.copy()
        cols_to_drop.remove(target_column)
        PreparationHelper.drop_columns(df=dataset_feat, columns=cols_to_drop)
        # split into train and test set(s)
        if reset_index:
            dataset_feat.reset_index(drop=True, inplace=True)
        train_test_list_feat = get_train_test_lst(dataset=dataset_feat, init_train_len=init_train_len,
                                                  test_len=test_len, split_perc=split_perc)
        # impute missing values after adding statistical features (e.g. due to lagged features)
        if imputation is not None:
            _, train_feat_imp, test_feat_imp = impute_dataset_train_test(
                imputation=imputation, train=train_test_list_feat[counter_list_tuple][0],
                test=train_test_list_feat[counter_list_tuple][1])
        else:
            train_feat_imp = train_test_list_feat[counter_list_tuple][0]
            test_feat_imp = train_test_list_feat[counter_list_tuple][1]
        # perform dimensionality reduction if specified
        if not train_feat_imp.isna().any().any() and dimensionality_reduction == 'pca':
            train_feat_imp, test_feat_imp = pca_transform_train_test(train=train_feat_imp, test=test_feat_imp,
                                                                     target_column=target_column)
        train_test_list.append((train_feat_imp, test_feat_imp))
        if len(train_test_list_feat) > 1:
            # special treatment for time series split with multiple train test pairs in train_test_list:
            # first iteration of for loop: imputation based on train_1, second iteration: imputation based on
            # train_2. In both cases multiple (train, test) pairs are created based on the imputed dataset,
            # but only the one related to the set used for imputation shall be kept.
            counter_list_tuple += 1
    print('##### Prepared Train and Test Sets #####')
    return train_test_list
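# Hypothetical usage sketch: the lengths, split percentage and imputation method
# name ('mean') are assumptions; the available methods depend on
# impute_dataset_train_test().
train_test_pairs = get_ready_train_test_lst(dataset=dataset_raw, config=config,
                                            init_train_len=365, test_len=28, split_perc=0.8,
                                            imputation='mean', target_column='TargetSales',
                                            featureset='full')
train_set, test_set = train_test_pairs[0]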
def load_datasets(config: configparser.ConfigParser, company: str, target_column: str) -> list:
    """
    Load datasets according to info specified in config file
    :param config: config with dataset specific info
    :param company: name of the company related to the dataset
    :param target_column: target_column for prediction
    :return: list of datasets to use for optimization
    """
    datasets_lst = list()
    # load and name raw dataset, falling back to '.' as decimal separator if parsing with ',' fails
    try:
        dataset_raw = pd.read_csv(config['General']['base_dir'] + 'Data/' + config[target_column]['dataset_raw']
                                  + '.csv', sep=';', decimal=',', index_col=0)
    except Exception:
        dataset_raw = pd.read_csv(config['General']['base_dir'] + 'Data/' + config[target_column]['dataset_raw']
                                  + '.csv', sep=';', decimal='.', index_col=0)
    if isinstance(dataset_raw.index[0], str):
        if '.' in dataset_raw.index[0]:
            dataset_raw.index = pd.to_datetime(dataset_raw.index, format='%d.%m.%Y')
        elif '-' in dataset_raw.index[0]:
            dataset_raw.index = pd.to_datetime(dataset_raw.index, format='%Y-%m-%d')
    # drop columns from raw dataset if not needed
    if 'raw_cols_to_drop' in config[target_column]:
        PreparationHelper.drop_columns(
            df=dataset_raw, columns=config[target_column]['raw_cols_to_drop'].replace(" ", "").split(','))
    PreparationHelper.drop_columns(df=dataset_raw,
                                   columns=[col for col in dataset_raw.columns if 'Unnamed' in col])
    # drop samples after start_date_to_drop if target_column is not recorded for whole dataset
    if 'start_date_to_drop' in config[target_column]:
        start_date_to_drop = datetime.datetime.strptime(config[target_column]['start_date_to_drop'],
                                                        '%Y-%m-%d').date()
        PreparationHelper.drop_rows_by_dates(df=dataset_raw, start=start_date_to_drop,
                                             end=dataset_raw.index[-1])
    # dataset specific preprocessing
    if target_column in ['milk', 'beer', 'usdeaths', 'VisitorNights']:
        # convert comma decimal separators to dots and cast object columns to float
        dataset_raw = dataset_raw.apply(
            lambda x: x.str.replace(',', '.').astype(float) if x.dtype == object else x)
    elif target_column == 'maunaloa_monthly':
        dataset_raw = dataset_raw.resample('M').apply(
            lambda x: PreparationHelper.custom_resampler(arraylike=x, summation_cols=[]))
    dataset_raw.name = company + config[target_column]['dataset_raw']
    datasets_lst.append(dataset_raw)
    # split dataset at before_break_date
    if 'before_break_date' in config[target_column]:
        dataset_before_break = dataset_raw.copy()
        dataset_before_break.name = dataset_raw.name + '_before_break'
        before_break_date = datetime.datetime.strptime(config[target_column]['before_break_date'],
                                                       '%Y-%m-%d').date()
        PreparationHelper.drop_rows_by_dates(df=dataset_before_break, start=before_break_date,
                                             end=dataset_before_break.index[-1])
        datasets_lst.append(dataset_before_break)
    return datasets_lst
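# Hypothetical call for one of the public benchmark series handled above:
# 'milk' triggers the comma-to-dot conversion branch, and `company` only prefixes
# the dataset name. Config entries are assumed to exist as in the sketch after
# the first load_datasets() variant.
datasets = load_datasets(config=config, company='demo_', target_column='milk')
for ds in datasets:
    print(ds.name, ds.shape)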
def predict(self, test: pd.DataFrame, train: pd.DataFrame) -> pd.DataFrame:
    """
    Deliver (back-transformed), if specified one step ahead, out-of-sample predictions
    :param test: test set
    :param train: train set
    :return: DataFrame with predictions, upper and lower confidence level
    """
    test_exog = None
    if (self.power_transformer is not None) or self.log:
        test = TrainHelper.get_transformed_set(dataset=test, target_column=self.target_column,
                                               power_transformer=self.power_transformer, log=self.log,
                                               only_transform=True)
    if self.use_exog:
        test_exog = test.drop(labels=[self.target_column], axis=1)
        PreparationHelper.drop_columns(test_exog, self.exog_cols_dropped)
        test_exog = test_exog.to_numpy(dtype=float)
    if self.one_step_ahead:
        predict = []
        conf_low = []
        conf_up = []
        # deep copy model as predict function should not change class model
        model = copy.deepcopy(self.model)
        for i in range(0, test.shape[0]):
            if self.use_exog:
                fc, conf = model.predict(n_periods=1, exogenous=pd.DataFrame(test_exog[i].reshape(1, -1)),
                                         return_conf_int=True, alpha=0.05)
                model.update(test[self.target_column][i],
                             exogenous=pd.DataFrame(test_exog[i].reshape(1, -1)))
            else:
                fc, conf = model.predict(n_periods=1, return_conf_int=True, alpha=0.05)
                model.update(test[self.target_column][i])
            predict.append(fc[0])
            conf_low.append(conf[0][0])
            conf_up.append(conf[0][1])
    else:
        predict, conf = self.model.predict(n_periods=test.shape[0], exogenous=test_exog,
                                           return_conf_int=True, alpha=0.05)
        conf_low = conf[:, 0]
        conf_up = conf[:, 1]
    predictions = pd.DataFrame({'Prediction': predict, 'LowerConf': conf_low, 'UpperConf': conf_up},
                               index=test.index)
    if self.power_transformer is not None:
        predictions = pd.DataFrame(
            {'Prediction': self.power_transformer.inverse_transform(
                predictions['Prediction'].values.reshape(-1, 1)).flatten(),
             'LowerConf': self.power_transformer.inverse_transform(
                 predictions['LowerConf'].values.reshape(-1, 1)).flatten(),
             'UpperConf': self.power_transformer.inverse_transform(
                 predictions['UpperConf'].values.reshape(-1, 1)).flatten()},
            index=predictions.index)
    if self.log:
        predict_backtr = np.exp(predictions['Prediction'])
        if self.contains_zeros:
            # train set contained zeros, so log(x+1) was used: the inverse is exp(y)-1,
            # mirroring the back-transformation in insample()
            predict_backtr -= 1
        # scale the confidence bounds relative to the back-transformed prediction
        lower_dist = ((predictions['Prediction'] - predictions['LowerConf'])
                      / predictions['Prediction']) * predict_backtr
        upper_dist = ((predictions['UpperConf'] - predictions['Prediction'])
                      / predictions['Prediction']) * predict_backtr
        predictions = pd.DataFrame({'Prediction': predict_backtr,
                                    'LowerConf': predict_backtr - lower_dist,
                                    'UpperConf': predict_backtr + upper_dist},
                                   index=predictions.index)
    return predictions
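# Hypothetical usage sketch: check how often the true test values fall inside the
# returned 95% confidence band; `model`, `train_df` and `test_df` are the fitted
# wrapper and its data splits from the earlier sketches.
preds = model.predict(test=test_df, train=train_df)
inside = ((test_df[model.target_column] >= preds['LowerConf'])
          & (test_df[model.target_column] <= preds['UpperConf']))
print(f'95% interval coverage: {inside.mean():.2%}')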
def add_features(dataset: pd.DataFrame, cols_to_condense: list = None, condensed_col_name: str = None,
                 seasonal_periods: int = 0, features_for_stats: list = None,
                 use_calendar_features: bool = True, use_stat_features: bool = True,
                 event_lags: list = None, special_days: list = None, lags: list = None,
                 windowsize_rolling: int = 7, windowsize_rolling_seas: int = 7,
                 windowsize_rolling_weekday: int = 4, with_weekday_stats: bool = True):
    """
    Function adding all specified features to dataset
    :param dataset: dataset used for adding features
    :param cols_to_condense: cols which should be condensed to one column
    :param condensed_col_name: name of condensed column
    :param seasonal_periods: seasonality used for seasonal-based features
    :param features_for_stats: features used for calculating statistical features
    :param use_calendar_features: specify if calendar features should be added
    :param use_stat_features: specify if statistical features should be added
    :param event_lags: lags for event counter features
    :param special_days: days with their own event counter
    :param lags: lags to use for lagged sales numbers
    :param windowsize_rolling: windowsize used for rolling statistics
    :param windowsize_rolling_seas: windowsize used for rolling seasonal statistics
    :param windowsize_rolling_weekday: windowsize used for rolling statistics for each weekday
    :param with_weekday_stats: specify if weekday specific stats should be added
    """
    # set list defaults inside the function to avoid mutable default arguments
    if event_lags is None:
        event_lags = [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3]
    if special_days is None:
        special_days = ['Valentine', 'MothersDay', 'Karfreitag']
    if lags is None:
        lags = [1, 2, 3, 4, 5, 6, 7]
    print('---Starting to add features---')
    if cols_to_condense is not None and condensed_col_name is not None:
        dataset[condensed_col_name] = 0
        for col in cols_to_condense:
            dataset[condensed_col_name] += dataset[col]
        PreparationHelper.drop_columns(df=dataset, columns=cols_to_condense)
    if use_calendar_features:
        print('---Adding calendar features---')
        add_calendar_features(dataset=dataset, event_lags=event_lags, special_days=special_days)
    if use_stat_features:
        print('---Adding statistical features---')
        add_statistical_features(dataset=dataset, seasonal_periods=seasonal_periods,
                                 features_for_stats=features_for_stats, lags=lags,
                                 windowsize_rolling=windowsize_rolling,
                                 windowsize_rolling_seas=windowsize_rolling_seas,
                                 windowsize_rolling_weekday=windowsize_rolling_weekday,
                                 with_weekday_stats=with_weekday_stats)
    print('---Features added---')
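# Hypothetical usage sketch on a daily sales DataFrame: the column names
# 'sales_a', 'sales_b' and 'sales_total' are made up for illustration.
add_features(dataset=df, cols_to_condense=['sales_a', 'sales_b'],
             condensed_col_name='sales_total', seasonal_periods=7,
             features_for_stats=['sales_total'])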