# Imports required by the snippets below; the fedot.* paths are assumed from
# FEDOT's package layout and may differ between versions.
import numpy as np
from sklearn.impute import SimpleImputer

from fedot.core.data.data import InputData
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import TaskTypesEnum
from fedot.core.operations.evaluation.operation_implementations.data_operations.sklearn_transformations import (
    ImputationImplementation, OneHotEncodingImplementation)


def _fill_remaining_gaps(data: InputData, operation_type: str):
    """ Fill the NaNs remaining in the feature table """
    # TODO discuss: move this "filling" into the chain method - it is used too often here (for all tables)
    # np.isnan(features).any() and np.isnan(features) don't work with non-numeric arrays
    features = data.features
    is_operation_not_for_text = operation_type != 'text_clean'
    if data.data_type == DataTypesEnum.table and is_operation_not_for_text:
        # Get the indices of columns that contain string objects
        categorical_ids, _ = OneHotEncodingImplementation.str_columns_check(features)

        # Apply the mean (numeric-only tables) or most_frequent (mixed tables) filling strategy
        if len(categorical_ids) == 0:
            data.features = SimpleImputer().fit_transform(features)
        else:
            data.features = SimpleImputer(strategy='most_frequent').fit_transform(features)
    return data
def _fill_remaining_gaps(data: InputData, operation_type: str):
    """ Fill the NaNs remaining in the feature table """
    # TODO discuss: move this "filling" into the pipeline method - it is used too often here (for all tables)
    # np.isnan(features).any() and np.isnan(features) don't work with non-numeric arrays
    features = data.features
    if data.data_type == DataTypesEnum.table and data.task.task_type != TaskTypesEnum.ts_forecasting:
        # Get the indices of columns that contain string objects
        categorical_ids, _ = OneHotEncodingImplementation.str_columns_check(features)

        # Apply the default or most_frequent filling strategy;
        # fit_transform returns an output object whose .predict field holds the imputed table
        if len(categorical_ids) == 0:
            data.features = ImputationImplementation().fit_transform(data).predict
        else:
            data.features = ImputationImplementation(strategy='most_frequent').fit_transform(data).predict
    return data
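# A hedged, standalone sketch (not part of the module) of why the gap filling
# above branches on categorical columns: sklearn's SimpleImputer with the
# default 'mean' strategy raises a ValueError on string data, while
# 'most_frequent' handles mixed object tables column-wise. The toy table here
# is hypothetical.
def _demo_mixed_table_imputation():
    import numpy as np
    from sklearn.impute import SimpleImputer

    mixed_table = np.array([['a', 1.0],
                            [np.nan, 2.0],
                            ['b', np.nan],
                            ['a', 2.0]], dtype=object)
    # SimpleImputer().fit_transform(mixed_table) would raise here (mean of strings)
    filled = SimpleImputer(strategy='most_frequent').fit_transform(mixed_table)
    print(filled)  # [['a' 1.0] ['a' 2.0] ['b' 2.0] ['a' 2.0]]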
def _preprocess(self, data: InputData):
    preprocessing_func = preprocessing_func_for_data(data, self)

    if not self.cache.actual_cached_state:
        # fitted preprocessor was not found in the cache
        preprocessing_strategy = preprocessing_func().fit(data.features)
    else:
        # a fitted preprocessor already exists
        preprocessing_strategy = self.cache.actual_cached_state.preprocessor

    data.features = preprocessing_strategy.apply(data.features)
    return data, preprocessing_strategy
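# A hedged sketch of the fit-or-reuse pattern in _preprocess, with hypothetical
# names: sklearn's StandardScaler stands in for the preprocessing strategy and
# its transform() for the strategy's apply(). A preprocessor is fitted only when
# no cached state exists; otherwise the previously fitted instance is applied,
# so repeated calls skip redundant fitting.
from dataclasses import dataclass

from sklearn.preprocessing import StandardScaler


@dataclass
class _CachedState:  # hypothetical stand-in for cache.actual_cached_state
    preprocessor: StandardScaler


def preprocess_with_cache(features, cached_state=None):
    if cached_state is None:
        # no fitted preprocessor in the cache - fit a new one
        scaler = StandardScaler().fit(features)
        cached_state = _CachedState(preprocessor=scaler)
    else:
        # reuse the already fitted preprocessor
        scaler = cached_state.preprocessor
    return scaler.transform(features), cached_state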
def _prepare_exog_features(data_for_prediction: InputData,
                           exog_data: InputData,
                           last_prediction: np.ndarray,
                           forecast_step: int,
                           forecast_length: int) -> InputData:
    new_features = []
    if len(data_for_prediction.features.shape) == 1:
        # a single exogenous feature
        exog_features_num = 1
    else:
        # several exogenous features
        exog_features_num = data_for_prediction.features.shape[1]

    if exog_features_num > 1:
        for exog_feat_id in range(exog_features_num):
            exog_feature = data_for_prediction.features[:, exog_feat_id]
            # take the exogenous values for the current forecast window
            new_exog_values = exog_data.features[forecast_step * forecast_length:
                                                 (forecast_step + 1) * forecast_length,
                                                 exog_feat_id]
            new_feature = np.append(exog_feature, new_exog_values)
            new_features.append(new_feature)
        new_part_len = len(new_features[0])
    else:
        exog_feature = data_for_prediction.features
        new_exog_values = exog_data.features[forecast_step * forecast_length:
                                             (forecast_step + 1) * forecast_length]
        new_features = np.append(exog_feature, new_exog_values)
        new_part_len = len(new_features)

    # append the just-predicted time series values to the target for the next prediction
    predicted_ts = np.append(data_for_prediction.target, last_prediction)
    # cut the prediction if it is too long (relevant for the last forecast step)
    predicted_ts = predicted_ts[0:new_part_len]
    # new_features.append(predicted_ts)
    data_for_prediction.target = predicted_ts
    data_for_prediction.features = np.stack(np.asarray(new_features)).T
    return data_for_prediction
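# A hedged sketch of the window arithmetic used in _prepare_exog_features, on
# hypothetical toy data: at forecast step k the slice
# [k * forecast_length : (k + 1) * forecast_length] selects the next block of
# exogenous values, which is appended to the already known history.
import numpy as np

forecast_length = 3
exog_future = np.arange(100, 112)    # 12 future exogenous values
known_exog = np.array([1, 2, 3])     # exogenous history known before forecasting

for forecast_step in range(2):
    window = exog_future[forecast_step * forecast_length:
                         (forecast_step + 1) * forecast_length]
    known_exog = np.append(known_exog, window)

print(known_exog)  # [  1   2   3 100 101 102 103 104 105]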