def _fit_async(X_path, grp_hash, tmp_folder):
    """
    Fits a FB Prophet model for a particular time group
    :param X_path: Path to the data used to fit the FB Prophet model
    :param grp_hash: Time group identifier
    :return: time group identifier and path to the pickled model
    """
    np.random.seed(1234)
    random.seed(1234)
    X = load_obj(X_path)
    # Commented for performance, uncomment for debug
    # print("prophet - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash))
    if X.shape[0] < 20:
        # print("prophet - small data work-around for group: %s" % grp_hash)
        return grp_hash, None
    # Import FB Prophet package
    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, "Prophet")
    model = Prophet()
    with suppress_stdout_stderr():
        model.fit(X[['ds', 'y']])
    model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)  # remove to indicate success
    return grp_hash, model_path
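# The helper `suppress_stdout_stderr` used above comes from the recipe utilities; the
# following is a minimal stand-in (an assumption, not the exact implementation) that
# silences noisy Prophet/ARIMA fitting output by redirecting both streams to os.devnull.
import os
import sys
from contextlib import contextmanager

@contextmanager
def suppress_stdout_stderr():
    with open(os.devnull, "w") as devnull:
        old_out, old_err = sys.stdout, sys.stderr
        sys.stdout, sys.stderr = devnull, devnull
        try:
            yield
        finally:
            sys.stdout, sys.stderr = old_out, old_err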
def _fit_async(X_path, grp_hash, time_column, tmp_folder):
    """
    Fits an ARIMA model for a particular time group
    :param X_path: Path to the data used to fit the ARIMA model
    :param grp_hash: Time group identifier
    :param time_column: Name of the time column in the input data
    :return: time group identifier and path to the pickled model
    """
    np.random.seed(1234)
    random.seed(1234)
    X = load_obj(X_path)
    pm = importlib.import_module('pmdarima')
    with suppress_stdout_stderr():
        try:
            order = np.argsort(X[time_column])
            model = pm.auto_arima(X['y'].values[order], error_action='ignore')
        except Exception:
            model = None
    model_path = os.path.join(tmp_folder, "autoarima_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)  # remove to indicate success
    return grp_hash, model_path
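# Hedged usage sketch (toy series, not recipe data) of the pmdarima calls relied on
# above and in the matching _transform_async: predict_in_sample() covers the training
# window, predict(n_periods=k) forecasts k steps past it.
import numpy as np
import pmdarima as pm

y_toy = np.sin(np.linspace(0, 20, 60))
arima = pm.auto_arima(y_toy, error_action='ignore')
in_sample = arima.predict_in_sample()   # same length as the training series
future = arima.predict(n_periods=5)     # next 5 steps after the series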
def _transform_async(model_path, X_path, nan_value, has_is_train_attr, time_column):
    model = load_obj(model_path)
    XX_path = os.path.join(temporary_files_path, "autoarima_XXt" + str(uuid.uuid4()))
    X = load_obj(X_path)
    # ARIMA returns the predictions ordered by time
    # So we should keep track of the time order for each group so that
    # predictions are ordered the same as the input frame
    # Keep track of the order
    order = np.argsort(X[time_column])
    if model is not None:
        yhat = model.predict_in_sample() \
            if has_is_train_attr else model.predict(n_periods=X.shape[0])
        yhat = yhat[order]
        XX = pd.DataFrame(yhat, columns=['yhat'])
    else:
        XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])  # invalid model
    # Sync index
    XX.index = X.index
    assert XX.shape[1] == 1
    save_obj(XX, XX_path)
    remove(model_path)  # indicates success, no longer need
    remove(X_path)  # indicates success, no longer need
    return XX_path
def _transform_async(model_path, X_path, nan_value, tmp_folder):
    """
    Predicts target for a particular time group
    :param model_path: path to the stored model
    :param X_path: Path to the data used to fit the FB Prophet model
    :param nan_value: Value of target prior, used when no fitted model has been found
    :return: self
    """
    model = load_obj(model_path)
    XX_path = os.path.join(tmp_folder, "fbprophet_XX" + str(uuid.uuid4()))
    X = load_obj(X_path)
    # Facebook Prophet returns the predictions ordered by time
    # So we should keep track of the time order for each group so that
    # predictions are ordered the same as the input frame
    # Keep track of the order
    order = np.argsort(pd.to_datetime(X["ds"]))
    if model is not None:
        # Run prophet
        yhat = model.predict(X)['yhat'].values
        XX = pd.DataFrame(yhat, columns=['yhat'])
    else:
        XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])  # invalid models
    XX.index = X.index[order]
    assert XX.shape[1] == 1
    save_obj(XX, XX_path)
    remove(model_path)  # indicates success, no longer need
    remove(X_path)  # indicates success, no longer need
    return XX_path
def _transform_async(model_path, X_path, nan_value, tmp_folder):
    """
    Predicts target for a particular time group
    :param model_path: path to the stored model
    :param X_path: Path to the data used to fit the FB Prophet model
    :param nan_value: Value of target prior, used when no fitted model has been found
    :return: self
    """
    model = load_obj(model_path)
    XX_path = os.path.join(tmp_folder, "fbprophet_XX" + str(uuid.uuid4()))
    X = load_obj(X_path)
    X_time = X[['ds']].groupby('ds').first().reset_index()
    with suppress_stdout_stderr():
        y_avg = model.predict(X_time)[['ds', 'yhat']]
    # Prophet transforms the date column to datetime so we need to transform that to merge back
    X_time.sort_values('ds', inplace=True)
    X_time['yhat'] = y_avg['yhat']
    X_time.sort_index(inplace=True)
    # Merge back into original frame on 'ds'
    # pd.merge wipes the index ... so keep it to provide it again
    indices = X.index
    X = pd.merge(left=X, right=X_time[['ds', 'yhat']], on='ds', how='left')
    X.index = indices
    save_obj(X[['yhat']], XX_path)
    remove(model_path)  # indicates success, no longer need
    remove(X_path)  # indicates success, no longer need
    return XX_path
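# Toy illustration (hypothetical frames) of the index-preserving merge used above:
# pd.merge returns a fresh RangeIndex, so the original index is stashed and restored.
import pandas as pd

left = pd.DataFrame({"ds": ["2020-01-02", "2020-01-01"]}, index=[10, 11])
right = pd.DataFrame({"ds": ["2020-01-01", "2020-01-02"], "yhat": [1.0, 2.0]})
indices = left.index
merged = pd.merge(left=left, right=right, on="ds", how="left")
merged.index = indices
print(merged)  # rows keep their original 10/11 index labels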
def _fit_async(X_path, grp_hash, tmp_folder, params):
    """
    Fits a FB Prophet model for a particular time group
    :param X_path: Path to the data used to fit the FB Prophet model
    :param grp_hash: Time group identifier
    :return: time group identifier and path to the pickled model
    """
    np.random.seed(1234)
    random.seed(1234)
    X = load_obj(X_path)
    # Commented for performance, uncomment for debug
    # print("prophet - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash))
    if X.shape[0] < 20:
        # print("prophet - small data work-around for group: %s" % grp_hash)
        return grp_hash, None
    # Import FB Prophet package
    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, "Prophet")
    model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)
    if params["country_holidays"] is not None:
        model.add_country_holidays(country_name=params["country_holidays"])
    if params["monthly_seasonality"]:
        model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
    with suppress_stdout_stderr():
        model.fit(X[['ds', 'y']])
    model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)  # remove to indicate success
    return grp_hash, model_path
def fit(self, X: dt.Frame, y: np.array = None):
    XX = X[:, self.tgc].to_pandas()
    XX = XX.replace([None, np.nan], 0)
    XX.rename(columns={self.time_column: "ds"}, inplace=True)
    if self.labels is not None:
        y = LabelEncoder().fit(self.labels).transform(y)
    XX['y'] = np.array(y)
    self.nan_value = np.mean(y)  # TODO - store mean per group, not just global
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]
    self.models = {}
    num_tasks = len(XX_grp)

    def processor(out, res):
        out[res[0]] = res[1]

    pool_to_use = small_job_pool
    pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)
    for key, X in XX_grp:
        X_path = os.path.join(temporary_files_path, "fbprophet_X" + str(uuid.uuid4()))
        X = X.reset_index(drop=True)
        save_obj(X, X_path)
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))
        args = (X_path, grp_hash,)
        kwargs = {}
        pool.submit_tryget(None, MyParallelProphetTransformer_fit_async,
                           args=args, kwargs=kwargs, out=self.models)
    pool.finish()
    for k, v in self.models.items():
        self.models[k] = load_obj(v) if v is not None else None
        remove(v)
    return self
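# Toy illustration of the group-key hashing pattern used throughout these recipes:
# pandas groupby yields a scalar key for a single grouping column and a tuple for
# several columns, and both end up as one '_'-joined string.
def make_grp_hash(key):
    key = key if isinstance(key, (list, tuple)) else [key]
    return '_'.join(map(str, key))

print(make_grp_hash("store_1"))             # store_1
print(make_grp_hash(("store_1", "sku_9")))  # store_1_sku_9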
def _transform_async(model_path, X_path, nan_value, has_is_train_attr, time_column, tmp_folder):
    """
    Predicts target for a particular time group
    :param model_path: path to the stored model
    :param X_path: Path to the data used to fit the ARIMA model
    :param nan_value: Value of target prior, used when no fitted model has been found
    :param has_is_train_attr: indicates if we predict in-sample or out-of-sample
    :param time_column: Name of the time column in the input data
    :return: self
    """
    model = load_obj(model_path)
    XX_path = os.path.join(tmp_folder, "autoarima_XXt" + str(uuid.uuid4()))
    X = load_obj(X_path)
    # Arima returns the predictions ordered by time
    # So we should keep track of the time order for each group so that
    # predictions are ordered the same as the input frame
    # Keep track of the order
    order = np.argsort(X[time_column])
    if model is not None:
        yhat = model.predict_in_sample() \
            if has_is_train_attr else model.predict(n_periods=X.shape[0])
        yhat = yhat[order]
        XX = pd.DataFrame(yhat, columns=['yhat'])
    else:
        XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])  # invalid model
    # Sync index
    XX.index = X.index
    assert XX.shape[1] == 1
    save_obj(XX, XX_path)
    remove(model_path)  # indicates success, no longer need
    remove(X_path)  # indicates success, no longer need
    return XX_path
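# Toy sketch of the ordering idea described in the comments above (an illustration of
# the general technique, not a copy of the function's exact steps): one way to map
# time-sorted predictions back to the original row order is to invert the argsort
# permutation, assuming one prediction per input row.
import numpy as np
import pandas as pd

times = pd.to_datetime(["2020-01-03", "2020-01-01", "2020-01-02"])
yhat_time_sorted = np.array([10.0, 20.0, 30.0])  # predictions for Jan 1, Jan 2, Jan 3
order = np.argsort(times)
yhat_input_order = np.empty_like(yhat_time_sorted)
yhat_input_order[order] = yhat_time_sorted
print(yhat_input_order)  # [30. 10. 20.] -> matches the Jan 3, Jan 1, Jan 2 input rows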
def _transform_async(model_path, X_path, nan_value):
    model = load_obj(model_path)
    XX_path = os.path.join(temporary_files_path, "fbprophet_XXt" + str(uuid.uuid4()))
    X = load_obj(X_path)
    if model is not None:
        # Facebook Prophet returns the predictions ordered by time
        # So we should keep track of the times for each group so that
        # predictions are ordered the same as the input frame
        # Make a copy of the input dates
        X_ds = X.copy()
        X_ds['ds'] = pd.to_datetime(X_ds['ds'])
        # Predict with prophet, get the time and prediction and index by time as well
        # In the case date repeats inside of a group (this happens at least in acceptance test)
        # We groupby date and keep the max (prophet returns the same value for a given date)
        # XX will contain the predictions indexed by date
        XX = model.predict(X)[['ds', 'yhat']].groupby('ds').max()
        # Now put yhat in the right order, simply by mapping the dates to the predictions
        X_ds['yhat'] = X_ds["ds"].map(XX['yhat'])
        # Now set XX back to the predictions and drop the index
        XX = X_ds[['yhat']].reset_index(drop=True)
    else:
        XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])  # invalid models
    XX.index = X.index
    assert XX.shape[1] == 1
    save_obj(XX, XX_path)
    remove(model_path)  # indicates success, no longer need
    remove(X_path)  # indicates success, no longer need
    return XX_path
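# Toy illustration of the date-keyed mapping used above: predictions indexed by a
# (deduplicated) date are broadcast back onto input rows even when dates repeat.
import pandas as pd

preds_by_date = pd.DataFrame({"yhat": [1.0, 2.0]},
                             index=pd.to_datetime(["2020-01-01", "2020-01-02"]))
rows = pd.Series(pd.to_datetime(["2020-01-02", "2020-01-01", "2020-01-02"]))
print(rows.map(preds_by_date["yhat"]).tolist())  # [2.0, 1.0, 2.0]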
def _fit_async(X_path, grp_hash, tmp_folder, params, cap):
    """
    Fits a FB Prophet model for a particular time group
    :param X_path: Path to the data used to fit the FB Prophet model
    :param grp_hash: Time group identifier
    :return: time group identifier and path to the pickled model
    """
    np.random.seed(1234)
    random.seed(1234)
    X = load_obj(X_path)
    # Commented for performance, uncomment for debug
    # print("prophet - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash))
    if X.shape[0] < 20:
        return grp_hash, None
    # Import FB Prophet package
    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, "Prophet")
    # Fit current model and prior
    nrows = X[['ds', 'y']].shape[0]
    n_changepoints = max(1, int(nrows * (2 / 3)))
    if n_changepoints < 25:
        model = Prophet(growth=params["growth"], n_changepoints=n_changepoints)
    else:
        model = Prophet(growth=params["growth"])
    # Add params
    if params["country_holidays"] is not None:
        model.add_country_holidays(country_name=params["country_holidays"])
    if params["monthly_seasonality"]:
        model.add_seasonality(name='monthly', period=30.5, fourier_order=params["monthly_seasonality"])
    if params["quarterly_seasonality"]:
        model.add_seasonality(name='quarterly', period=92, fourier_order=params["quarterly_seasonality"])
    with suppress_stdout_stderr():
        if params["growth"] == "logistic":
            X["cap"] = cap
            model.fit(X[['ds', 'y', 'cap']])
        else:
            model.fit(X[['ds', 'y']])
    model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)  # remove to indicate success
    return grp_hash, model_path
def _fit_async(X_path, grp_hash, time_column):
    np.random.seed(1234)
    random.seed(1234)
    X = load_obj(X_path)
    pm = importlib.import_module('pmdarima')
    with suppress_stdout_stderr():
        try:
            order = np.argsort(X[time_column])
            model = pm.auto_arima(X['y'].values[order], error_action='ignore')
        except Exception:
            model = None
    model_path = os.path.join(temporary_files_path, "autoarima_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)  # remove to indicate success
    return grp_hash, model_path
def _transform_async(model_path, X_path, nan_value):
    model = load_obj(model_path)
    XX_path = os.path.join(temporary_files_path, "fbprophet_XXt" + str(uuid.uuid4()))
    X = load_obj(X_path)
    if model is not None:
        XX = model.predict(X[['ds']])[['yhat']]
    else:
        XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])  # invalid models
    XX.index = X.index
    assert XX.shape[1] == 1
    save_obj(XX, XX_path)
    remove(model_path)  # indicates success, no longer need
    remove(X_path)  # indicates success, no longer need
    return XX_path
def _fit_async(X_path, grp_hash):
    np.random.seed(1234)
    random.seed(1234)
    X = load_obj(X_path)
    # Commented for performance, uncomment for debug
    # print("prophet - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash))
    if X.shape[0] < 20:
        # print("prophet - small data work-around for group: %s" % grp_hash)
        return grp_hash, None
    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, "Prophet")
    model = Prophet()
    with suppress_stdout_stderr():
        model.fit(X[['ds', 'y']])
    model_path = os.path.join(temporary_files_path, "fbprophet_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)  # remove to indicate success
    return grp_hash, model_path
def _fit_async(data_path, grp_hash, tmp_folder, params):
    """
    Fits a FB Prophet model for a particular time group
    :param data_path: Path to the data used to fit the FB Prophet model
    :param grp_hash: Time group identifier
    :return: time group identifier and path to the pickled model
    """
    np.random.seed(1234)
    random.seed(1234)
    X = load_obj(data_path)
    # if X.shape[0] < 20:
    #     return grp_hash, None
    # Import FB Prophet package
    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, "Prophet")
    model = fit_prophet_model(Prophet, X, params)
    model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(data_path)  # remove to indicate success
    return grp_hash, model_path
def fit(self, X: dt.Frame, y: np.array = None):
    pm = importlib.import_module('pmdarima')
    self.models = {}
    X = X.to_pandas()
    XX = X[self.tgc].copy()
    XX['y'] = np.array(y)
    self.nan_value = np.mean(y)
    self.ntrain = X.shape[0]
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]
    num_tasks = len(XX_grp)

    def processor(out, res):
        out[res[0]] = res[1]

    pool_to_use = small_job_pool
    pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)
    nb_groups = len(XX_grp)
    for _i_g, (key, X) in enumerate(XX_grp):
        if (_i_g + 1) % max(1, nb_groups // 20) == 0:
            print("Auto ARIMA - ", 100 * (_i_g + 1) // nb_groups, "% of Groups Fitted")
        X_path = os.path.join(temporary_files_path, "autoarima_X" + str(uuid.uuid4()))
        X = X.reset_index(drop=True)
        save_obj(X, X_path)
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))
        args = (X_path, grp_hash, self.time_column,)
        kwargs = {}
        pool.submit_tryget(None, MyParallelAutoArimaTransformer_fit_async,
                           args=args, kwargs=kwargs, out=self.models)
    pool.finish()
    for k, v in self.models.items():
        self.models[k] = load_obj(v) if v is not None else None
        remove(v)
    return self
def transform(self, X: dt.Frame):
    X = X.to_pandas()
    X = X.replace([None, np.nan], 0)
    XX = X[self.tgc].copy()
    XX.rename(columns={self.time_column: "ds"}, inplace=True)
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]
    assert len(XX_grp) > 0
    num_tasks = len(XX_grp)

    def processor(out, res):
        out.append(res)

    pool_to_use = small_job_pool
    pool = pool_to_use(logger=None, processor=processor,
                       max_workers=self.n_jobs, num_tasks=num_tasks)
    XX_paths = []
    model_paths = []
    for key, X in XX_grp:
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))
        X_path = os.path.join(temporary_files_path, "fbprophet_Xt" + str(uuid.uuid4()))
        print("prophet - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
        if grp_hash in self.models:
            model = self.models[grp_hash]
            model_path = os.path.join(temporary_files_path, "fbprophet_modelt" + str(uuid.uuid4()))
            save_obj(model, model_path)
            save_obj(X, X_path)
            model_paths.append(model_path)
            args = (model_path, X_path, self.nan_value)
            kwargs = {}
            pool.submit_tryget(None, MyParallelProphetTransformer_transform_async,
                               args=args, kwargs=kwargs, out=XX_paths)
        else:
            XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat'])  # unseen groups
            save_obj(XX, X_path)
            XX_paths.append(X_path)
    pool.finish()
    XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
    for p in XX_paths + model_paths:
        remove(p)
    return XX
def _transform_async(model_path, X_path, nan_value):
    model = load_obj(model_path)
    XX_path = os.path.join(temporary_files_path, "fbprophet_XXt" + str(uuid.uuid4()))
    X = load_obj(X_path)
    # Facebook Prophet returns the predictions ordered by time
    # So we should keep track of the time order for each group so that
    # predictions are ordered the same as the input frame
    # Keep track of the order
    order = np.argsort(pd.to_datetime(X["ds"]))
    if model is not None:
        # Run prophet
        yhat = model.predict(X)['yhat'].values
        XX = pd.DataFrame(yhat, columns=['yhat'])
    else:
        XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])  # invalid models
    XX.index = X.index[order]
    assert XX.shape[1] == 1
    save_obj(XX, XX_path)
    remove(model_path)  # indicates success, no longer need
    remove(X_path)  # indicates success, no longer need
    return XX_path
def transform(self, X: dt.Frame):
    X = X.to_pandas()
    XX = X[self.tgc].copy()
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]
    assert len(XX_grp) > 0
    num_tasks = len(XX_grp)

    def processor(out, res):
        out.append(res)

    pool_to_use = small_job_pool
    pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)
    XX_paths = []
    model_paths = []
    nb_groups = len(XX_grp)
    for _i_g, (key, X) in enumerate(XX_grp):
        if (_i_g + 1) % max(1, nb_groups // 20) == 0:
            print("Auto ARIMA - ", 100 * (_i_g + 1) // nb_groups, "% of Groups Transformed")
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))
        X_path = os.path.join(temporary_files_path, "autoarima_Xt" + str(uuid.uuid4()))
        # Commented for performance, uncomment for debug
        # print("prophet - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
        if grp_hash in self.models:
            model = self.models[grp_hash]
            model_path = os.path.join(temporary_files_path, "autoarima_modelt" + str(uuid.uuid4()))
            save_obj(model, model_path)
            save_obj(X, X_path)
            model_paths.append(model_path)
            args = (model_path, X_path, self.nan_value, hasattr(self, 'is_train'), self.time_column,)
            kwargs = {}
            pool.submit_tryget(None, MyParallelAutoArimaTransformer_transform_async,
                               args=args, kwargs=kwargs, out=XX_paths)
        else:
            # Don't go through pools
            XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat'])  # unseen groups
            # Sync indices
            XX.index = X.index
            save_obj(XX, X_path)
            XX_paths.append(X_path)
    pool.finish()
    XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
    for p in XX_paths + model_paths:
        remove(p)
    return XX
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    # Get TGC and time column
    self.tgc = self.params_base.get('tgc', None)
    self.time_column = self.params_base.get('time_column', None)
    self.nan_value = np.mean(y)
    self.cap = np.max(y) * 1.5  # TODO Don't like this we should compute a cap from average yearly growth
    self.prior = np.mean(y)
    if self.time_column is None:
        self.time_column = self.tgc[0]
    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(experiment_id=self.context.experiment_id,
                                        tmp_dir=self.context.tmp_dir,
                                        experiment_tmp_dir=self.context.experiment_tmp_dir)
    loggerinfo(logger, "Start Fitting Prophet Model with params : {}".format(self.params))
    # Get temporary folders for multi process communication
    tmp_folder = self._create_tmp_folder(logger)
    n_jobs = self._get_n_jobs(logger, **kwargs)
    # Convert to pandas
    XX = X[:, self.tgc].to_pandas()
    XX = XX.replace([None, np.nan], 0)
    XX.rename(columns={self.time_column: "ds"}, inplace=True)
    # Make target available in the Frame
    XX['y'] = np.array(y)
    # Set target prior
    self.nan_value = np.mean(y)
    # Group the input by TGC (Time group column) excluding the time column itself
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]
    self.models = {}
    self.priors = {}
    # Prepare for multi processing
    num_tasks = len(XX_grp)

    def processor(out, res):
        out[res[0]] = res[1]

    pool_to_use = small_job_pool
    loggerdebug(logger, "Prophet will use {} workers for fitting".format(n_jobs))
    pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs)
    # Fit 1 FB Prophet model per time group columns
    nb_groups = len(XX_grp)
    for _i_g, (key, X) in enumerate(XX_grp):
        # Just log where we are in the fitting process
        if (_i_g + 1) % max(1, nb_groups // 20) == 0:
            loggerinfo(logger, "FB Prophet : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups))
        X_path = os.path.join(tmp_folder, "fbprophet_X" + str(uuid.uuid4()))
        X = X.reset_index(drop=True)
        save_obj(X, X_path)
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))
        self.priors[grp_hash] = X['y'].mean()
        args = (X_path, grp_hash, tmp_folder, self.params, self.cap)
        kwargs = {}
        pool.submit_tryget(None, MyParallelProphetTransformer_fit_async,
                           args=args, kwargs=kwargs, out=self.models)
    pool.finish()
    for k, v in self.models.items():
        self.models[k] = load_obj(v) if v is not None else None
        remove(v)
    self._clean_tmp_folder(logger, tmp_folder)
    return None
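# Hypothetical stand-in for the small_job_pool / submit_tryget pattern above, using
# only the standard library: submit one fit per group and collect (grp_hash, path)
# results into a dict, mirroring the `processor` callback. This is a sketch of the
# idea, not the recipe's actual pool implementation.
from concurrent.futures import ProcessPoolExecutor

def collect_group_results(tasks, worker, max_workers=4):
    out = {}
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(worker, *args) for args in tasks]
        for fut in futures:
            grp_hash, model_path = fut.result()  # same (key, path) contract as _fit_async
            out[grp_hash] = model_path
    return out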
def fit(self, X: dt.Frame, y: np.array = None, **kwargs):
    """
    Fits ARIMA models (1 per time group) using historical target values contained in y
    Model fitting is distributed over a pool of processes and uses file storage to share the data with workers
    :param X: Datatable frame containing the features
    :param y: numpy array containing the historical values of the target
    :return: self
    """
    # Get the logger if it exists
    logger = None
    tmp_folder = str(uuid.uuid4()) + "_arima_folder/"
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(experiment_id=self.context.experiment_id,
                                        tmp_dir=self.context.tmp_dir,
                                        experiment_tmp_dir=self.context.experiment_tmp_dir)
    tmp_folder = self._create_tmp_folder(logger)
    n_jobs = self._get_n_jobs(logger, **kwargs)
    # Import the ARIMA python module
    pm = importlib.import_module('pmdarima')
    # Init models
    self.models = {}
    # Convert to pandas
    X = X.to_pandas()
    XX = X[self.tgc].copy()
    XX['y'] = np.array(y)
    self.nan_value = np.mean(y)
    self.ntrain = X.shape[0]
    # Group the input by TGC (Time group column) excluding the time column itself
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]
    # Prepare for multi processing
    num_tasks = len(XX_grp)

    def processor(out, res):
        out[res[0]] = res[1]

    pool_to_use = small_job_pool
    loggerinfo(logger, "Arima will use {} workers for parallel processing".format(n_jobs))
    pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs)
    # Build 1 ARIMA model per time group columns
    nb_groups = len(XX_grp)
    for _i_g, (key, X) in enumerate(XX_grp):
        # Just say where we are in the fitting process
        if (_i_g + 1) % max(1, nb_groups // 20) == 0:
            loggerinfo(logger, "Auto ARIMA : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups))
        X_path = os.path.join(tmp_folder, "autoarima_X" + str(uuid.uuid4()))
        X = X.reset_index(drop=True)
        save_obj(X, X_path)
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))
        args = (X_path, grp_hash, self.time_column, tmp_folder)
        kwargs = {}
        pool.submit_tryget(None, MyParallelAutoArimaTransformer_fit_async,
                           args=args, kwargs=kwargs, out=self.models)
    pool.finish()
    for k, v in self.models.items():
        self.models[k] = load_obj(v) if v is not None else None
        remove(v)
    self._clean_tmp_folder(logger, tmp_folder)
    return self
def transform(self, X: dt.Frame, **kwargs):
    """
    Uses fitted models (1 per time group) to predict the target
    If self.is_train exists, it means we are doing in-sample predictions
    If it does not, then ARIMA is used to predict the future
    :param X: Datatable Frame containing the features
    :return: ARIMA predictions
    """
    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(experiment_id=self.context.experiment_id,
                                        tmp_dir=self.context.tmp_dir,
                                        experiment_tmp_dir=self.context.experiment_tmp_dir)
    tmp_folder = self._create_tmp_folder(logger)
    n_jobs = self._get_n_jobs(logger, **kwargs)
    X = X.to_pandas()
    XX = X[self.tgc].copy()
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]
    assert len(XX_grp) > 0
    num_tasks = len(XX_grp)

    def processor(out, res):
        out.append(res)

    pool_to_use = small_job_pool
    loggerinfo(logger, "Arima will use {} workers for transform".format(n_jobs))
    pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs)
    XX_paths = []
    model_paths = []
    nb_groups = len(XX_grp)
    for _i_g, (key, X) in enumerate(XX_grp):
        # Just print where we are in the process of fitting models
        if (_i_g + 1) % max(1, nb_groups // 20) == 0:
            loggerinfo(logger, "Auto ARIMA : %d%% of groups transformed" % (100 * (_i_g + 1) // nb_groups))
        # Create time group key to store and retrieve fitted models
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))
        # Create file path to store data and pass it to the fitting pool
        X_path = os.path.join(tmp_folder, "autoarima_Xt" + str(uuid.uuid4()))
        # Commented for performance, uncomment for debug
        # print("ARIMA - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
        if grp_hash in self.models:
            model = self.models[grp_hash]
            model_path = os.path.join(tmp_folder, "autoarima_modelt" + str(uuid.uuid4()))
            save_obj(model, model_path)
            save_obj(X, X_path)
            model_paths.append(model_path)
            args = (model_path, X_path, self.nan_value, hasattr(self, 'is_train'),
                    self.time_column, self.pred_gap, tmp_folder)
            kwargs = {}
            pool.submit_tryget(None, MyParallelAutoArimaTransformer_transform_async,
                               args=args, kwargs=kwargs, out=XX_paths)
        else:
            # Don't go through pools
            XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat'])  # unseen groups
            # Sync indices
            XX.index = X.index
            save_obj(XX, X_path)
            XX_paths.append(X_path)
    pool.finish()
    XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
    for p in XX_paths + model_paths:
        remove(p)
    self._clean_tmp_folder(logger, tmp_folder)
    return XX
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    logger = None
    if self._make_logger:
        # Example use of logger, with required import of:
        # from h2oaicore.systemutils import make_experiment_logger, loggerinfo
        # Can use loggerwarning, loggererror, etc. for different levels
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(experiment_id=self.context.experiment_id,
                                            tmp_dir=self.context.tmp_dir,
                                            experiment_tmp_dir=self.context.experiment_tmp_dir)
    if self._show_logger_test:
        loggerinfo(logger, "TestLOGGER: Fit CatBoost")
    if self._show_task_test:
        # Example task sync operations
        if hasattr(self, 'test_count'):
            self.test_count += 1
        else:
            self.test_count = 0
        # The below generates a message in the GUI notifications panel
        if self.test_count == 0 and self.context and self.context.experiment_id:
            warning = "TestWarning: First CatBoost fit for this model instance"
            loggerwarning(logger, warning)
            task = kwargs.get('task')
            if task:
                task.sync(key=self.context.experiment_id, progress=dict(type='warning', data=warning))
                task.flush()
        # The below generates a message in the GUI top-middle panel above the progress wheel
        if self.test_count == 0 and self.context and self.context.experiment_id:
            message = "Tuning CatBoost"
            loggerinfo(logger, message)
            task = kwargs.get('task')
            if task:
                task.sync(key=self.context.experiment_id, progress=dict(type='update', message=message))
                task.flush()
    from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
    # label encode target and setup type of problem
    lb = LabelEncoder()
    if self.num_classes >= 2:
        lb.fit(self.labels)
        y = lb.transform(y)
        if eval_set is not None:
            valid_X = eval_set[0][0]
            valid_y = eval_set[0][1]
            valid_y = lb.transform(valid_y)
            eval_set = [(valid_X, valid_y)]
        self.params.update({'objective': 'Logloss'})
    if self.num_classes > 2:
        self.params.update({'objective': 'MultiClass'})
    if isinstance(X, dt.Frame):
        orig_cols = list(X.names)
        numeric_cols = list(X[:, [bool, int, float]].names)
    else:
        orig_cols = list(X.columns)
        numeric_cols = list(X.select_dtypes([np.number]).columns)
    # unlike lightgbm that needs label encoded categoricals, catboost can take raw strings etc.
    self.params['cat_features'] = [i for i, x in enumerate(orig_cols)
                                   if 'CatOrig:' in x or 'Cat:' in x or x not in numeric_cols]
    if not self.get_uses_gpus(self.params):
        # monotonicity constraints not available for GPU for catboost
        # get names of columns in same order
        X_names = list(dt.Frame(X).names)
        X_numeric = self.get_X_ordered_numerics(X)
        X_numeric_names = list(X_numeric.names)
        constraints = self.set_monotone_constraints(X=X_numeric, y=y)
        # if non-numerics, then fix those to have 0 constraint
        self.params['monotone_constraints'] = [0] * len(X_names)
        colnumi = 0
        for coli, col in enumerate(X_names):
            if col in X_numeric_names:
                self.params['monotone_constraints'][coli] = constraints[colnumi]
                colnumi += 1
    if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
        # dt -> catboost internally using buffer leaks, so convert here
        # assume predict is after pipeline collection or in subprocess so needs no protection
        X = X.to_numpy()  # don't assign back to X so don't damage during predict
        X = np.ascontiguousarray(X, dtype=np.float32 if config.data_precision == "float32" else np.float64)
        if eval_set is not None:
            valid_X = eval_set[0][0].to_numpy()  # don't assign back to X so don't damage during predict
            valid_X = np.ascontiguousarray(valid_X,
                                           dtype=np.float32 if config.data_precision == "float32" else np.float64)
            valid_y = eval_set[0][1]
            eval_set = [(valid_X, valid_y)]
    if eval_set is not None:
        valid_X_shape = eval_set[0][0].shape
    else:
        valid_X_shape = None
    X, eval_set = self.process_cats(X, eval_set, orig_cols)
    # modify self.params_base['gpu_id'] based upon actually-available GPU based upon training and valid shapes
    self.acquire_gpus_function(train_shape=X.shape, valid_shape=valid_X_shape)
    params = copy.deepcopy(self.params)  # keep separate, since then can be pulled from lightgbm params
    params = self.transcribe_params(params=params, **kwargs)
    if logger is not None:
        loggerdata(logger,
                   "CatBoost parameters: params_base : %s params: %s catboost_params: %s" %
                   (str(self.params_base), str(self.params), str(params)))
    if self.num_classes == 1:
        self.model = CatBoostRegressor(**params)
    else:
        self.model = CatBoostClassifier(**params)
    # Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.
    if self.num_classes == 1:
        # assume not mae, which would use median
        # baseline = [np.mean(y)] * len(y)
        baseline = None
    else:
        baseline = None
    kwargs_fit = dict(baseline=baseline, eval_set=eval_set)
    pickle_path = None
    if config.debug_daimodel_level >= 2:
        self.uuid = str(uuid.uuid4())[:6]
        pickle_path = os.path.join(exp_dir(), "catboost%s.tmp.pickle" % self.uuid)
        save_obj((self.model, X, y, sample_weight, kwargs_fit), pickle_path)
    # FIT (with migration safety before hyperopt/Optuna function added)
    try:
        if hasattr(self, 'dask_or_hyper_or_normal_fit'):
            self.dask_or_hyper_or_normal_fit(X, y, sample_weight=sample_weight, kwargs=kwargs, **kwargs_fit)
        else:
            self.model.fit(X, y, sample_weight=sample_weight, **kwargs_fit)
    except Exception as e:
        if "All features are either constant or ignored" in str(e):
            raise IgnoreEntirelyError(str(e))
        raise
    if config.debug_daimodel_level <= 2:
        remove(pickle_path)
    # https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html
    # need to move to wrapper
    if self.model.get_best_iteration() is not None:
        iterations = self.model.get_best_iteration() + 1
    else:
        iterations = self.params['n_estimators']
    # must always set best_iterations
    self.model_path = None
    importances = copy.deepcopy(self.model.feature_importances_)
    if not self._save_by_pickle:
        self.uuid = str(uuid.uuid4())[:6]
        model_file = "catboost_%s.bin" % str(self.uuid)
        self.model_path = os.path.join(self.context.experiment_tmp_dir, model_file)
        self.model.save_model(self.model_path)
        with open(self.model_path, mode='rb') as f:
            model = f.read()
    else:
        model = self.model
    self.set_model_properties(model=model,  # overwrites self.model object with bytes if not using pickle
                              features=orig_cols,
                              importances=importances,
                              iterations=iterations)
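# Toy sketch (hypothetical column names) of the monotone-constraint alignment in the
# fit() above: per-numeric-column constraints are spread into the full column order,
# while non-numeric columns stay at 0.
X_names = ["cat_color", "num_price", "num_qty"]
X_numeric_names = ["num_price", "num_qty"]
constraints = [1, -1]  # one entry per numeric column

monotone = [0] * len(X_names)
colnumi = 0
for coli, col in enumerate(X_names):
    if col in X_numeric_names:
        monotone[coli] = constraints[colnumi]
        colnumi += 1
print(monotone)  # [0, 1, -1]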
def transform(self, X: dt.Frame, **kwargs):
    """
    Uses fitted models (1 per time group) to predict the target
    :param X: Datatable Frame containing the features
    :return: FB Prophet predictions
    """
    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(experiment_id=self.context.experiment_id,
                                        tmp_dir=self.context.tmp_dir,
                                        experiment_tmp_dir=self.context.experiment_tmp_dir)
    tmp_folder = self._create_tmp_folder(logger)
    n_jobs = self._get_n_jobs(logger, **kwargs)
    # Reduce X to TGC
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    X = X[:, self.tgc].to_pandas()
    # Fill NaNs or None
    X = X.replace([None, np.nan], 0)
    # Change date feature name to match Prophet requirements
    X.rename(columns={self.time_column: "ds"}, inplace=True)
    # Predict y using unique dates
    X_time = X[['ds']].groupby('ds').first().reset_index()
    with suppress_stdout_stderr():
        y_avg = self.model.predict(X_time)[['ds', 'yhat']]
    # Prophet transforms the date column to datetime so we need to transform that to merge back
    X_time.sort_values('ds', inplace=True)
    X_time['yhat'] = y_avg['yhat']
    X_time.sort_index(inplace=True)
    # Merge back into original frame on 'ds'
    # pd.merge wipes the index ... so keep it to provide it again
    indices = X.index
    X = pd.merge(left=X, right=X_time[['ds', 'yhat']], on='ds', how='left')
    X.index = indices
    # Go through groups and recover the scaled target for known groups
    if len(tgc_wo_time) > 0:
        X_groups = X.groupby(tgc_wo_time)
    else:
        X_groups = [([None], X)]
    inverted_ys = []
    for key, X_grp in X_groups:
        grp_hash = self.get_hash(key)
        # Scale target for current group
        if grp_hash in self.scalers.keys():
            inverted_y = self.scalers[grp_hash].inverse_transform(X_grp[['yhat']])
        else:
            inverted_y = self.general_scaler.inverse_transform(X_grp[['yhat']])
        # Put back in a DataFrame to keep track of original index
        inverted_df = pd.DataFrame(inverted_y, columns=['yhat'])
        inverted_df.index = X_grp.index
        inverted_ys.append(inverted_df)
    XX_general = pd.concat(tuple(inverted_ys), axis=0).sort_index()
    if self.top_groups:
        # Go through the groups and predict only top
        XX_paths = []
        model_paths = []

        def processor(out, res):
            out.append(res)

        num_tasks = len(self.top_groups)
        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs)
        nb_groups = len(X_groups)
        for _i_g, (key, X_grp) in enumerate(X_groups):
            # Just log where we are in the fitting process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(logger, "FB Prophet : %d%% of groups predicted" % (100 * (_i_g + 1) // nb_groups))
            # Create dict key to store the min max scaler
            grp_hash = self.get_hash(key)
            X_path = os.path.join(tmp_folder, "fbprophet_Xt" + str(uuid.uuid4()))
            if grp_hash not in self.top_groups:
                XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat'])  # unseen groups
                XX.index = X_grp.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
                continue
            if self.grp_models[grp_hash] is None:
                XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat'])  # unseen groups
                XX.index = X_grp.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
                continue
            model = self.grp_models[grp_hash]
            model_path = os.path.join(tmp_folder, "fbprophet_modelt" + str(uuid.uuid4()))
            save_obj(model, model_path)
            save_obj(X_grp, X_path)
            model_paths.append(model_path)
            args = (model_path, X_path, self.priors[grp_hash], tmp_folder)
            kwargs = {}
            pool.submit_tryget(None, MyParallelProphetTransformer_transform_async,
                               args=args, kwargs=kwargs, out=XX_paths)
        pool.finish()
        XX_top_groups = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
        for p in XX_paths + model_paths:
            remove(p)
    self._clean_tmp_folder(logger, tmp_folder)
    features_df = pd.DataFrame()
    features_df[self.display_name + '_GrpAvg'] = XX_general['yhat']
    if self.top_groups:
        features_df[self.display_name + f'_Top{self.top_n}Grp'] = XX_top_groups['yhat']
    self._output_feature_names = list(features_df.columns)
    self._feature_desc = list(features_df.columns)
    return features_df
def predict(self, X, y=None, **kwargs):
    model, features, importances, iterations = self.get_model_properties()
    if not self._save_by_pickle:
        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
        if self.num_classes >= 2:
            from_file = CatBoostClassifier()
        else:
            from_file = CatBoostRegressor()
        with open(self.model_path, mode='wb') as f:
            f.write(model)
        model = from_file.load_model(self.model_path)
    # FIXME: Do equivalent throttling of predict size like def _predict_internal(self, X, **kwargs), wrap-up.
    if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
        # dt -> catboost internally using buffer leaks, so convert here
        # assume predict is after pipeline collection or in subprocess so needs no protection
        X = X.to_numpy()  # don't assign back to X so don't damage during predict
        X = np.ascontiguousarray(X, dtype=np.float32 if config.data_precision == "float32" else np.float64)
    X, eval_set = self.process_cats(X, None, self.feature_names_fitted)
    pred_contribs = kwargs.get('pred_contribs', False)
    output_margin = kwargs.get('output_margin', False)
    fast_approx = kwargs.pop('fast_approx', False)
    if fast_approx:
        iterations = min(config.fast_approx_num_trees, iterations)
    # implicit import
    from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType, Pool
    n_jobs = max(1, physical_cores_count)
    if not pred_contribs and not output_margin:
        if self.num_classes >= 2:
            preds = model.predict_proba(X,
                                        ntree_start=0,
                                        ntree_end=iterations,  # index of first tree *not* to be used
                                        thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
                                        )
            if preds.shape[1] == 2:
                return preds[:, 1]
            else:
                return preds
        else:
            return model.predict(X,
                                 ntree_start=0,
                                 ntree_end=iterations,  # index of first tree *not* to be used
                                 thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
                                 )
    elif output_margin:
        # uses "predict" for raw for any class
        preds = model.predict(X,
                              prediction_type="RawFormulaVal",
                              ntree_start=0,
                              ntree_end=iterations,  # index of first tree *not* to be used
                              thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
                              )
        if len(preds.shape) > 1 and preds.shape[1] == 2 and self.num_classes == 2:
            return preds[:, 1]
        else:
            return preds
    elif pred_contribs:
        # For Shapley, doesn't come from predict
        # For regression/binary, shap is shape of (rows, features + bias)
        # for multiclass, shap is shape of (rows, classes, features + bias)
        data = Pool(X, label=y, cat_features=self.params['cat_features'])
        if fast_approx:
            # https://github.com/catboost/catboost/issues/1146
            # https://github.com/catboost/catboost/issues/1535
            # can't specify trees, but they have approx version
            # Regular, Exact, or Approximate
            shap_calc_type = "Approximate"
        else:
            shap_calc_type = "Regular"
        # See also shap_mode
        # help(CatBoostClassifier.get_feature_importance)
        print_debug("shap_calc_type: %s" % shap_calc_type)
        pickle_path = None
        if config.debug_daimodel_level >= 2:
            self.uuid = str(uuid.uuid4())[:6]
            pickle_path = os.path.join(exp_dir(), "catboost_shappredict%s.tmp.pickle" % self.uuid)
            model.save_model(os.path.join(exp_dir(), "catshapproblem%s.catboost.model" % self.uuid))
            # save_obj((self, self.model, model, X, y, kwargs, shap_calc_type, self.params['cat_features']), pickle_path)
            save_obj((model, X, y, kwargs, shap_calc_type, self.params['cat_features']), pickle_path)
        preds_shap = model.get_feature_importance(data=data,
                                                  thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
                                                  type=EFstrType.ShapValues,
                                                  shap_calc_type=shap_calc_type,
                                                  )
        # repair broken shap sum: https://github.com/catboost/catboost/issues/1125
        print_debug("shap_fix")
        preds_raw = model.predict(X,
                                  prediction_type="RawFormulaVal",
                                  ntree_start=0,
                                  ntree_end=iterations,  # index of first tree *not* to be used
                                  thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
                                  )
        if self.num_classes <= 2:
            axis = 1
        else:
            axis = 2
        orig_sum = np.sum(preds_shap, axis=axis)
        print_debug("shap_fix2")
        # avoid division by 0, need different trick, e.g. change baseline, to fix that case
        if axis == 1:
            orig_sum[orig_sum[:] == 0.0] = 1.0
            preds_shap = preds_shap * preds_raw[:, None] / orig_sum[:, None]
        else:
            # each feature and each class must sum up
            orig_sum[orig_sum[:, :] == 0.0] = 1.0
            preds_shap = preds_shap * preds_raw[:, :, None] / orig_sum[:, :, None]
        if config.hard_asserts and config.debug_daimodel_level >= 2:
            print_debug("shap_check")
            model.save_model(os.path.join(exp_dir(), "catshapproblem"))
            pickle.dump((X, y, self.params['cat_features']),
                        open(os.path.join(exp_dir(), "catshapproblem.pkl"), "wb"))
            preds_raw = model.predict(X,
                                      prediction_type="RawFormulaVal",
                                      ntree_start=0,
                                      ntree_end=iterations,  # index of first tree *not* to be used
                                      thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
                                      )
            assert np.isclose(preds_raw, np.sum(preds_shap, axis=axis)).all(), \
                "catboost shapley does not sum up correctly"
        if config.debug_daimodel_level <= 2:
            remove(pickle_path)
        if axis == 1:
            return preds_shap
        else:
            # DAI expects (shape rows) * (classes x (features + 1)) with "columns" as blocks of
            # feature_0_class_0 feature_0_class_0 ... feature_0_class_1 feature_1_class_1 ...
            return preds_shap.reshape(preds_shap.shape[0], preds_shap.shape[1] * preds_shap.shape[2])
    else:
        raise RuntimeError("No such case")
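# Toy illustration of the SHAP "repair" in predict() above for the regression/binary
# case (axis=1): each row of contributions is rescaled so it sums to the raw output.
import numpy as np

preds_shap = np.array([[0.2, 0.3, 0.1],   # per-row contributions incl. bias column
                       [0.5, 0.0, 0.5]])
preds_raw = np.array([1.2, 0.5])
orig_sum = np.sum(preds_shap, axis=1)
orig_sum[orig_sum == 0.0] = 1.0           # avoid division by zero
fixed = preds_shap * preds_raw[:, None] / orig_sum[:, None]
print(fixed.sum(axis=1))                  # [1.2 0.5] -> matches preds_raw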
def fit(self, X: dt.Frame, y: np.array = None, **kwargs):
    """
    Fits FB Prophet models (1 per time group) using historical target values contained in y
    Model fitting is distributed over a pool of processes and uses file storage to share the data with workers
    :param X: Datatable frame containing the features
    :param y: numpy array containing the historical values of the target
    :return: self
    """
    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(experiment_id=self.context.experiment_id,
                                        tmp_dir=self.context.tmp_dir,
                                        experiment_tmp_dir=self.context.experiment_tmp_dir)
    try:
        # Add value of prophet_top_n in recipe_dict variable inside of config.toml file
        # eg1: recipe_dict="{'prophet_top_n': 200}"
        # eg2: recipe_dict="{'prophet_top_n':10}"
        self.top_n = config.recipe_dict['prophet_top_n']
    except KeyError:
        self.top_n = 50
    loggerinfo(logger, f"Prophet will use {self.top_n} groups as well as average target data.")
    tmp_folder = self._create_tmp_folder(logger)
    n_jobs = self._get_n_jobs(logger, **kwargs)
    # Reduce X to TGC
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    X = X[:, self.tgc].to_pandas()
    # Fill NaNs or None
    X = X.replace([None, np.nan], 0)
    # Add target, Label encoder is only used for Classif. which we don't support...
    if self.labels is not None:
        y = LabelEncoder().fit(self.labels).transform(y)
    X['y'] = np.array(y)
    self.nan_value = X['y'].mean()
    # Change date feature name to match Prophet requirements
    X.rename(columns={self.time_column: "ds"}, inplace=True)
    # Create a general scale now that will be used for unknown groups at prediction time
    # Can we do smarter than that ?
    self.general_scaler = MinMaxScaler().fit(X[['y', 'ds']].groupby('ds').median().values)
    # Go through groups and standard scale them
    if len(tgc_wo_time) > 0:
        X_groups = X.groupby(tgc_wo_time)
    else:
        X_groups = [([None], X)]
    self.scalers = {}
    scaled_ys = []
    print(f'{datetime.now()} Start of group scaling')
    for key, X_grp in X_groups:
        # Create dict key to store the min max scaler
        grp_hash = self.get_hash(key)
        # Scale target for current group
        self.scalers[grp_hash] = MinMaxScaler()
        y_skl = self.scalers[grp_hash].fit_transform(X_grp[['y']].values)
        # Put back in a DataFrame to keep track of original index
        y_skl_df = pd.DataFrame(y_skl, columns=['y'])
        # (0, 'A') (1, 4) (100, 1) (100, 1)
        # print(grp_hash, X_grp.shape, y_skl.shape, y_skl_df.shape)
        y_skl_df.index = X_grp.index
        scaled_ys.append(y_skl_df)
    print(f'{datetime.now()} End of group scaling')
    # Set target back in original frame but keep original
    X['y_orig'] = X['y']
    X['y'] = pd.concat(tuple(scaled_ys), axis=0)
    # Now Average groups
    X_avg = X[['ds', 'y']].groupby('ds').mean().reset_index()
    # Send that to Prophet
    params = {
        "country_holidays": self.country_holidays,
        "monthly_seasonality": self.monthly_seasonality
    }
    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, "Prophet")
    self.model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)
    if params["country_holidays"] is not None:
        self.model.add_country_holidays(country_name=params["country_holidays"])
    if params["monthly_seasonality"]:
        self.model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
    with suppress_stdout_stderr():
        self.model.fit(X[['ds', 'y']])
    print(f'{datetime.now()} General Model Fitted')
    self.top_groups = None
    if len(tgc_wo_time) > 0:
        if self.top_n > 0:
            top_n_grp = X.groupby(tgc_wo_time).size().sort_values().reset_index()[tgc_wo_time].iloc[-self.top_n:].values
            self.top_groups = ['_'.join(map(str, key)) for key in top_n_grp]
    if self.top_groups:
        self.grp_models = {}
        self.priors = {}
        # Prepare for multi processing
        num_tasks = len(self.top_groups)

        def processor(out, res):
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        loggerinfo(logger, f"Prophet will use {n_jobs} workers for fitting.")
        loggerinfo(logger, "Prophet parameters holidays {} / monthly {}".format(
            self.country_holidays, self.monthly_seasonality))
        pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs)
        # Fit 1 FB Prophet model per time group columns
        nb_groups = len(X_groups)
        # Put y back to its unscaled value for top groups
        X['y'] = X['y_orig']
        for _i_g, (key, X) in enumerate(X_groups):
            # Just log where we are in the fitting process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(logger, "FB Prophet : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups))
            X_path = os.path.join(tmp_folder, "fbprophet_X" + str(uuid.uuid4()))
            X = X.reset_index(drop=True)
            save_obj(X, X_path)
            grp_hash = self.get_hash(key)
            if grp_hash not in self.top_groups:
                continue
            self.priors[grp_hash] = X['y'].mean()
            params = {
                "country_holidays": self.country_holidays,
                "monthly_seasonality": self.monthly_seasonality
            }
            args = (X_path, grp_hash, tmp_folder, params)
            kwargs = {}
            pool.submit_tryget(None, MyParallelProphetTransformer_fit_async,
                               args=args, kwargs=kwargs, out=self.grp_models)
        pool.finish()
        for k, v in self.grp_models.items():
            self.grp_models[k] = load_obj(v) if v is not None else None
            remove(v)
    self._clean_tmp_folder(logger, tmp_folder)
    return self
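# Minimal sketch (toy data) of the per-group target scaling used in the fit() above:
# one MinMaxScaler is fitted per time group and inverted again on the predictions.
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df = pd.DataFrame({"grp": ["A", "A", "B", "B"], "y": [10.0, 20.0, 100.0, 300.0]})
scalers = {key: MinMaxScaler().fit(grp[["y"]].values) for key, grp in df.groupby("grp")}
yhat_scaled = np.array([[0.5]])                      # a prediction on the scaled axis
print(scalers["A"].inverse_transform(yhat_scaled))   # [[15.]]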
def transform(self, X: dt.Frame, **kwargs):
    """
    Uses fitted models (1 per time group) to predict the target
    :param X: Datatable Frame containing the features
    :return: FB Prophet predictions
    """
    # Get the logger if it exists
    logger = self.get_experiment_logger()
    tmp_folder = self._create_tmp_folder(logger)
    n_jobs = self._get_n_jobs(logger, **kwargs)
    # Reduce X to TGC
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    # Change date feature name to match Prophet requirements
    X = self.convert_to_prophet(X)
    y_predictions = self.predict_with_average_model(X, tgc_wo_time)
    y_predictions.columns = ['average_pred']
    # Go through groups
    for grp_col in tgc_wo_time:
        # Get the unique dates to be predicted
        X_groups = X[['ds', grp_col]].groupby(grp_col)
        # Go through the groups and predict only top
        XX_paths = []
        model_paths = []

        def processor(out, res):
            out.append(res)

        num_tasks = len(X_groups)
        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs)
        for _i_g, (key, X_grp) in enumerate(X_groups):
            # Just log where we are in the fitting process
            if (_i_g + 1) % max(1, num_tasks // 20) == 0:
                loggerinfo(logger, "FB Prophet : %d%% of groups predicted" % (100 * (_i_g + 1) // num_tasks))
            # Create dict key to store the min max scaler
            grp_hash = self.get_hash(key)
            X_path = os.path.join(tmp_folder, "fbprophet_Xt" + str(uuid.uuid4()))
            if grp_hash not in self.grp_models[grp_col]:
                # unseen groups
                XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat'])
                XX.index = X_grp.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
                continue
            if self.grp_models[grp_col][grp_hash] is None:
                # known groups but not enough train data
                XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat'])
                XX.index = X_grp.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
                continue
            model = self.grp_models[grp_col][grp_hash]
            model_path = os.path.join(tmp_folder, "fbprophet_modelt" + str(uuid.uuid4()))
            save_obj(model, model_path)
            save_obj(X_grp, X_path)
            model_paths.append(model_path)
            args = (model_path, X_path, self.priors[grp_col][grp_hash], tmp_folder)
            kwargs = {}
            pool.submit_tryget(None, MyProphetOnSingleGroupsTransformer_transform_async,
                               args=args, kwargs=kwargs, out=XX_paths)
        pool.finish()
        y_predictions[f'{grp_col}_pred'] = pd.concat((load_obj(XX_path) for XX_path in XX_paths),
                                                     axis=0).sort_index()
        for p in XX_paths + model_paths:
            remove(p)
    # Now we can invert scale
    # But first get rid of NaNs
    for grp_col in tgc_wo_time:
        # Add time group to the predictions, will be used to invert scaling
        y_predictions[grp_col] = X[grp_col]
        # Fill NaN
        y_predictions[f'{grp_col}_pred'] = y_predictions[f'{grp_col}_pred'].fillna(y_predictions['average_pred'])
    # Go through groups and recover the scaled target for known groups
    if len(tgc_wo_time) > 0:
        X_groups = y_predictions.groupby(tgc_wo_time)
    else:
        X_groups = [([None], y_predictions)]
    for _f in [f'{grp_col}_pred' for grp_col in tgc_wo_time] + ['average_pred']:
        inverted_ys = []
        for key, X_grp in X_groups:
            grp_hash = self.get_hash(key)
            # Scale target for current group
            if grp_hash in self.scalers.keys():
                inverted_y = self.scalers[grp_hash].inverse_transform(X_grp[[_f]])
            else:
                inverted_y = self.general_scaler.inverse_transform(X_grp[[_f]])
            # Put back in a DataFrame to keep track of original index
            inverted_df = pd.DataFrame(inverted_y, columns=[_f])
            inverted_df.index = X_grp.index
            inverted_ys.append(inverted_df)
        y_predictions[_f] = pd.concat(tuple(inverted_ys), axis=0).sort_index()[_f]
    self._clean_tmp_folder(logger, tmp_folder)
    y_predictions.drop(tgc_wo_time, axis=1, inplace=True)
    self._output_feature_names = [f'{self.display_name}_{_f}' for _f in y_predictions]
    self._feature_desc = [f'{self.display_name}_{_f}' for _f in y_predictions]
    return y_predictions
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs): # Get TGC and time column self.tgc = self.params_base.get('tgc', None) self.time_column = self.params_base.get('time_column', None) self.nan_value = np.mean(y) self.cap = np.max( y ) * 1.5 # TODO Don't like this we should compute a cap from average yearly growth self.prior = np.mean(y) if self.time_column is None: self.time_column = self.tgc[0] # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) loggerinfo( logger, "Start Fitting Prophet Model with params : {}".format(self.params)) try: # Add value of prophet_top_n in recipe_dict variable inside of config.toml file # eg1: recipe_dict="{'prophet_top_n': 200}" # eg2: recipe_dict="{'prophet_top_n':10}" self.top_n = config.recipe_dict['prophet_top_n'] except KeyError: self.top_n = 50 loggerinfo( logger, f"Prophet will use {self.top_n} groups as well as average target data." ) # Get temporary folders for multi process communication tmp_folder = self._create_tmp_folder(logger) n_jobs = self._get_n_jobs(logger, **kwargs) # Reduce X to TGC tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) X = X[:, self.tgc].to_pandas() # Fill NaNs or None X = X.replace([None, np.nan], 0) # Add target, Label encoder is only used for Classif. which we don't support... if self.labels is not None: y = LabelEncoder().fit(self.labels).transform(y) X['y'] = np.array(y) self.nan_value = X['y'].mean() # Change date feature name to match Prophet requirements X.rename(columns={self.time_column: "ds"}, inplace=True) # Create a general scale now that will be used for unknown groups at prediction time # Can we do smarter than that ? 
general_scaler = MinMaxScaler().fit( X[['y', 'ds']].groupby('ds').median().values) # Go through groups and standard scale them if len(tgc_wo_time) > 0: X_groups = X.groupby(tgc_wo_time) else: X_groups = [([None], X)] scalers = {} scaled_ys = [] print('Number of groups : ', len(X_groups)) for g in tgc_wo_time: print(f'Number of groups in {g} groups : {X[g].unique().shape}') for key, X_grp in X_groups: # Create dict key to store the min max scaler grp_hash = self.get_hash(key) # Scale target for current group scalers[grp_hash] = MinMaxScaler() y_skl = scalers[grp_hash].fit_transform(X_grp[['y']].values) # Put back in a DataFrame to keep track of original index y_skl_df = pd.DataFrame(y_skl, columns=['y']) y_skl_df.index = X_grp.index scaled_ys.append(y_skl_df) # Set target back in original frame but keep original X['y_orig'] = X['y'] X['y'] = pd.concat(tuple(scaled_ys), axis=0) # Now Average groups X_avg = X[['ds', 'y']].groupby('ds').mean().reset_index() # Send that to Prophet mod = importlib.import_module('fbprophet') Prophet = getattr(mod, "Prophet") nrows = X[['ds', 'y']].shape[0] n_changepoints = max(1, int(nrows * (2 / 3))) if n_changepoints < 25: model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True, n_changepoints=n_changepoints) else: model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True) if self.params["country_holidays"] is not None: model.add_country_holidays( country_name=self.params["country_holidays"]) if self.params["monthly_seasonality"]: model.add_seasonality( name='monthly', period=30.5, fourier_order=self.params["monthly_seasonality"]) if self.params["quarterly_seasonality"]: model.add_seasonality( name='quarterly', period=92, fourier_order=self.params["quarterly_seasonality"]) with suppress_stdout_stderr(): model.fit(X[['ds', 'y']]) top_groups = None if len(tgc_wo_time) > 0: if self.top_n > 0: top_n_grp = X.groupby(tgc_wo_time).size().sort_values( ).reset_index()[tgc_wo_time].iloc[-self.top_n:].values top_groups = ['_'.join(map(str, key)) for key in top_n_grp] grp_models = {} priors = {} if top_groups: # Prepare for multi processing num_tasks = len(top_groups) def processor(out, res): out[res[0]] = res[1] pool_to_use = small_job_pool loggerinfo(logger, f"Prophet will use {n_jobs} workers for fitting.") pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs) # # Fit 1 FB Prophet model per time group columns nb_groups = len(X_groups) # Put y back to its unscaled value for top groups X['y'] = X['y_orig'] for _i_g, (key, X) in enumerate(X_groups): # Just log where we are in the fitting process if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo( logger, "FB Prophet : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups)) X_path = os.path.join(tmp_folder, "fbprophet_X" + str(uuid.uuid4())) X = X.reset_index(drop=True) save_obj(X, X_path) grp_hash = self.get_hash(key) if grp_hash not in top_groups: continue priors[grp_hash] = X['y'].mean() args = (X_path, grp_hash, tmp_folder, self.params, self.cap) kwargs = {} pool.submit_tryget(None, MyParallelProphetTransformer_fit_async, args=args, kwargs=kwargs, out=grp_models) pool.finish() for k, v in grp_models.items(): grp_models[k] = load_obj(v) if v is not None else None remove(v) self._clean_tmp_folder(logger, tmp_folder) self.set_model_properties( model={ 'avg': model, 'group': grp_models, 'priors': priors, 'topgroups': top_groups, 'skl': scalers, 'gen_scaler': general_scaler }, features=self.tgc, # Prophet uses time 
            # ... and time group columns
            importances=np.ones(len(self.tgc)),
            iterations=-1  # Does not have iterations
        )
        return None
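
# --- Illustrative sketch (not part of the recipe) ---------------------------
# Minimal, standalone example of the scaling strategy used above: the target
# is min-max scaled within each time group so that large-scale groups do not
# dominate, then the scaled series are averaged per date before the "average"
# Prophet model is fitted. The toy frame and its 'store' column are
# hypothetical and only serve to illustrate the idea.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

toy = pd.DataFrame({
    'ds': pd.to_datetime(['2020-01-01', '2020-01-02'] * 2),
    'store': ['A', 'A', 'B', 'B'],
    'y': [10.0, 12.0, 100.0, 140.0],
})

scaled_parts = []
for store, grp in toy.groupby('store'):
    # One scaler per group, fitted on that group's target only
    y_scaled = MinMaxScaler().fit_transform(grp[['y']].values)
    scaled_parts.append(pd.DataFrame({'ds': grp['ds'].values, 'y': y_scaled.ravel()}))

# Average the scaled targets by date: this is the series the average model sees
avg_target = pd.concat(scaled_parts).groupby('ds')['y'].mean().reset_index()
print(avg_target)
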
def fit(self, X: dt.Frame, y: np.array = None, **kwargs): """ Fits FB Prophet models (1 per time group) using historical target values contained in y Model fitting is distributed over a pool of processes and uses file storage to share the data with workers :param X: Datatable frame containing the features :param y: numpy array containing the historical values of the target :return: self """ # Get the logger if it exists logger = self.get_experiment_logger() loggerinfo( logger, f"Prophet will use individual groups as well as average target data." ) tmp_folder = self._create_tmp_folder(logger) n_jobs = self._get_n_jobs(logger, **kwargs) # Reduce X to TGC tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) X = self.convert_to_prophet(X) # Add target, Label encoder is only used for Classif. which we don't support... if self.labels is not None: y = LabelEncoder().fit(self.labels).transform(y) X['y'] = np.array(y) self.prior_value = X['y'].mean() self.general_scaler = self.fit_scaler_to_median_target(X) X = self.scale_target_for_each_time_group(X, tgc_wo_time) self.avg_model = self.fit_prophet_model_on_average_target(X) # Go through individual time group columns and create avg models self.grp_models = {} self.priors = {} for grp_col in tgc_wo_time: self.grp_models[grp_col] = {} self.priors[grp_col] = {} X_groups = X[['ds', 'y', grp_col]].groupby(grp_col) nb_groups = len(X_groups) def processor(out, res): out[res[0]] = res[1] pool_to_use = small_job_pool loggerinfo(logger, f"Prophet will use {n_jobs} workers for fitting.") loggerinfo( logger, "Prophet parameters holidays {} / monthly {}".format( self.country_holidays, self.monthly_seasonality)) pool = pool_to_use(logger=None, processor=processor, num_tasks=nb_groups, max_workers=n_jobs) for _i_g, (key, X_grp) in enumerate(X_groups): # Just log where we are in the fitting process if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo( logger, "FB Prophet : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups)) X_path = os.path.join(tmp_folder, "fbprophet_X" + str(uuid.uuid4())) # Save target average for current group grp_hash = self.get_hash(key) self.priors[grp_col][grp_hash] = X_grp['y'].mean() # Average by date X_grp_avg = X_grp.groupby('ds')['y'].mean().reset_index() save_obj(X_grp_avg, X_path) params = { "country_holidays": self.country_holidays, "monthly_seasonality": self.monthly_seasonality } args = (X_path, grp_hash, tmp_folder, params) kwargs = {} pool.submit_tryget( None, MyProphetOnSingleGroupsTransformer_fit_async, args=args, kwargs=kwargs, out=self.grp_models[grp_col]) pool.finish() for k, v in self.grp_models[grp_col].items(): self.grp_models[grp_col][k] = load_obj( v) if v is not None else None remove(v) self._clean_tmp_folder(logger, tmp_folder) return self
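
# --- Illustrative sketch (not part of the recipe) ---------------------------
# Simplified, sequential version of the per-group loop above: one Prophet
# model per value of a single time group column, fitted on the date-averaged
# target. It drops the worker pool and the save_obj/load_obj file exchange the
# recipe uses; `fit_one_model_per_group`, its arguments, and the `min_rows`
# guard are hypothetical names chosen for this sketch.
import importlib

def fit_one_model_per_group(X, grp_col, min_rows=20):
    """Fit one Prophet model per value of `grp_col`; return models and target priors."""
    Prophet = getattr(importlib.import_module('fbprophet'), 'Prophet')
    models, priors = {}, {}
    for key, X_grp in X[['ds', 'y', grp_col]].groupby(grp_col):
        priors[key] = X_grp['y'].mean()  # fallback prediction if no model is kept
        X_avg = X_grp.groupby('ds')['y'].mean().reset_index()
        if X_avg.shape[0] < min_rows:    # arbitrary small-data guard for this sketch
            models[key] = None
            continue
        models[key] = Prophet().fit(X_avg)
    return models, priors
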
def transform(self, X: dt.Frame, **kwargs): """ Uses fitted models (1 per time group) to predict the target :param X: Datatable Frame containing the features :return: FB Prophet predictions """ # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) tmp_folder = self._create_tmp_folder(logger) n_jobs = self._get_n_jobs(logger, **kwargs) XX = X[:, self.tgc].to_pandas() XX = XX.replace([None, np.nan], 0) XX.rename(columns={self.time_column: "ds"}, inplace=True) tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) if len(tgc_wo_time) > 0: XX_grp = XX.groupby(tgc_wo_time) else: XX_grp = [([None], XX)] assert len(XX_grp) > 0 num_tasks = len(XX_grp) def processor(out, res): out.append(res) pool_to_use = small_job_pool loggerinfo(logger, "Prophet will use {} workers for transform".format(n_jobs)) pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs) XX_paths = [] model_paths = [] nb_groups = len(XX_grp) print("Nb Groups = ", nb_groups) for _i_g, (key, X) in enumerate(XX_grp): # Log where we are in the transformation of the dataset if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo( logger, "FB Prophet : %d%% of groups transformed" % (100 * (_i_g + 1) // nb_groups)) key = key if isinstance(key, list) else [key] grp_hash = '_'.join(map(str, key)) X_path = os.path.join(tmp_folder, "fbprophet_Xt" + str(uuid.uuid4())) # Commented for performance, uncomment for debug # print("prophet - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash)) if grp_hash in self.models: model = self.models[grp_hash] model_path = os.path.join( tmp_folder, "fbprophet_modelt" + str(uuid.uuid4())) save_obj(model, model_path) save_obj(X, X_path) model_paths.append(model_path) args = (model_path, X_path, self.priors[grp_hash], tmp_folder) kwargs = {} pool.submit_tryget( None, MyParallelProphetTransformer_transform_async, args=args, kwargs=kwargs, out=XX_paths) else: XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat']) # unseen groups XX.index = X.index save_obj(XX, X_path) XX_paths.append(X_path) pool.finish() XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index() for p in XX_paths + model_paths: remove(p) self._clean_tmp_folder(logger, tmp_folder) return XX
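
# --- Illustrative sketch (not part of the recipe) ---------------------------
# Toy illustration of the reassembly step at the end of transform() above:
# per-group prediction frames keep the original row index, unseen groups fall
# back to a constant prior, and a final sort_index() restores the input order.
# The frames and the `prior` value below are made up for the example.
import numpy as np
import pandas as pd

def assemble_predictions(per_group_frames):
    """Concatenate per-group 'yhat' frames and restore the original row order."""
    return pd.concat(per_group_frames, axis=0).sort_index()

seen = pd.DataFrame({'yhat': [1.5, 2.5]}, index=[0, 2])        # group with a fitted model
prior = 3.0                                                    # stands in for the stored prior
unseen = pd.DataFrame(np.full((1, 1), prior), columns=['yhat'], index=[1])  # unseen group
print(assemble_predictions([seen, unseen]))                    # rows come back as 0, 1, 2
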
def fit(self, X: dt.Frame, y: np.array = None, **kwargs):
        """
        Fits FB Prophet models (1 per time group) using historical target values contained in y
        Model fitting is distributed over a pool of processes and uses file storage to share the data with workers
        :param X: Datatable frame containing the features
        :param y: numpy array containing the historical values of the target
        :return: self
        """
        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Convert to pandas
        XX = X[:, self.tgc].to_pandas()
        XX = XX.replace([None, np.nan], 0)
        XX.rename(columns={self.time_column: "ds"}, inplace=True)

        # Make sure labels are numeric
        if self.labels is not None:
            y = LabelEncoder().fit(self.labels).transform(y)
        XX['y'] = np.array(y)

        # Set target prior
        self.nan_value = np.mean(y)

        # Group the input by TGC (Time group column) excluding the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        self.models = {}
        self.priors = {}

        # Prepare for multi processing
        num_tasks = len(XX_grp)

        def processor(out, res):
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        loggerinfo(logger,
                   "Prophet will use {} workers for fitting".format(n_jobs))
        loggerinfo(
            logger, "Prophet parameters holidays {} / monthly {}".format(
                self.country_holidays, self.monthly_seasonality))
        pool = pool_to_use(logger=None,
                           processor=processor,
                           num_tasks=num_tasks,
                           max_workers=n_jobs)

        # Fit 1 FB Prophet model per time group
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just log where we are in the fitting process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "FB Prophet : %d%% of groups fitted" %
                    (100 * (_i_g + 1) // nb_groups))

            X_path = os.path.join(tmp_folder,
                                  "fbprophet_X" + str(uuid.uuid4()))
            X = X.reset_index(drop=True)
            save_obj(X, X_path)
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))

            self.priors[grp_hash] = X['y'].mean()

            params = {
                "country_holidays": self.country_holidays,
                "monthly_seasonality": self.monthly_seasonality
            }

            args = (X_path, grp_hash, tmp_folder, params)
            kwargs = {}
            pool.submit_tryget(None,
                               MyParallelProphetTransformer_fit_async,
                               args=args,
                               kwargs=kwargs,
                               out=self.models)
        pool.finish()

        for k, v in self.models.items():
            self.models[k] = load_obj(v) if v is not None else None
            remove(v)

        self._clean_tmp_folder(logger, tmp_folder)

        return self
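
# --- Illustrative sketch (not part of the recipe) ---------------------------
# self.models and self.priors above are keyed by a string hash of the time
# group key ('_'.join of its string-converted parts). The helper below mirrors
# that convention; the tuple handling is a slightly more defensive variant of
# the list-only check used above, and the example keys are made up.
def group_hash(key):
    key = key if isinstance(key, (list, tuple)) else [key]
    return '_'.join(map(str, key))

assert group_hash(['store_1', 'dept_7']) == 'store_1_dept_7'
assert group_hash('store_1') == 'store_1'
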