def predict(self, X: dt.Frame, **kwargs):
    """
    Uses fitted models (1 per time group) to predict the target
    :param X: Datatable Frame containing the features
    :return: FB Prophet predictions
    """
    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir)

    if self.tgc is None or not all([x in X.names for x in self.tgc]):
        loggerdebug(logger, "Time group columns missing, returning mean target value as predictions")
        return np.ones(X.shape[0]) * self.nan_value

    tmp_folder = self._create_tmp_folder(logger)

    n_jobs = self._get_n_jobs(logger, **kwargs)

    # Reduce X to the time group columns and convert to pandas
    XX = X[:, self.tgc].to_pandas()
    XX = XX.replace([None, np.nan], 0)
    # Change date feature name to match Prophet requirements
    XX.rename(columns={self.time_column: "ds"}, inplace=True)

    if self.params["growth"] == "logistic":
        XX["cap"] = self.cap

    # Group by the time group columns, excluding the time column itself
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]

    assert len(XX_grp) > 0

    # Prepare for multi processing
    num_tasks = len(XX_grp)

    def processor(out, res):
        out.append(res)

    pool_to_use = small_job_pool
    loggerdebug(logger, "Prophet will use {} workers for transform".format(n_jobs))

    pool = pool_to_use(logger=None, processor=processor,
                       num_tasks=num_tasks, max_workers=n_jobs)

    XX_paths = []
    model_paths = []
    nb_groups = len(XX_grp)
    loggerdebug(logger, "Nb Groups = {}".format(nb_groups))
    for _i_g, (key, X) in enumerate(XX_grp):
        # Log where we are in the transformation of the dataset
        if (_i_g + 1) % max(1, nb_groups // 20) == 0:
            loggerinfo(logger, "FB Prophet : %d%% of groups transformed" % (100 * (_i_g + 1) // nb_groups))

        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))
        X_path = os.path.join(tmp_folder, "fbprophet_Xt" + str(uuid.uuid4()))

        # Commented for performance, uncomment for debug
        # print("prophet - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
        if grp_hash in self.models:
            model = self.models[grp_hash]
            model_path = os.path.join(tmp_folder, "fbprophet_modelt" + str(uuid.uuid4()))
            save_obj(model, model_path)
            save_obj(X, X_path)
            model_paths.append(model_path)

            args = (model_path, X_path, self.priors[grp_hash], tmp_folder)
            kwargs = {}
            pool.submit_tryget(None, MyParallelProphetTransformer_transform_async,
                               args=args, kwargs=kwargs, out=XX_paths)
        else:
            # Unseen groups get the mean target value as prediction
            XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat'])
            XX.index = X.index
            save_obj(XX, X_path)
            XX_paths.append(X_path)

    pool.finish()
    XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
    for p in XX_paths + model_paths:
        remove(p)

    self._clean_tmp_folder(logger, tmp_folder)

    return XX['yhat'].values
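# The parallel worker `MyParallelProphetTransformer_transform_async` used above is not
# shown in this excerpt. The sketch below is an assumption of its general shape, inferred
# only from the call site (model_path, X_path, prior, tmp_folder), from the processor that
# collects returned paths, and from the fact that the caller loads a 'yhat' frame from each
# path. It is not the recipe's actual implementation.
def MyParallelProphetTransformer_transform_async(model_path, X_path, prior, tmp_folder):
    model = load_obj(model_path)
    X = load_obj(X_path)
    if model is not None:
        with suppress_stdout_stderr():
            # Assumes Prophet returns predictions in the same row order as the input frame
            yhat = model.predict(X)['yhat'].values
        XX = pd.DataFrame(yhat, columns=['yhat'])
    else:
        # No usable model: fall back to the group prior passed in by the caller
        XX = pd.DataFrame(np.full((X.shape[0], 1), prior), columns=['yhat'])
    # Keep the original index so the caller can sort predictions back into place
    XX.index = X.index
    XX_path = os.path.join(tmp_folder, "fbprophet_XXt" + str(uuid.uuid4()))
    save_obj(XX, XX_path)
    remove(model_path)
    remove(X_path)
    return XX_path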
def predict(self, X: dt.Frame, **kwargs):
    """
    Uses fitted models (1 per time group) to predict the target
    :param X: Datatable Frame containing the features
    :return: FB Prophet predictions
    """
    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir)

    if self.tgc is None or not all([x in X.names for x in self.tgc]):
        loggerdebug(logger, "Time group columns missing, returning mean target value as predictions")
        return np.ones(X.shape[0]) * self.nan_value

    models, _, _, _ = self.get_model_properties()
    model = models['avg']
    grp_models = models['group']
    priors = models['priors']
    top_groups = models['topgroups']
    scalers = models['skl']
    general_scaler = models['gen_scaler']

    tmp_folder = self._create_tmp_folder(logger)

    n_jobs = self._get_n_jobs(logger, **kwargs)

    # Reduce X to TGC
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    X = X[:, self.tgc].to_pandas()

    # Fill NaNs or None
    X = X.replace([None, np.nan], 0)

    # Change date feature name to match Prophet requirements
    X.rename(columns={self.time_column: "ds"}, inplace=True)

    if self.params["growth"] == "logistic":
        X["cap"] = self.cap

    # Predict y using unique dates
    X_time = X[['ds']].groupby('ds').first().reset_index()
    with suppress_stdout_stderr():
        y_avg = model.predict(X_time)[['ds', 'yhat']]

    # Prophet transforms the date column to datetime so we need to transform that to merge back
    X_time.sort_values('ds', inplace=True)
    X_time['yhat'] = y_avg['yhat']
    X_time.sort_index(inplace=True)

    # Merge back into the original frame on 'ds'
    # pd.merge wipes the index ... so keep it to provide it again
    indices = X.index
    X = pd.merge(left=X, right=X_time[['ds', 'yhat']], on='ds', how='left')
    X.index = indices

    # Go through groups and recover the scaled target for known groups
    if len(tgc_wo_time) > 0:
        X_groups = X.groupby(tgc_wo_time)
    else:
        X_groups = [([None], X)]

    inverted_ys = []
    for key, X_grp in X_groups:
        grp_hash = self.get_hash(key)

        # Invert the target scaling for the current group
        if grp_hash in scalers.keys():
            inverted_y = scalers[grp_hash].inverse_transform(X_grp[['yhat']])
        else:
            inverted_y = general_scaler.inverse_transform(X_grp[['yhat']])

        # Put back in a DataFrame to keep track of the original index
        inverted_df = pd.DataFrame(inverted_y, columns=['yhat'])
        inverted_df.index = X_grp.index
        inverted_ys.append(inverted_df)

    XX_general = pd.concat(tuple(inverted_ys), axis=0).sort_index()

    if top_groups:
        # Go through the groups and predict only the top ones
        XX_paths = []
        model_paths = []

        def processor(out, res):
            out.append(res)

        num_tasks = len(top_groups)
        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None, processor=processor,
                           num_tasks=num_tasks, max_workers=n_jobs)

        nb_groups = len(X_groups)
        for _i_g, (key, X_grp) in enumerate(X_groups):
            # Just log where we are in the prediction process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(logger, "FB Prophet : %d%% of groups predicted" % (100 * (_i_g + 1) // nb_groups))

            # Create dict key to access scalers and models for this group
            grp_hash = self.get_hash(key)
            X_path = os.path.join(tmp_folder, "fbprophet_Xt" + str(uuid.uuid4()))

            if grp_hash not in top_groups:
                # Not a top group: leave NaN so the average-model prediction is kept
                XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat'])
                XX.index = X_grp.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
                continue

            if grp_models[grp_hash] is None:
                # No model could be fitted for this top group: fall back to the average model
                XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat'])
                XX.index = X_grp.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
                continue

            model = grp_models[grp_hash]
            model_path = os.path.join(tmp_folder, "fbprophet_modelt" + str(uuid.uuid4()))
            save_obj(model, model_path)
            save_obj(X_grp, X_path)
            model_paths.append(model_path)

            args = (model_path, X_path, priors[grp_hash], tmp_folder)
            kwargs = {}
            pool.submit_tryget(None, MyParallelProphetTransformer_transform_async,
                               args=args, kwargs=kwargs, out=XX_paths)

        pool.finish()
        XX_top_groups = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
        for p in XX_paths + model_paths:
            remove(p)

    self._clean_tmp_folder(logger, tmp_folder)

    features_df = pd.DataFrame()
    features_df['GrpAvg'] = XX_general['yhat']

    if top_groups:
        features_df[f'_Top{self.top_n}Grp'] = XX_top_groups['yhat']
        # Where a top-group model made a prediction, prefer it over the average model
        features_df.loc[features_df[f'_Top{self.top_n}Grp'].notnull(), 'GrpAvg'] = \
            features_df.loc[features_df[f'_Top{self.top_n}Grp'].notnull(), f'_Top{self.top_n}Grp']

    # Models have to return a numpy array
    return features_df['GrpAvg'].values
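# `self.get_hash(key)` is used above but not defined in this excerpt. Based on the inline
# pattern `'_'.join(map(str, key))` used in the other methods, a plausible helper could look
# like the sketch below; treat it as an assumption, not the recipe's actual code.
def get_hash(self, key):
    # Normalize the groupby key (scalar or tuple) to a list, then join its parts
    key = key if isinstance(key, (list, tuple)) else [key]
    return '_'.join(map(str, key))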
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    # Get TGC and time column
    self.tgc = self.params_base.get('tgc', None)
    self.time_column = self.params_base.get('time_column', None)
    self.nan_value = np.mean(y)
    # TODO Don't like this; we should compute a cap from average yearly growth
    self.cap = np.max(y) * 1.5
    self.prior = np.mean(y)

    if self.time_column is None:
        self.time_column = self.tgc[0]

    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir)

    loggerinfo(logger, "Start Fitting Prophet Model with params : {}".format(self.params))

    # Get temporary folders for multi process communication
    tmp_folder = self._create_tmp_folder(logger)

    n_jobs = self._get_n_jobs(logger, **kwargs)

    # Convert to pandas
    XX = X[:, self.tgc].to_pandas()
    XX = XX.replace([None, np.nan], 0)
    XX.rename(columns={self.time_column: "ds"}, inplace=True)

    # Make target available in the Frame
    XX['y'] = np.array(y)
    # Set target prior
    self.nan_value = np.mean(y)

    # Group the input by TGC (Time group columns) excluding the time column itself
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]

    self.models = {}
    self.priors = {}

    # Prepare for multi processing
    num_tasks = len(XX_grp)

    def processor(out, res):
        out[res[0]] = res[1]

    pool_to_use = small_job_pool
    loggerdebug(logger, "Prophet will use {} workers for fitting".format(n_jobs))

    pool = pool_to_use(logger=None, processor=processor,
                       num_tasks=num_tasks, max_workers=n_jobs)

    # Fit 1 FB Prophet model per time group
    nb_groups = len(XX_grp)
    for _i_g, (key, X) in enumerate(XX_grp):
        # Just log where we are in the fitting process
        if (_i_g + 1) % max(1, nb_groups // 20) == 0:
            loggerinfo(logger, "FB Prophet : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups))

        X_path = os.path.join(tmp_folder, "fbprophet_X" + str(uuid.uuid4()))
        X = X.reset_index(drop=True)
        save_obj(X, X_path)
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))

        self.priors[grp_hash] = X['y'].mean()

        args = (X_path, grp_hash, tmp_folder, self.params, self.cap)
        kwargs = {}
        pool.submit_tryget(None, MyParallelProphetTransformer_fit_async,
                           args=args, kwargs=kwargs, out=self.models)

    pool.finish()

    for k, v in self.models.items():
        self.models[k] = load_obj(v) if v is not None else None
        if v is not None:
            remove(v)

    self._clean_tmp_folder(logger, tmp_folder)

    return None
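# The fit worker `MyParallelProphetTransformer_fit_async` called above is not shown in this
# excerpt. The sketch below is an assumption of its general shape, inferred from the call
# site (X_path, grp_hash, tmp_folder, params, cap) and from the processor, which expects a
# (grp_hash, model_path_or_None) tuple back. It is not the recipe's actual implementation,
# and only the `growth` parameter is forwarded here for brevity.
def MyParallelProphetTransformer_fit_async(X_path, grp_hash, tmp_folder, params, cap):
    # Package is `prophet` on recent installs, `fbprophet` on older ones (assumption)
    from prophet import Prophet
    X = load_obj(X_path)
    # Not enough data to fit anything meaningful for this group
    if X.shape[0] < 2:
        return grp_hash, None
    model = Prophet(growth=params["growth"])
    cols = ['ds', 'y']
    if params["growth"] == "logistic":
        X["cap"] = cap
        cols.append('cap')
    try:
        with suppress_stdout_stderr():
            model.fit(X[cols])
    except Exception:
        # A failed fit is reported as None so the caller can fall back gracefully
        return grp_hash, None
    model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)  # data file no longer needed once the model is saved
    return grp_hash, model_path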
def predict(self, X, **kwargs):
    orig_cols = list(X.names)
    import pandas as pd
    import xgboost as xgb
    import numpy as np

    def sigmoid(x):
        z = 1.0 / (1.0 + np.exp(-x))
        return z

    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir)

    X = dt.Frame(X)
    X = X.to_pandas()

    if self.protected in list(X.columns):
        # Set the protected group to 0 and all others to 1
        loggerdebug(logger, "Protected test found")
        protected_test = np.array([
            int(item)
            for item in ~(np.array(X[self.protected]) == self.protected_label)
        ])
    else:
        loggerdebug(logger, "Protected test not found")
        protected_test = np.array([])

    if self.protected in list(X.columns):
        X = X.drop(self.protected, axis=1)

    # Replace missing values with a missing category
    # Replace categories that weren't in the training set with the mode
    if len(self.X_categorical) > 0:
        for colname in self.X_categorical:
            if colname in list(X.columns):
                X[colname] = list(X[colname].fillna("Missing"))

        for label in self.X_categorical:
            if label in list(X.columns):
                # Replace any category not seen in the training set with the training mode
                train_categories = self.train_levels[label]
                X_label = np.array(X[label])
                mmode = self.train_mode[label]
                X_label[~np.isin(X_label, train_categories)] = mmode
                X[label] = X_label

    # Replace missing numeric values with a missing value code
    if len(self.X_numeric) > 0:
        for colname in self.X_numeric:
            if colname in list(X.columns):
                X[colname] = list(X[colname].fillna(-999))

    # Get model
    model, _, _, _ = self.get_model_properties()

    # Remove the protected group
    if self.protected in self.X_categorical:
        self.X_categorical.remove(self.protected)

    # One hot encode categorical features
    if len(self.X_categorical) > 0:
        X_enc = self.enc.transform(X[self.X_categorical]).toarray()
        X = pd.concat([
            X[self.X_numeric],
            pd.DataFrame(X_enc, columns=self.encoded_categories)
        ], axis=1)

    d_test = xgb.DMatrix(X, missing=np.nan)

    # If the positive target was 0, change the final result to 1-p
    if self.positive_target == 0:
        preds = 1.0 - sigmoid(model.predict(d_test))
    else:
        preds = sigmoid(model.predict(d_test))

    mean_preds = np.mean(preds)

    # Set a penalty value to which some probabilities will be changed
    # if the fairness threshold isn't reached
    epsilon = 0.0001
    if mean_preds > 0.5:
        penalty = epsilon
    else:
        penalty = 1.0 - epsilon

    # Only apply penalties in the training stage
    if self.is_train:
        # If the protected value was removed, use the maximum penalty
        # by changing all probabilities to the penalty value
        # (the recipe needs to be able to use the protected values)
        if self.protected == "none":
            preds[0:len(preds)] = penalty
            loggerdata(logger, str(preds))
            loggerdata(logger, "Removal_penalty")
        else:
            # The mean ratio calculation for target=0 and target=1
            if self.positive_target == 0:
                if np.mean(preds[protected_test == 1]) < 1.0:
                    DI = (1.0 - np.mean(preds[protected_test == 0])) / (
                        1.0 - np.mean(preds[protected_test == 1]))
                else:
                    DI = 1
            else:
                if np.mean(preds[protected_test == 1]) > 0.0:
                    DI = np.mean(preds[protected_test == 0]) / np.mean(
                        preds[protected_test == 1])
                else:
                    DI = 1

            loggerdata(logger, "Mean ratio Check")
            loggerdata(logger, str(DI))

            if DI < self.mean_protected_prediction_ratio_minimum:
                # Create a penalty proportional to the distance below the specified threshold
                len_preds = len(preds)
                num_penalty = min(
                    len_preds,
                    int((self.mean_protected_prediction_ratio_minimum - DI) /
                        self.mean_protected_prediction_ratio_minimum * len_preds))

                preds[0:num_penalty] = penalty
                loggerdata(logger, "num_penalty1")
                loggerdata(logger, str(num_penalty), str(num_penalty / len(preds)))

    self.is_train = False

    return preds
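# Illustration only: a standalone rewrite of the mean-ratio (disparate impact) and
# penalty-count arithmetic used above, handy for sanity-checking the numbers outside the
# recipe. `disparate_impact` and `penalty_count` are hypothetical helper names, not part
# of the recipe; the logic mirrors the branches above.
import numpy as np

def disparate_impact(preds, protected_test, positive_target):
    # Mean prediction for the protected group (coded 0) over the rest (coded 1)
    p_protected = np.mean(preds[protected_test == 0])
    p_rest = np.mean(preds[protected_test == 1])
    if positive_target == 0:
        return (1.0 - p_protected) / (1.0 - p_rest) if p_rest < 1.0 else 1
    return p_protected / p_rest if p_rest > 0.0 else 1

def penalty_count(di, threshold, n):
    # Number of predictions to overwrite, proportional to how far DI sits below the threshold
    if di >= threshold:
        return 0
    return min(n, int((threshold - di) / threshold * n))

# Example: a DI of 0.6 against a 0.8 threshold on 1000 rows flags
# min(1000, int((0.8 - 0.6) / 0.8 * 1000)) = 250 predictions for the penalty value.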