def _prepare_validation_databunch(self, dataframe):
    """Build two unlabeled tabular databunches over ``dataframe``.

    Both databunches are created from the full ``dataframe``; they differ
    only in which row positions are handed to ``split_by_idx``: the first
    uses positions ``0 .. len/2 - 1``, the second ``len/2 .. len - 1``.

    Args:
        dataframe: the frame to wrap. ``FillMissing`` is applied to it
            directly and its return value is discarded, so the call is
            relied on for its effect on ``dataframe`` itself.

    Returns:
        A ``(databunch_half, databunch_second_half)`` tuple.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)

        # Only Windows gets an explicit ``num_workers=0``.
        loader_kwargs = {}
        if sys.platform == 'win32':
            loader_kwargs['num_workers'] = 0

        filler = FillMissing(self._categorical_variables,
                             self._continuous_variables)
        filler.add_col = False
        filler(dataframe)

        n_rows = len(dataframe)
        midpoint = int(n_rows / 2)

        def build(split_positions):
            # NOTE(review): only the *name* of a throw-away temporary file
            # is used as the fastai ``path``; the file object itself is
            # dropped right away.
            items = TabularList.from_df(
                dataframe,
                path=tempfile.NamedTemporaryFile().name,
                cat_names=self._categorical_variables,
                cont_names=self._continuous_variables,
                procs=[Categorify, Normalize])
            unlabeled = items.split_by_idx(split_positions).label_empty()
            return unlabeled.databunch(**loader_kwargs)

        databunch_half = build(list(range(midpoint)))
        databunch_second_half = build(list(range(midpoint, n_rows)))
        return databunch_half, databunch_second_half
def get_pred_new_data_old_model(
        valid_df: pd.DataFrame,
        path: Path = MODELS_PATH) -> Tuple[Learner, float]:
    """Score the existing best model on new validation data.

    Input: a pd.DataFrame with the validation data and the directory that
    holds the saved models.

    Output: the loaded learner (ready to save) and the root mean squared
    percentage error (RMSPE) of its predicted sales. (If this model ends up
    second-best, we'll still want to save it to a different file for
    record-keeping purposes.)

    Raises IndexError when no file in ``path`` starts with 'current_best'.
    """
    valid_df = preprocess.preprocess(valid_df)

    # Pick the model file: among the 'current_best*' entries, take the one
    # that sorts last by name.
    candidates = []
    for entry in os.listdir(path):
        if entry.startswith('current_best'):
            candidates.append(entry)
    candidates.sort(reverse=True)
    best_model = candidates[0]

    test_items = TabularList.from_df(valid_df, path=path)
    learn = load_learner(path=path, fname=best_model, test=test_items)

    # The model outputs log-scale predictions; exponentiate before scoring.
    log_preds, _ = learn.get_preds(ds_type=DatasetType.Test)
    valid_preds = np.exp(np.array(log_preds.flatten()))

    # NOTE(review): the actuals drop rows with sales == 0 while the
    # predictions cover every row of valid_df; presumably preprocess()
    # already removed zero-sales rows - confirm.
    has_sales = valid_df.sales != 0
    valid_reals = valid_df.loc[has_sales, 'sales'].values

    new_rmspe = rmspe(valid_preds, valid_reals)
    return (learn, new_rmspe)
def _preprocess_train(self, X_train, y_train, X_val, y_val, **kwargs):
    """Preprocess the train/validation data and wrap it in a fastai databunch.

    Args:
        X_train: training features; preprocessed with ``fit=True``.
        y_train: training targets.
        X_val: validation features, or None.
        y_val: validation targets, or None.
        **kwargs: accepted but not used here.

    Returns:
        The databunch built from the combined dataset.

    Side effects: sets ``self.procs`` and, for scaled regression, fits
    ``self.y_scaler`` on ``y_train``.
    """
    from fastai.core import defaults
    from fastai.data_block import FloatList
    from fastai.tabular import TabularList

    X_train = self.preprocess(X_train, fit=True)
    if X_val is not None:
        X_val = self.preprocess(X_val)

    from fastai.tabular import Categorify, FillMissing, Normalize
    self.procs = [FillMissing, Categorify, Normalize]

    scale_targets = self.problem_type == REGRESSION and self.y_scaler is not None
    if not scale_targets:
        y_train_norm, y_val_norm = y_train, y_val
    else:
        train_column = y_train.values.reshape(-1, 1)
        y_train_norm = pd.Series(self.y_scaler.fit_transform(train_column).reshape(-1))
        y_val_norm = None
        if y_val is not None:
            val_column = y_val.values.reshape(-1, 1)
            y_val_norm = pd.Series(self.y_scaler.transform(val_column).reshape(-1))
        logger.log(0, f'Training with scaled targets: {self.y_scaler} - !!! NN training metric will be different from the final results !!!')

    logger.log(15, f'Using {len(self.cont_columns)} cont features')
    df_train, train_idx, val_idx = self._generate_datasets(X_train, y_train_norm, X_val, y_val_norm)

    if self.problem_type == REGRESSION:
        label_class = FloatList
    else:
        label_class = None

    # Extra workers only help when fork is enabled; in other multiprocessing
    # modes the communication overhead reduces performance.
    num_workers = defaults.cpus if is_fork_enabled() else 0

    # TabularList mutates the name lists it receives, so hand it copies.
    items = TabularList.from_df(
        df_train,
        path=self.path,
        cat_names=self.cat_columns.copy(),
        cont_names=self.cont_columns.copy(),
        procs=self.procs,
    )
    labeled = items.split_by_idxs(train_idx, val_idx).label_from_df(cols=LABEL, label_cls=label_class)

    # Use the configured batch size only when there are more training rows
    # than that; otherwise use 32.
    batch_size = self.params['bs'] if len(X_train) > self.params['bs'] else 32
    data = labeled.databunch(bs=batch_size, num_workers=num_workers)
    return data
def predict(self, dataframe):
    """Run the wrapped learner on ``dataframe`` and return its outputs.

    The frame is attached to the learner's data as the test set, using the
    learner's own continuous column names.

    Returns:
        A DataFrame with a 'predictions' column (one numpy array per row)
        and a 'target' column, both taken from ``get_preds`` on the test
        set.
    """
    learner = self.learner
    test_items = TabularList.from_df(dataframe, cont_names=learner.data.cont_names)
    learner.data.add_test(test_items)

    raw_preds, raw_target = learner.get_preds(DatasetType.Test)

    # Each prediction row stays a numpy array inside a single column.
    prediction_column = pd.Series(map(np.array, raw_preds.numpy()), name='predictions')
    target_column = pd.Series(raw_target.numpy(), name='target')
    return pd.concat([prediction_column, target_column], axis='columns')
def test_model_save_load(fastai_model, model_path):
    """Save a fastai learner with mlflow, reload it, and compare outputs.

    The learner is reloaded both natively and as a pyfunc model. Raw
    ``get_preds`` outputs and wrapper-level ``predict`` results must agree
    between the original and the reloaded models.
    """
    original = fastai_model.model
    inference_df = fastai_model.inference_dataframe

    mlflow.fastai.save_model(fastai_learner=original, path=model_path)
    reloaded_model = mlflow.fastai.load_model(model_uri=model_path)
    reloaded_pyfunc = pyfunc.load_model(model_uri=model_path)

    # Verify the reloaded model computes the same predictions as the original.
    test_data = TabularList.from_df(inference_df)
    original.data.add_test(test_data)
    reloaded_model.data.add_test(test_data)

    real_preds, real_target = (
        output.numpy() for output in original.get_preds(DatasetType.Test))
    reloaded_preds, reloaded_target = (
        output.numpy() for output in reloaded_model.get_preds(DatasetType.Test))

    np.testing.assert_array_almost_equal(real_preds, reloaded_preds)
    np.testing.assert_array_almost_equal(real_target, reloaded_target)

    # Compare the three prediction entry points: wrapper around the original,
    # wrapper around the reloaded learner, and the pyfunc flavor.
    model_wrapper = mlflow.fastai._FastaiModelWrapper(original)
    reloaded_model_wrapper = mlflow.fastai._FastaiModelWrapper(reloaded_model)

    model_result = model_wrapper.predict(inference_df)
    reloaded_result = reloaded_model_wrapper.predict(inference_df)
    pyfunc_result = reloaded_pyfunc.predict(inference_df)

    compare_wrapper_results(model_result, reloaded_result)
    compare_wrapper_results(reloaded_result, pyfunc_result)
def _predict_proba(self, X, **kwargs):
    """Predict on ``X`` with the fitted fastai model.

    Args:
        X: features to predict on.
        **kwargs: forwarded to ``self.preprocess``.

    Returns:
        * regression: a flat array, inverse-transformed with
          ``self.y_scaler`` when one is set;
        * binary: column 1 of the predictions;
        * otherwise: the full prediction array.
    """
    from fastai.basic_data import DatasetType
    from fastai.tabular import TabularList
    from fastai.utils.mod_display import progress_disabled_ctx

    X = self.preprocess(X, **kwargs)

    # fastai has issues predicting on a single row, so the row is duplicated
    # here and the extra prediction is trimmed off again below.
    single_row = len(X) == 1
    if single_row:
        X = pd.concat([X, X]).reset_index(drop=True)

    # TabularList mutates the name lists it receives, so hand it copies.
    test_items = TabularList.from_df(
        X,
        cat_names=self.cat_columns.copy(),
        cont_names=self.cont_columns.copy(),
        procs=self.procs,
    )
    self.model.data.add_test(test_items)

    with progress_disabled_ctx(self.model) as model:
        preds, _ = model.get_preds(ds_type=DatasetType.Test)
    if single_row:
        preds = preds[:1, :]

    if self.problem_type == REGRESSION:
        values = preds.numpy()
        if self.y_scaler is not None:
            values = self.y_scaler.inverse_transform(values)
        return values.reshape(-1)
    if self.problem_type == BINARY:
        return preds[:, 1].numpy()
    return preds.numpy()
def predict_proba(self, X, preprocess=True):
    """Predict on ``X`` with the fitted fastai model.

    Args:
        X: features to predict on.
        preprocess: when true, ``X`` is first passed through
            ``self.preprocess``.

    Returns:
        * regression: a flat array, inverse-transformed with
          ``self.y_scaler`` when one is set;
        * binary: column 1 of the predictions;
        * otherwise: the full prediction array.
    """
    from fastai.basic_data import DatasetType
    from fastai.tabular import TabularList
    from fastai.utils.mod_display import progress_disabled_ctx
    from fastai.tabular import FillMissing, Categorify, Normalize

    if preprocess:
        X = self.preprocess(X)

    procs = [FillMissing, Categorify, Normalize]
    # NOTE(review): the column-name lists are passed as-is (not copied).
    test_items = TabularList.from_df(
        X,
        cat_names=self.cat_columns,
        cont_names=self.cont_columns,
        procs=procs,
    )
    self.model.data.add_test(test_items)

    with progress_disabled_ctx(self.model) as model:
        preds, _ = model.get_preds(ds_type=DatasetType.Test)

    if self.problem_type == REGRESSION:
        values = preds.numpy()
        if self.y_scaler is not None:
            values = self.y_scaler.inverse_transform(values)
        return values.reshape(-1)
    if self.problem_type == BINARY:
        return preds[:, 1].numpy()
    return preds.numpy()
def iris_data():
    """Return a fastai databunch built from the first two iris features.

    The frame holds the first two feature columns plus a "label" column
    with the iris target; 10% of the rows are split off at random
    (seed 42) for validation.
    """
    iris = datasets.load_iris()
    feature_names = iris.feature_names[:2]
    X = pd.DataFrame(iris.data[:, :2], columns=feature_names)
    y = pd.Series(iris.target, name="label")

    frame = pd.concat([X, y], axis=1)
    items = TabularList.from_df(frame, cont_names=list(X.columns))
    split = items.split_by_rand_pct(valid_pct=0.1, seed=42)
    labeled = split.label_from_df(cols="label")
    return labeled.databunch()
def load(self, ext_magpie:pd.DataFrame, **db_kwargs):
    """Rebuild the learner with ``ext_magpie`` attached as the test set.

    Adapted from fastai's ``load_learner``.

    Args:
        ext_magpie: frame added to ``self.src`` as test data.
        **db_kwargs: extra keyword arguments for ``self.src.databunch``.

    Returns:
        ``self``, with the rebuilt learner stored in ``self.learn``.
    """
    test_items = TabularList.from_df(ext_magpie)
    self.src.add_test(test_items, tfm_y=None)
    data = self.src.databunch(bs=self.batch_size, **db_kwargs)

    learner = self.clas_func(data, self.model, **self.state)
    # Assigned after construction to avoid duplicates.
    learner.callback_fns = self.callback_fns

    restored_callbacks = []
    for key, saved in self.cb_state.items():
        restored_callbacks.append(load_callback(key, saved, learner))
    learner.callbacks = restored_callbacks

    self.learn = learner
    return self
def predict(self, dataframe):
    """Run the wrapped learner on ``dataframe`` and return its outputs.

    fastai is imported inside the function. The frame is attached to the
    learner's data as the test set, using the learner's own continuous
    column names.

    Returns:
        A DataFrame with a "predictions" column (one numpy array per row)
        and a "target" column, both taken from ``get_preds`` on the test
        set.
    """
    from fastai.tabular import TabularList
    from fastai.basic_data import DatasetType

    learner = self.learner
    test_items = TabularList.from_df(dataframe, cont_names=learner.data.cont_names)
    learner.data.add_test(test_items)

    raw_preds, raw_target = learner.get_preds(DatasetType.Test)

    # Each prediction row stays a numpy array inside a single column.
    prediction_column = pd.Series(map(np.array, raw_preds.numpy()), name="predictions")
    target_column = pd.Series(raw_target.numpy(), name="target")
    return pd.concat([prediction_column, target_column], axis="columns")
def fastai_model():
    """Train a tiny tabular learner on two iris features.

    Returns:
        A ``ModelWithData`` holding the learner (fitted for one epoch) and
        the feature frame to use for inference.
    """
    iris = datasets.load_iris()
    feature_names = iris.feature_names[:2]
    X = pd.DataFrame(iris.data[:, :2], columns=feature_names)
    y = pd.Series(iris.target, name="label")

    frame = pd.concat([X, y], axis=1)
    items = TabularList.from_df(frame, cont_names=list(X.columns))
    split = items.split_by_rand_pct(valid_pct=0.1, seed=42)
    data = split.label_from_df(cols="label").databunch()

    model = tabular_learner(data, metrics=accuracy, layers=[3])
    model.fit(1)
    return ModelWithData(model=model, inference_dataframe=X)
def preprocess_train(self, X_train, y_train, X_val, y_val, **kwargs):
    """Select features, fill missing values and build the fastai databunch.

    Args:
        X_train: training features.
        y_train: training targets.
        X_val: validation features, or None.
        y_val: validation targets, or None.
        **kwargs: accepted but not used here.

    Returns:
        The databunch built from the combined dataset.

    Side effects: sets ``self.cat_columns`` / ``self.cont_columns``, fills
    ``self.columns_fills`` and, for scaled regression, fits
    ``self.y_scaler`` on ``y_train``.
    """
    from fastai.data_block import FloatList
    from fastai.tabular import TabularList
    from fastai.core import defaults

    self.cat_columns = self.feature_metadata.get_features(valid_raw_types=[R_OBJECT, R_CATEGORY, R_BOOL])
    self.cont_columns = self.feature_metadata.get_features(valid_raw_types=[R_INT, R_FLOAT, R_DATETIME])

    if self.problem_type == REGRESSION and self.y_scaler is not None:
        y_train_norm = pd.Series(self.y_scaler.fit_transform(y_train.values.reshape(-1, 1)).reshape(-1))
        y_val_norm = pd.Series(self.y_scaler.transform(y_val.values.reshape(-1, 1)).reshape(-1)) if y_val is not None else None
        logger.log(0, f'Training with scaled targets: {self.y_scaler} - !!! NN training metric will be different from the final results !!!')
    else:
        y_train_norm = y_train
        y_val_norm = y_val

    # Best effort: find columns whose 'unique' statistic is above the
    # configured limit or missing. Any ordinary failure (for example no
    # 'unique' statistic at all) means nothing is dropped; unlike a bare
    # ``except``, KeyboardInterrupt and SystemExit still propagate.
    try:
        X_train_stats = X_train.describe(include='all').T.reset_index()
        cat_cols_to_drop = X_train_stats[(X_train_stats['unique'] > self.params.get('max_unique_categorical_values', 10000)) | (X_train_stats['unique'].isna())]['index'].values
    except Exception:
        cat_cols_to_drop = []

    # Build the lookup sets once instead of once per candidate column.
    train_columns = set(X_train.columns)
    dropped_columns = set(cat_cols_to_drop)

    cat_cols_to_use = [col for col in self.cat_columns if col in train_columns and col not in dropped_columns]
    logger.log(15, f'Using {len(cat_cols_to_use)}/{len(self.cat_columns)} categorical features')
    self.cat_columns = cat_cols_to_use
    self.cont_columns = [col for col in self.cont_columns if col in train_columns]

    # Fill values: the MISSING marker for categoricals, the training mean
    # for continuous columns.
    for c in self.cat_columns:
        self.columns_fills[c] = MISSING
    for c in self.cont_columns:
        self.columns_fills[c] = X_train[c].mean()

    X_train = self.fill_missing(X_train)
    logger.log(15, f'Using {len(self.cont_columns)} cont features')

    X_train = self.fold_preprocess(X_train, fit=True)
    if X_val is not None:
        X_val = self.fill_missing(X_val)
        X_val = self.fold_preprocess(X_val)

    df_train, train_idx, val_idx = self._generate_datasets(X_train, y_train_norm, X_val, y_val_norm)
    label_class = FloatList if self.problem_type == REGRESSION else None

    # Extra workers only help when fork is enabled; in other multiprocessing
    # modes the communication overhead reduces performance.
    num_workers = defaults.cpus if is_fork_enabled() else 0

    # TabularList mutates the name lists it receives, so hand it copies.
    data = (TabularList.from_df(df_train, path=self.path,
                                cat_names=self.cat_columns.copy(), cont_names=self.cont_columns.copy(), procs=self.procs)
            .split_by_idxs(train_idx, val_idx)
            .label_from_df(cols=LABEL, label_cls=label_class)
            .databunch(bs=self.params['bs'] if len(X_train) > self.params['bs'] else 32, num_workers=num_workers))
    return data
def databunch(df: pd.DataFrame, dependent_var: str = "resource_template") -> DataBunch:
    """Build a fastai DataBunch from ``df`` for predicting ``dependent_var``.

    Args:
        df: source frame. "subject" and "group" are used as categorical
            columns; every column from the fourth onward is used as a
            continuous column (per the original comment: all predicates in
            the graph).
        dependent_var: label column; defaults to "resource_template".

    Returns:
        The DataBunch, with the last 20% of rows used as the validation
        split and a copy of those same rows attached as the test set.
    """
    category_names = ["subject", "group"]
    continuous_names = list(df.keys())[3:]
    procedures = [Categorify]

    # Reserve the last 20% of the frame for validation.
    total = len(df)
    last_start = total - int(total * 0.2)

    holdout = df[last_start:total].copy()
    test = TabularList.from_df(
        holdout,
        cat_names=category_names,
        cont_names=continuous_names,
    )

    items = TabularList.from_df(
        df,
        cat_names=category_names,
        cont_names=continuous_names,
        procs=procedures,
    )
    validation_positions = list(range(last_start, total))
    labeled = items.split_by_idx(validation_positions).label_from_df(cols=dependent_var)
    return labeled.add_test(test).databunch()
def preprocess_train(self, X_train, y_train, X_val, y_val, **kwargs):
    """Select features by dtype and build the fastai databunch.

    Args:
        X_train: training features.
        y_train: training targets.
        X_val: validation features, or None.
        y_val: validation targets, or None.
        **kwargs: accepted but not used here.

    Returns:
        The databunch built from the combined dataset.

    Side effects: sets ``self.cat_columns`` / ``self.cont_columns`` and,
    for scaled regression, fits ``self.y_scaler`` on ``y_train``.
    """
    from fastai.data_block import FloatList
    from fastai.tabular import TabularList
    from fastai.tabular import FillMissing, Categorify, Normalize
    from fastai.core import defaults

    self.cat_columns = X_train.select_dtypes([
        'category', 'object', 'bool', 'bool_'
    ]).columns.values.tolist()
    self.cont_columns = X_train.select_dtypes([
        'float', 'float_', 'float16', 'float32', 'float64',
        'int', 'int_', 'int8', 'int16', 'int32', 'int64',
        'uint8', 'uint16', 'uint32', 'uint64', 'datetime'
    ]).columns.values.tolist()

    if self.problem_type == REGRESSION and self.y_scaler is not None:
        y_train_norm = pd.Series(self.y_scaler.fit_transform(y_train.values.reshape(-1, 1)).reshape(-1))
        y_val_norm = pd.Series(self.y_scaler.transform(y_val.values.reshape(-1, 1)).reshape(-1)) if y_val is not None else None
        logger.log(0, f'Training with scaled targets: {self.y_scaler} - !!! NN training metric will be different from the final results !!!')
    else:
        y_train_norm = y_train
        y_val_norm = y_val

    # Best effort: find columns whose 'unique' statistic is above the
    # configured limit or missing. Any ordinary failure (for example no
    # 'unique' statistic at all) means nothing is dropped; unlike a bare
    # ``except``, KeyboardInterrupt and SystemExit still propagate.
    try:
        X_train_stats = X_train.describe(include='all').T.reset_index()
        cat_cols_to_drop = X_train_stats[(X_train_stats['unique'] > self.params.get('max_unique_categorical_values', 10000)) | (X_train_stats['unique'].isna())]['index'].values
    except Exception:
        cat_cols_to_drop = []

    # Build the lookup sets once instead of once per candidate column.
    train_columns = set(X_train.columns)
    dropped_columns = set(cat_cols_to_drop)

    cat_cols_to_use = [col for col in self.cat_columns if col in train_columns and col not in dropped_columns]
    logger.log(15, f'Using {len(cat_cols_to_use)}/{len(self.cat_columns)} categorical features')
    self.cat_columns = cat_cols_to_use
    self.cont_columns = [col for col in self.cont_columns if col in train_columns]
    logger.log(15, f'Using {len(self.cont_columns)} cont features')

    X_train = self.fold_preprocess(X_train, fit=True)
    if X_val is not None:
        X_val = self.fold_preprocess(X_val)

    df_train, train_idx, val_idx = self._generate_datasets(X_train, y_train_norm, X_val, y_val_norm)
    label_class = FloatList if self.problem_type == REGRESSION else None
    procs = [FillMissing, Categorify, Normalize]

    # Extra workers only help when fork is enabled; in other multiprocessing
    # modes the communication overhead reduces performance.
    num_workers = defaults.cpus if is_fork_enabled() else 0

    # NOTE(review): the column-name lists are handed over uncopied, so any
    # in-place change made by the fastai pipeline is visible on self
    # afterwards. Left as-is because later prediction code may depend on
    # it - confirm before switching to copies.
    data = (TabularList.from_df(df_train, path=self.path, cat_names=self.cat_columns, cont_names=self.cont_columns, procs=procs)
            .split_by_idxs(train_idx, val_idx)
            .label_from_df(cols=LABEL, label_cls=label_class)
            .databunch(bs=self.params['bs'] if len(X_train) > self.params['bs'] else 32, num_workers=num_workers))
    return data
def _preprocess_train(self, X, y, X_val, y_val, num_workers):
    """Preprocess the train/validation data and wrap it in a fastai databunch.

    Args:
        X: training features; preprocessed with ``fit=True``.
        y: training targets.
        X_val: validation features, or None.
        y_val: validation targets, or None.
        num_workers: passed straight through to ``databunch``.

    Returns:
        The databunch built from the combined dataset.

    Side effects: sets ``self.procs`` and, for scaled regression, fits
    ``self.y_scaler`` on ``y``.
    """
    from fastai.data_block import FloatList
    from fastai.tabular import TabularList

    X = self.preprocess(X, fit=True)
    if X_val is not None:
        X_val = self.preprocess(X_val)

    from fastai.tabular import Categorify, FillMissing, Normalize
    self.procs = [FillMissing, Categorify, Normalize]

    scale_targets = self.problem_type == REGRESSION and self.y_scaler is not None
    if not scale_targets:
        y_norm, y_val_norm = y, y_val
    else:
        train_column = y.values.reshape(-1, 1)
        y_norm = pd.Series(self.y_scaler.fit_transform(train_column).reshape(-1))
        y_val_norm = None
        if y_val is not None:
            val_column = y_val.values.reshape(-1, 1)
            y_val_norm = pd.Series(self.y_scaler.transform(val_column).reshape(-1))
        logger.log(
            0,
            f'Training with scaled targets: {self.y_scaler} - !!! NN training metric will be different from the final results !!!'
        )

    logger.log(15, f'Using {len(self.cont_columns)} cont features')
    df_train, train_idx, val_idx = self._generate_datasets(X, y_norm, X_val, y_val_norm)

    if self.problem_type == REGRESSION:
        label_class = FloatList
    else:
        label_class = None

    # TabularList mutates the name lists it receives, so hand it copies.
    items = TabularList.from_df(
        df_train,
        path=self.path,
        cat_names=self.cat_columns.copy(),
        cont_names=self.cont_columns.copy(),
        procs=self.procs,
    )
    labeled = items.split_by_idxs(train_idx, val_idx).label_from_df(cols=LABEL, label_cls=label_class)

    # Use the configured batch size only when there are more training rows
    # than that; otherwise use 32.
    batch_size = self.params['bs'] if len(X) > self.params['bs'] else 32
    data = labeled.databunch(bs=batch_size, num_workers=num_workers)
    return data
def preprocess_train(self, X_train, Y_train, X_test, Y_test, **kwargs):
    """Select features by dtype and build the fastai databunch.

    Args:
        X_train: training features.
        Y_train: training targets.
        X_test: held-out features, or None.
        Y_test: held-out targets, or None.
        **kwargs: accepted but not used here.

    Returns:
        The databunch built from the combined dataset.

    Side effects: sets ``self.cat_columns`` / ``self.cont_columns`` and,
    for scaled regression, fits ``self.y_scaler`` on ``Y_train``.
    """
    from fastai.data_block import FloatList
    from fastai.tabular import TabularList
    from fastai.tabular import FillMissing, Categorify, Normalize

    self.cat_columns = X_train.select_dtypes(['category', 'object']).columns.values.tolist()
    self.cont_columns = X_train.select_dtypes(['float', 'int', 'datetime']).columns.values.tolist()

    if self.problem_type == REGRESSION and self.y_scaler is not None:
        Y_train_norm = pd.Series(
            self.y_scaler.fit_transform(Y_train.values.reshape(-1, 1)).reshape(-1))
        Y_test_norm = pd.Series(
            self.y_scaler.transform(Y_test.values.reshape(-1, 1)).reshape(-1)) if Y_test is not None else None
        logger.log(
            0,
            f'Training with scaled targets: {self.y_scaler} - !!! NN training metric will be different from the final results !!!'
        )
    else:
        Y_train_norm = Y_train
        Y_test_norm = Y_test

    # Best effort: find columns whose 'unique' statistic is above the
    # configured limit or missing. Any ordinary failure (for example no
    # 'unique' statistic at all) means nothing is dropped; unlike a bare
    # ``except``, KeyboardInterrupt and SystemExit still propagate.
    try:
        X_train_stats = X_train.describe(include='all').T.reset_index()
        cat_cols_to_drop = X_train_stats[
            (X_train_stats['unique'] > self.params.get('max_unique_categorical_values', 10000))
            | (X_train_stats['unique'].isna())]['index'].values
    except Exception:
        cat_cols_to_drop = []

    # Build the lookup sets once instead of once per candidate column.
    train_columns = set(X_train.columns)
    dropped_columns = set(cat_cols_to_drop)

    cat_cols_to_use = [
        col for col in self.cat_columns
        if col in train_columns and col not in dropped_columns
    ]
    logger.log(
        15,
        f'Using {len(cat_cols_to_use)}/{len(self.cat_columns)} categorical features'
    )
    self.cat_columns = cat_cols_to_use
    self.cont_columns = [col for col in self.cont_columns if col in train_columns]
    logger.log(15, f'Using {len(self.cont_columns)} cont features')

    X_train = self.fold_preprocess(X_train, fit=True)
    if X_test is not None:
        X_test = self.fold_preprocess(X_test)

    df_train, train_idx, val_idx = self._generate_datasets(
        X_train, Y_train_norm, X_test, Y_test_norm)
    label_class = FloatList if self.problem_type == REGRESSION else None
    procs = [FillMissing, Categorify, Normalize]

    # NOTE(review): the column-name lists are handed over uncopied, so any
    # in-place change made by the fastai pipeline is visible on self
    # afterwards. Left as-is because later prediction code may depend on
    # it - confirm before switching to copies.
    data = (TabularList.from_df(
        df_train,
        path=self.path,
        cat_names=self.cat_columns,
        cont_names=self.cont_columns,
        procs=procs).split_by_idxs(train_idx, val_idx).label_from_df(
            cols=LABEL, label_cls=label_class).databunch(
                bs=self.params['bs'] if len(X_train) > self.params['bs'] else 32))
    return data
def get_new_model_and_pred(train: pd.DataFrame,
                           valid: pd.DataFrame,
                           path: Path = MODELS_PATH) -> Tuple[Learner, float]:
    """Take new train and validation dataframes, re-run the model, and return
    the model and its root mean squared percentage error.

    Input: the train dataframe, the validation dataframe, and the path for
    the models to be saved. Note that both frames are sorted by index in
    place.

    Output: the model (ready to save if better than the old one) and its
    rmspe on the validation split.
    """
    # Sort the train/valid sets and stick em together
    train.sort_index(inplace=True)
    valid.sort_index(inplace=True)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # it returns a new frame, so no extra copy is needed.
    df = pd.concat([train, valid])

    # We'll need to know how many items in our validation set later
    n_valid = len(valid[valid.sales != 0])

    # Preprocessing
    df = preprocess.preprocess(df)
    inner_args = preprocess.gather_args(df)

    # Create a databunch by starting with a TabularList and applying the
    # usual transformations
    data = TabularList.from_df(df,
                               path=path,
                               cat_names=inner_args['cat_names'],
                               cont_names=inner_args['cont_names'],
                               procs=inner_args['procs'])
    n_items = len(data.items)

    # Since we sorted by index and appended, our validation set is just the
    # n_valid highest items in our list
    data = data.split_by_valid_func(lambda i: i >= n_items - n_valid)
    data = data.label_from_df(cols=inner_args['dep_var'],
                              label_cls=FloatList,
                              log=True)
    data = data.databunch()

    # Create a learner from scratch here, in case we want to change the
    # architecture later (we can and should - this is very basic)
    learn = tabular_learner(
        data,
        layers=[100, 100],
        ps=[0.001, 0.01],
        emb_drop=0.01,
        metrics=exp_rmspe,
        y_range=None,
        callback_fns=[
            partial(callbacks.tracker.TrackerCallback,
                    monitor='exp_rmspe'),
            partial(callbacks.tracker.EarlyStoppingCallback,
                    mode='min',
                    monitor='exp_rmspe',
                    min_delta=0.01,
                    patience=0),
            partial(callbacks.tracker.SaveModelCallback,
                    monitor='exp_rmspe',
                    mode='min',
                    every='improvement',
                    name=datetime.now().strftime("%Y-%m-%d-%X"))
        ])

    # Repeated model runs showed that 1e-3 was a good maximum learning rate
    # for this model, and this is a no-human-intervention run, so 1e-3 is
    # used as-is; offline tests can revisit it if the model changes.
    # With the early-stopping callback and the save-model callback set to
    # every='improvement', 10 cycles is an upper bound we probably won't
    # reach.
    learn.fit_one_cycle(cyc_len=10, max_lr=1e-3)

    # Get our predictions from the model and calculate rmspe; labels were
    # created with log=True, so both sides are exponentiated first.
    log_preds, log_reals = learn.get_preds(ds_type=DatasetType.Valid)
    preds = np.exp(log_preds).flatten()
    reals = np.exp(log_reals)
    new_rmspe = rmspe(preds, reals)
    return (learn, new_rmspe)