Example #1
    def _prepare_validation_databunch(self, dataframe):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            kwargs_variables = {
                'num_workers': 0
            } if sys.platform == 'win32' else {}
            # kwargs_variables['tfm_y'] = True
            # Fill missing values in place, without adding the usual _na
            # indicator columns (add_col=False), so both halves keep the
            # same schema.
            fm = FillMissing(self._categorical_variables,
                             self._continuous_variables)
            fm.add_col = False
            fm(dataframe)
            databunch_half = TabularList.from_df(
                dataframe,
                path=tempfile.NamedTemporaryFile().name,
                cat_names=self._categorical_variables,
                cont_names=self._continuous_variables,
                procs=[Categorify, Normalize]).split_by_idx([
                    i for i in range(int(len(dataframe) / 2))
                ]).label_empty().databunch(**kwargs_variables)

            databunch_second_half = TabularList.from_df(
                dataframe,
                path=tempfile.NamedTemporaryFile().name,
                cat_names=self._categorical_variables,
                cont_names=self._continuous_variables,
                procs=[Categorify, Normalize]).split_by_idx([
                    i for i in range(int(len(dataframe) / 2), len(dataframe))
                ]).label_empty().databunch(**kwargs_variables)

        return databunch_half, databunch_second_half
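
The method above returns two complementary databunches: split_by_idx marks the listed rows as the validation split, and label_empty leaves them unlabeled, so each half is only usable for inference. A hypothetical caller, assuming an existing fastai v1 Learner named learn (all names here are illustrative, not from the source):

from fastai.basic_data import DatasetType

# Hypothetical two-pass inference covering the whole frame, half at a time.
first_half, second_half = model._prepare_validation_databunch(df)
for db in (first_half, second_half):
    learn.data = db  # point the learner's data at this half
    preds, _ = learn.get_preds(ds_type=DatasetType.Valid)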
Example #2
def get_pred_new_data_old_model(
        valid_df: pd.DataFrame,
        path: Path = MODELS_PATH) -> Tuple[Learner, float]:
    """Get a RSMPE score for predictions from the existing best model, with
    new data.

    Input: a pd.DataFrame for the validation data and the path for the model.
    Output: the model ready to save, and the root mean squared percentage error
    for the predicted sales. (If this model is second-best, we'll still want
    to save it to a different file for record-keeping purposes.)
    """
    valid_df = preprocess.preprocess(valid_df)

    # Get the right model to load
    models = [
        file for file in os.listdir(path) if file.startswith('current_best')
    ]
    best_model = sorted(models, reverse=True)[0]
    learn = load_learner(path=path,
                         fname=best_model,
                         test=TabularList.from_df(valid_df, path=path))

    # get log predictions and compare to actual values
    log_preds, _ = learn.get_preds(ds_type=DatasetType.Test)
    valid_preds = np.exp(np.array(log_preds.flatten()))
    valid_reals = valid_df.loc[valid_df.sales != 0, 'sales'].values
    new_rmspe = rmspe(valid_preds, valid_reals)
    return (learn, new_rmspe)
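
Both this function and Example #17 depend on an rmspe helper that none of these snippets define. For reference, a minimal sketch of the standard root-mean-squared-percentage-error formula it presumably implements (the metric assumes no zeros among the actuals, which the valid_df.sales != 0 mask above guarantees):

import numpy as np

def rmspe(preds: np.ndarray, reals: np.ndarray) -> float:
    """Root mean squared percentage error; reals must contain no zeros."""
    percentage_errors = (reals - preds) / reals
    return float(np.sqrt(np.mean(percentage_errors ** 2)))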
Example #3
    def _preprocess_train(self, X_train, y_train, X_val, y_val, **kwargs):
        from fastai.data_block import FloatList
        from fastai.tabular import TabularList
        from fastai.core import defaults

        X_train = self.preprocess(X_train, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)

        from fastai.tabular import FillMissing, Categorify, Normalize
        self.procs = [FillMissing, Categorify, Normalize]

        if self.problem_type == REGRESSION and self.y_scaler is not None:
            y_train_norm = pd.Series(self.y_scaler.fit_transform(y_train.values.reshape(-1, 1)).reshape(-1))
            y_val_norm = pd.Series(self.y_scaler.transform(y_val.values.reshape(-1, 1)).reshape(-1)) if y_val is not None else None
            logger.log(0, f'Training with scaled targets: {self.y_scaler} - !!! NN training metric will be different from the final results !!!')
        else:
            y_train_norm = y_train
            y_val_norm = y_val

        logger.log(15, f'Using {len(self.cont_columns)} cont features')
        df_train, train_idx, val_idx = self._generate_datasets(X_train, y_train_norm, X_val, y_val_norm)
        label_class = FloatList if self.problem_type == REGRESSION else None

        # additional workers help only when fork is enabled; in other mp modes, communication overhead reduces performance
        num_workers = defaults.cpus if is_fork_enabled() else 0

        # Copy cat_columns and cont_columns because TabularList mutates the lists it is given
        data = (TabularList.from_df(df_train, path=self.path,
                                    cat_names=self.cat_columns.copy(), cont_names=self.cont_columns.copy(), procs=self.procs)
                .split_by_idxs(train_idx, val_idx)
                .label_from_df(cols=LABEL, label_cls=label_class)
                .databunch(bs=self.params['bs'] if len(X_train) > self.params['bs'] else 32, num_workers=num_workers))
        return data
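
_generate_datasets is not shown in any of these snippets. Judging from how its three outputs feed split_by_idxs and label_from_df, it plausibly stacks the validation rows under the training rows with the target in a shared LABEL column and returns the row positions of each part. A hypothetical sketch (the real AutoGluon helper may differ):

import numpy as np
import pandas as pd

def _generate_datasets(X_train, y_train, X_val, y_val):
    # LABEL is assumed to be the module-level label-column constant used
    # by label_from_df(cols=LABEL) above.
    df_train = X_train.copy()
    df_train[LABEL] = y_train.values
    train_idx = np.arange(len(X_train))
    if X_val is not None:
        df_val = X_val.copy()
        df_val[LABEL] = y_val.values
        df_train = pd.concat([df_train, df_val], ignore_index=True)
        val_idx = np.arange(len(X_train), len(df_train))
    else:
        val_idx = np.array([], dtype=int)
    return df_train, train_idx, val_idx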
Example #4
    def predict(self, dataframe):
        test_data = TabularList.from_df(dataframe, cont_names=self.learner.data.cont_names)
        self.learner.data.add_test(test_data)
        preds, target = self.learner.get_preds(DatasetType.Test)
        preds = pd.Series(map(np.array, preds.numpy()), name='predictions')
        target = pd.Series(target.numpy(), name='target')
        return pd.concat([preds, target], axis='columns')
Example #5
def test_model_save_load(fastai_model, model_path):
    model = fastai_model.model

    mlflow.fastai.save_model(fastai_learner=model, path=model_path)
    reloaded_model = mlflow.fastai.load_model(model_uri=model_path)
    reloaded_pyfunc = pyfunc.load_model(model_uri=model_path)

    # Verify reloaded model computes same predictions as original model
    test_data = TabularList.from_df(fastai_model.inference_dataframe)
    model.data.add_test(test_data)
    reloaded_model.data.add_test(test_data)

    real_preds, real_target = map(lambda output: output.numpy(),
                                  model.get_preds(DatasetType.Test))
    reloaded_preds, reloaded_target = map(
        lambda output: output.numpy(),
        reloaded_model.get_preds(DatasetType.Test))

    np.testing.assert_array_almost_equal(real_preds, reloaded_preds)
    np.testing.assert_array_almost_equal(real_target, reloaded_target)

    model_wrapper = mlflow.fastai._FastaiModelWrapper(model)
    reloaded_model_wrapper = mlflow.fastai._FastaiModelWrapper(reloaded_model)

    model_result = model_wrapper.predict(fastai_model.inference_dataframe)
    reloaded_result = reloaded_model_wrapper.predict(
        fastai_model.inference_dataframe)
    pyfunc_result = reloaded_pyfunc.predict(fastai_model.inference_dataframe)

    compare_wrapper_results(model_result, reloaded_result)
    compare_wrapper_results(reloaded_result, pyfunc_result)
Example #6
    def _predict_proba(self, X, **kwargs):
        from fastai.basic_data import DatasetType
        from fastai.tabular import TabularList
        from fastai.utils.mod_display import progress_disabled_ctx

        X = self.preprocess(X, **kwargs)

        single_row = len(X) == 1
        # fastai has issues predicting on a single row; duplicate the row as a workaround
        if single_row:
            X = pd.concat([X, X]).reset_index(drop=True)

        # Copy cat_columns and cont_columns because TabularList mutates the lists it is given
        self.model.data.add_test(
            TabularList.from_df(X,
                                cat_names=self.cat_columns.copy(),
                                cont_names=self.cont_columns.copy(),
                                procs=self.procs))
        with progress_disabled_ctx(self.model) as model:
            preds, _ = model.get_preds(ds_type=DatasetType.Test)
        if single_row:
            preds = preds[:1, :]
        if self.problem_type == REGRESSION:
            if self.y_scaler is not None:
                return self.y_scaler.inverse_transform(
                    preds.numpy()).reshape(-1)
            else:
                return preds.numpy().reshape(-1)
        if self.problem_type == BINARY:
            return preds[:, 1].numpy()
        else:
            return preds.numpy()
Example #7
    def predict_proba(self, X, preprocess=True):
        from fastai.basic_data import DatasetType
        from fastai.tabular import TabularList
        from fastai.utils.mod_display import progress_disabled_ctx
        from fastai.tabular import FillMissing, Categorify, Normalize

        if preprocess:
            X = self.preprocess(X)
        procs = [FillMissing, Categorify, Normalize]
        # Copy cat_columns and cont_columns because TabularList mutates the lists
        self.model.data.add_test(
            TabularList.from_df(X,
                                cat_names=self.cat_columns.copy(),
                                cont_names=self.cont_columns.copy(),
                                procs=procs))
        with progress_disabled_ctx(self.model) as model:
            preds, _ = model.get_preds(ds_type=DatasetType.Test)
        if self.problem_type == REGRESSION:
            if self.y_scaler is not None:
                return self.y_scaler.inverse_transform(
                    preds.numpy()).reshape(-1)
            else:
                return preds.numpy().reshape(-1)
        if self.problem_type == BINARY:
            return preds[:, 1].numpy()
        else:
            return preds.numpy()
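
Both predict_proba variants above undo the target scaling applied during training (see Examples #3, #12, and #14). A minimal round-trip sketch with a scikit-learn scaler, assuming that is the kind of object y_scaler holds:

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

y = pd.Series([3.0, 7.0, 11.0])
y_scaler = StandardScaler()
# Train time: scale targets before fitting (cf. y_train_norm above).
y_norm = pd.Series(y_scaler.fit_transform(y.values.reshape(-1, 1)).reshape(-1))
# Predict time: map network outputs back to the original target scale.
y_back = y_scaler.inverse_transform(y_norm.values.reshape(-1, 1)).reshape(-1)
assert np.allclose(y_back, y.values)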
Example #8
def iris_data():
    iris = datasets.load_iris()
    X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
    y = pd.Series(iris.target, name="label")
    return (TabularList.from_df(
        pd.concat([X, y], axis=1),
        cont_names=list(X.columns)).split_by_rand_pct(
            valid_pct=0.1, seed=42).label_from_df(cols="label").databunch())
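
Example #11 shows the natural continuation; as a self-contained smoke test, the fixture's databunch can be fed straight into a learner (fastai v1; the tiny layers=[3] mirrors Example #11):

from fastai.metrics import accuracy
from fastai.tabular import tabular_learner

data = iris_data()
learn = tabular_learner(data, metrics=accuracy, layers=[3])
learn.fit(1)  # one epoch is plenty for a smoke test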
Example #9
File: models.py  Project: wangvei/cryspnet
    def load(self, ext_magpie: pd.DataFrame, **db_kwargs):
        # Adapted from fastai.load_learner
        self.src.add_test(TabularList.from_df(ext_magpie), tfm_y=None)
        data = self.src.databunch(bs=self.batch_size, **db_kwargs)
        res = self.clas_func(data, self.model, **self.state)
        res.callback_fns = self.callback_fns  # to avoid duplicates
        res.callbacks = [load_callback(c, s, res) for c, s in self.cb_state.items()]
        self.learn = res
        return self
Example #10
    def predict(self, dataframe):
        from fastai.tabular import TabularList
        from fastai.basic_data import DatasetType

        test_data = TabularList.from_df(dataframe, cont_names=self.learner.data.cont_names)
        self.learner.data.add_test(test_data)
        preds, target = self.learner.get_preds(DatasetType.Test)
        preds = pd.Series(map(np.array, preds.numpy()), name="predictions")
        target = pd.Series(target.numpy(), name="target")
        return pd.concat([preds, target], axis="columns")
Example #11
def fastai_model():
    iris = datasets.load_iris()
    X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
    y = pd.Series(iris.target, name="label")
    data = (TabularList.from_df(
        pd.concat([X, y], axis=1),
        cont_names=list(X.columns)).split_by_rand_pct(
            valid_pct=0.1, seed=42).label_from_df(cols="label").databunch())
    model = tabular_learner(data, metrics=accuracy, layers=[3])
    model.fit(1)
    return ModelWithData(model=model, inference_dataframe=X)
Example #12
    def preprocess_train(self, X_train, y_train, X_val, y_val, **kwargs):
        from fastai.data_block import FloatList
        from fastai.tabular import TabularList
        from fastai.core import defaults

        self.cat_columns = self.feature_metadata.get_features(valid_raw_types=[R_OBJECT, R_CATEGORY, R_BOOL])
        self.cont_columns = self.feature_metadata.get_features(valid_raw_types=[R_INT, R_FLOAT, R_DATETIME])

        if self.problem_type == REGRESSION and self.y_scaler is not None:
            y_train_norm = pd.Series(self.y_scaler.fit_transform(y_train.values.reshape(-1, 1)).reshape(-1))
            y_val_norm = pd.Series(self.y_scaler.transform(y_val.values.reshape(-1, 1)).reshape(-1)) if y_val is not None else None
            logger.log(0, f'Training with scaled targets: {self.y_scaler} - !!! NN training metric will be different from the final results !!!')
        else:
            y_train_norm = y_train
            y_val_norm = y_val
        try:
            X_train_stats = X_train.describe(include='all').T.reset_index()
            cat_cols_to_drop = X_train_stats[(X_train_stats['unique'] > self.params.get('max_unique_categorical_values', 10000)) | (X_train_stats['unique'].isna())]['index'].values
        except Exception:
            cat_cols_to_drop = []
        cat_cols_to_keep = [col for col in X_train.columns.values if (col not in cat_cols_to_drop)]
        cat_cols_to_use = [col for col in self.cat_columns if col in cat_cols_to_keep]
        logger.log(15, f'Using {len(cat_cols_to_use)}/{len(self.cat_columns)} categorical features')
        self.cat_columns = cat_cols_to_use
        self.cat_columns = [feature for feature in self.cat_columns if feature in list(X_train.columns)]
        self.cont_columns = [feature for feature in self.cont_columns if feature in list(X_train.columns)]

        for c in self.cat_columns:
            self.columns_fills[c] = MISSING
        for c in self.cont_columns:
            self.columns_fills[c] = X_train[c].mean()

        X_train = self.fill_missing(X_train)

        logger.log(15, f'Using {len(self.cont_columns)} cont features')
        X_train = self.fold_preprocess(X_train, fit=True)
        if X_val is not None:
            X_val = self.fill_missing(X_val)
            X_val = self.fold_preprocess(X_val)
        df_train, train_idx, val_idx = self._generate_datasets(X_train, y_train_norm, X_val, y_val_norm)
        label_class = FloatList if self.problem_type == REGRESSION else None

        # additional workers help only when fork is enabled; in other mp modes, communication overhead reduces performance
        num_workers = defaults.cpus if is_fork_enabled() else 0

        # Copy cat_columns and cont_columns because TabularList mutates the lists it is given
        data = (TabularList.from_df(df_train, path=self.path,
                                    cat_names=self.cat_columns.copy(), cont_names=self.cont_columns.copy(), procs=self.procs)
                .split_by_idxs(train_idx, val_idx)
                .label_from_df(cols=LABEL, label_cls=label_class)
                .databunch(bs=self.params['bs'] if len(X_train) > self.params['bs'] else 32, num_workers=num_workers))
        return data
Example #13
def databunch(df: pd.DataFrame,
              dependent_var: str = "resource_template") -> DataBunch:
    # Resource_template is what we're trying to predict
    category_names = ["subject", "group"]
    procedures = [Categorify]
    # All predicates in graph
    continuous_names = list(df.keys())[3:]
    # Reserve last 20% of Data Frame for validation
    total = len(df)
    last_start = total - int(total * 0.2)
    test = TabularList.from_df(
        df[last_start:total].copy(),
        cat_names=category_names,
        cont_names=continuous_names,
    )
    return (TabularList.from_df(
        df,
        cat_names=category_names,
        cont_names=continuous_names,
        procs=procedures).split_by_idx(list(range(
            last_start, total))).label_from_df(
                cols=dependent_var).add_test(test).databunch())
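
A hedged usage sketch for databunch above, with df laid out as the comments describe (subject and group first, then the dependent column, then one column per predicate; layer sizes are arbitrary):

from fastai.basic_data import DatasetType
from fastai.metrics import accuracy
from fastai.tabular import tabular_learner

data = databunch(df)  # last 20% of rows become the validation split
learn = tabular_learner(data, layers=[200, 100], metrics=accuracy)
learn.fit_one_cycle(1)
preds, _ = learn.get_preds(ds_type=DatasetType.Test)  # the attached test list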
Example #14
    def preprocess_train(self, X_train, y_train, X_val, y_val, **kwargs):
        from fastai.data_block import FloatList
        from fastai.tabular import TabularList
        from fastai.tabular import FillMissing, Categorify, Normalize
        from fastai.core import defaults

        self.cat_columns = X_train.select_dtypes([
            'category', 'object', 'bool', 'bool_'
        ]).columns.values.tolist()

        self.cont_columns = X_train.select_dtypes([
            'float', 'float_', 'float16', 'float32', 'float64',
            'int', 'int_', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64',
            'datetime'
        ]).columns.values.tolist()

        if self.problem_type == REGRESSION and self.y_scaler is not None:
            y_train_norm = pd.Series(self.y_scaler.fit_transform(y_train.values.reshape(-1, 1)).reshape(-1))
            y_val_norm = pd.Series(self.y_scaler.transform(y_val.values.reshape(-1, 1)).reshape(-1)) if y_val is not None else None
            logger.log(0, f'Training with scaled targets: {self.y_scaler} - !!! NN training metric will be different from the final results !!!')
        else:
            y_train_norm = y_train
            y_val_norm = y_val
        try:
            X_train_stats = X_train.describe(include='all').T.reset_index()
            cat_cols_to_drop = X_train_stats[(X_train_stats['unique'] > self.params.get('max_unique_categorical_values', 10000)) | (X_train_stats['unique'].isna())]['index'].values
        except Exception:
            cat_cols_to_drop = []
        cat_cols_to_keep = [col for col in X_train.columns.values if (col not in cat_cols_to_drop)]
        cat_cols_to_use = [col for col in self.cat_columns if col in cat_cols_to_keep]
        logger.log(15, f'Using {len(cat_cols_to_use)}/{len(self.cat_columns)} categorical features')
        self.cat_columns = cat_cols_to_use
        self.cat_columns = [feature for feature in self.cat_columns if feature in list(X_train.columns)]
        self.cont_columns = [feature for feature in self.cont_columns if feature in list(X_train.columns)]
        logger.log(15, f'Using {len(self.cont_columns)} cont features')
        X_train = self.fold_preprocess(X_train, fit=True)
        if X_val is not None:
            X_val = self.fold_preprocess(X_val)
        df_train, train_idx, val_idx = self._generate_datasets(X_train, y_train_norm, X_val, y_val_norm)
        label_class = FloatList if self.problem_type == REGRESSION else None
        procs = [FillMissing, Categorify, Normalize]

        # additional workers help only when fork is enabled; in other mp modes, communication overhead reduces performance
        num_workers = defaults.cpus if is_fork_enabled() else 0
        data = (TabularList.from_df(df_train, path=self.path, cat_names=self.cat_columns, cont_names=self.cont_columns, procs=procs)
                .split_by_idxs(train_idx, val_idx)
                .label_from_df(cols=LABEL, label_cls=label_class)
                .databunch(bs=self.params['bs'] if len(X_train) > self.params['bs'] else 32, num_workers=num_workers))
        return data
Example #15
    def _preprocess_train(self, X, y, X_val, y_val, num_workers):
        from fastai.data_block import FloatList
        from fastai.tabular import TabularList

        X = self.preprocess(X, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)

        from fastai.tabular import FillMissing, Categorify, Normalize
        self.procs = [FillMissing, Categorify, Normalize]

        if self.problem_type == REGRESSION and self.y_scaler is not None:
            y_norm = pd.Series(
                self.y_scaler.fit_transform(y.values.reshape(-1,
                                                             1)).reshape(-1))
            y_val_norm = pd.Series(
                self.y_scaler.transform(y_val.values.reshape(
                    -1, 1)).reshape(-1)) if y_val is not None else None
            logger.log(
                0,
                f'Training with scaled targets: {self.y_scaler} - !!! NN training metric will be different from the final results !!!'
            )
        else:
            y_norm = y
            y_val_norm = y_val

        logger.log(15, f'Using {len(self.cont_columns)} cont features')
        df_train, train_idx, val_idx = self._generate_datasets(
            X, y_norm, X_val, y_val_norm)
        label_class = FloatList if self.problem_type == REGRESSION else None

        # Copy cat_columns and cont_columns because TabularList mutates the lists it is given
        data = (TabularList.from_df(
            df_train,
            path=self.path,
            cat_names=self.cat_columns.copy(),
            cont_names=self.cont_columns.copy(),
            procs=self.procs).split_by_idxs(train_idx, val_idx).label_from_df(
                cols=LABEL, label_cls=label_class).databunch(
                    bs=self.params['bs'] if len(X) > self.params['bs'] else 32,
                    num_workers=num_workers))
        return data
Example #16
    def preprocess_train(self, X_train, Y_train, X_test, Y_test, **kwargs):
        from fastai.data_block import FloatList
        from fastai.tabular import TabularList
        from fastai.tabular import FillMissing, Categorify, Normalize

        self.cat_columns = X_train.select_dtypes(['category', 'object'
                                                  ]).columns.values.tolist()
        self.cont_columns = X_train.select_dtypes(['float', 'int', 'datetime'
                                                   ]).columns.values.tolist()
        if self.problem_type == REGRESSION and self.y_scaler is not None:
            Y_train_norm = pd.Series(
                self.y_scaler.fit_transform(Y_train.values.reshape(
                    -1, 1)).reshape(-1))
            Y_test_norm = pd.Series(
                self.y_scaler.transform(Y_test.values.reshape(
                    -1, 1)).reshape(-1)) if Y_test is not None else None
            logger.log(
                0,
                f'Training with scaled targets: {self.y_scaler} - !!! NN training metric will be different from the final results !!!'
            )
        else:
            Y_train_norm = Y_train
            Y_test_norm = Y_test
        try:
            X_train_stats = X_train.describe(include='all').T.reset_index()
            cat_cols_to_drop = X_train_stats[
                (X_train_stats['unique'] > self.params.
                 get('max_unique_categorical_values', 10000)) |
                (X_train_stats['unique'].isna())]['index'].values
        except Exception:
            cat_cols_to_drop = []
        cat_cols_to_keep = [
            col for col in X_train.columns.values
            if (col not in cat_cols_to_drop)
        ]
        cat_cols_to_use = [
            col for col in self.cat_columns if col in cat_cols_to_keep
        ]
        logger.log(
            15,
            f'Using {len(cat_cols_to_use)}/{len(self.cat_columns)} categorical features'
        )
        self.cat_columns = cat_cols_to_use
        self.cat_columns = [
            feature for feature in self.cat_columns
            if feature in list(X_train.columns)
        ]
        self.cont_columns = [
            feature for feature in self.cont_columns
            if feature in list(X_train.columns)
        ]
        logger.log(15, f'Using {len(self.cont_columns)} cont features')
        X_train = self.fold_preprocess(X_train, fit=True)
        if X_test is not None:
            X_test = self.fold_preprocess(X_test)
        df_train, train_idx, val_idx = self._generate_datasets(
            X_train, Y_train_norm, X_test, Y_test_norm)
        label_class = FloatList if self.problem_type == REGRESSION else None
        procs = [FillMissing, Categorify, Normalize]
        data = (TabularList.from_df(
            df_train,
            path=self.path,
            cat_names=self.cat_columns,
            cont_names=self.cont_columns,
            procs=procs).split_by_idxs(train_idx, val_idx).label_from_df(
                cols=LABEL, label_cls=label_class).databunch(
                    bs=self.params['bs'] if len(X_train) > self.params['bs'] else 32))
        return data
Example #17
def get_new_model_and_pred(train: pd.DataFrame,
                           valid: pd.DataFrame,
                           path: Path = MODELS_PATH) -> Tuple[Learner, float]:
    """Take new train and validation dataframes, re-run the model, and return
    the model and its root mean squared percentage error.

    Input: the train dataframe, the validation dataframe, and the path for the
    models to be saved.
    Output: the model (ready to save if better than the old one) and its rmspe.
    """

    # Sort the train/valid sets and stick 'em together
    train.sort_index(inplace=True)
    valid.sort_index(inplace=True)
    df = train.append(valid).copy()

    # We'll need to know how many items are in our validation set later
    n_valid = len(valid[valid.sales != 0])

    # Preprocessing
    df = preprocess.preprocess(df)
    inner_args = preprocess.gather_args(df)

    # Create a databunch by starting with a TabularList and applying the usual
    # transformations
    data = (TabularList.from_df(df,
                                path=path,
                                cat_names=inner_args['cat_names'],
                                cont_names=inner_args['cont_names'],
                                procs=inner_args['procs']))

    n_items = len(data.items)

    # Since we sorted by index and appended, our validation set is just the
    # n_valid highest items in our list
    data = data.split_by_valid_func(lambda i: i >= n_items - n_valid)
    data = data.label_from_df(cols=inner_args['dep_var'],
                              label_cls=FloatList,
                              log=True)
    data = data.databunch()

    # Create a learner
    # Let's construct the learner from scratch here, in case we want to change
    # the architecture later (we can and should - this is very basic)
    learn = tabular_learner(
        data,
        layers=[100, 100],
        ps=[0.001, 0.01],
        emb_drop=0.01,
        metrics=exp_rmspe,
        y_range=None,
        callback_fns=[
            partial(callbacks.tracker.TrackerCallback, monitor='exp_rmspe'),
            partial(callbacks.tracker.EarlyStoppingCallback,
                    mode='min',
                    monitor='exp_rmspe',
                    min_delta=0.01,
                    patience=0),
            partial(callbacks.tracker.SaveModelCallback,
                    monitor='exp_rmspe',
                    mode='min',
                    every='improvement',
                    name=datetime.now().strftime("%Y-%m-%d-%X"))
        ])

    # Since repeated model runs showed us that 1e-3 was a good maximum learning
    # rate for this model and since we're doing a no-human-intervention run,
    # we'll use 1e-3 for this model. While this model is in place, we can run
    # some offline tests as needed to see whether the maximum learning rate
    # should be changed, but in most cases the 1e-3 is probably good, even if
    # the model changes (again, we can test offline and update if needed).

    # Also, since we have the early-stopping callback with the save-model
    # callback set to 'every=improvement', we'll run 10 cycles even though we
    # probably won't need nearly that many
    learn.fit_one_cycle(cyc_len=10, max_lr=1e-3)

    # Get our predictions from the model and calculate rmspe
    log_preds, log_reals = learn.get_preds(ds_type=DatasetType.Valid)
    preds = np.exp(log_preds).flatten()
    reals = np.exp(log_reals)
    new_rmspe = rmspe(preds, reals)
    return (learn, new_rmspe)
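
Examples #2 and #17 come from the same project; a hypothetical driver tying them together, keeping whichever model scores the lower RMSPE (the file naming below is illustrative, not the project's actual scheme):

# Hypothetical retraining flow built on the two functions above.
old_learn, old_rmspe = get_pred_new_data_old_model(valid.copy())
new_learn, new_rmspe = get_new_model_and_pred(train, valid)

stamp = datetime.now().strftime('%Y-%m-%d')
if new_rmspe < old_rmspe:
    # The new model takes over as 'current_best' (cf. Example #2's lookup).
    new_learn.export(fname=f'current_best_{stamp}.pkl')
else:
    # Second-best models are still saved for record-keeping (see Example #2).
    new_learn.export(fname=f'second_best_{stamp}.pkl')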