def load(cls, path: str, reset_paths=True, verbose=True):
        """Restore a saved model together with its text neural network.

        The wrapper object is read through the parent class' ``load`` from
        ``<path>/<cls.model_file_name>``; the network is then read by
        ``BertForTextPredictionBasic.load`` from ``<path>/<cls.nn_model_name>``
        and attached to the wrapper as ``model``.

        Parameters
        ----------
        path
            Directory that contains the saved files.
        reset_paths
            Not referenced in this method body.
        verbose
            Not referenced in this method body.

        Returns
        -------
        obj
            The loaded object with ``obj.model`` set to the loaded network.

        Raises
        ------
        ImportError
            If the autogluon.text modules cannot be imported.
        """
        try:
            from autogluon.text.text_prediction.dataset import TabularDataset
            from autogluon.text.text_prediction.models.basic_v1 import BertForTextPredictionBasic
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)

        logger.log(15, f'Load from {path}.')
        wrapper_file = os.path.join(path, cls.model_file_name)
        restored = super().load(wrapper_file)
        # The network weights live in their own sub-directory next to the wrapper file.
        network_dir = os.path.join(path, cls.nn_model_name)
        restored.model = BertForTextPredictionBasic.load(network_dir)
        return restored
Пример #2
0
    def load(cls, path: str, reset_paths=True, verbose=True):
        """Restore a saved model together with its text neural network.

        The parent class' ``load`` restores the wrapper object; the network is
        then read by ``BertForTextPredictionBasic.load`` from
        ``<path>/<cls.nn_model_name>`` and attached as ``model``.

        Parameters
        ----------
        path
            Directory that contains the saved files.
        reset_paths
            Forwarded unchanged to the parent ``load``.
        verbose
            Forwarded unchanged to the parent ``load``.

        Returns
        -------
        model
            The loaded object with its ``model`` attribute set to the network.

        Raises
        ------
        ImportError
            If the autogluon.text modules cannot be imported.
        """
        try:
            from autogluon.text.text_prediction.dataset import TabularDataset
            from autogluon.text.text_prediction.models.basic_v1 import BertForTextPredictionBasic
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)

        parent_load_kwargs = dict(path=path,
                                  reset_paths=reset_paths,
                                  verbose=verbose)
        restored = super().load(**parent_load_kwargs)
        network_dir = os.path.join(path, cls.nn_model_name)
        restored.model = BertForTextPredictionBasic.load(network_dir)
        return restored
    def _build_model(self, X_train, y_train, X_val, y_val, hyperparameters):
        """Create the text prediction network and describe the data columns.

        Picks a label column name that does not clash with any column of
        ``X_train``, derives column properties from the training data (stacked
        with the validation data when it is given), infers the problem type
        and metrics, and stores a new ``BertForTextPredictionBasic`` in
        ``self.model``.

        Parameters
        ----------
        X_train
            Training features.
        y_train
            Training labels.
        X_val
            Validation features, or None.
        y_val
            Validation labels; only read when ``X_val`` is not None.
        hyperparameters
            Nested mapping; the search space is read from
            ``hyperparameters['models']['BertForTextPredictionBasic']['search_space']``.

        Returns
        -------
        column_properties
            Ordered mapping holding the feature column properties followed by
            the label column property.

        Raises
        ------
        ImportError
            If the autogluon.text modules cannot be imported.
        """
        try:
            from autogluon.text.text_prediction.text_prediction \
                import ag_text_prediction_params, merge_params, get_column_properties, \
                infer_problem_type, infer_eval_stop_log_metrics
            from autogluon.text.text_prediction.models.basic_v1 import BertForTextPredictionBasic
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)

        # Use 'label' unless a feature already has that name; otherwise take the
        # first free 'label0', 'label1', ...
        taken_names = X_train.columns
        if 'label' in taken_names:
            suffix = 0
            self._label_column_name = 'label{}'.format(suffix)
            while self._label_column_name in taken_names:
                suffix += 1
                self._label_column_name = 'label{}'.format(suffix)
        else:
            self._label_column_name = 'label'
        label_name = self._label_column_name

        # Column properties are computed over train + validation rows together.
        if X_val is None:
            all_features = X_train
            all_labels = pd.DataFrame({label_name: y_train})
        else:
            all_features = pd.concat([X_train, X_val])
            all_features.reset_index(drop=True, inplace=True)
            stacked_labels = pd.concat([y_train, y_val])
            all_labels = pd.DataFrame({label_name: stacked_labels})
            all_labels.reset_index(drop=True, inplace=True)

        feature_props = get_column_properties(df=all_features,
                                              metadata=None,
                                              label_columns=None,
                                              provided_column_properties=None)
        label_props = get_column_properties(df=all_labels,
                                            metadata=None,
                                            label_columns=None,
                                            provided_column_properties=None)
        column_properties = collections.OrderedDict(
            [*feature_props.items(), *label_props.items()])

        problem_type, label_shape = infer_problem_type(
            column_properties=column_properties,
            label_col_name=label_name)
        # Only the stopping and logging metrics are used below.
        _eval_metric, stopping_metric, log_metrics = infer_eval_stop_log_metrics(
            problem_type,
            label_shape=label_shape,
            eval_metric=self.eval_metric,
            stopping_metric=self.stopping_metric)

        network_hparams = hyperparameters['models']['BertForTextPredictionBasic']
        search_space = network_hparams['search_space']
        self.model = BertForTextPredictionBasic(
            column_properties=column_properties,
            feature_columns=list(X_train.columns),
            label_columns=[label_name],
            problem_types=[problem_type],
            label_shapes=[label_shape],
            stopping_metric=stopping_metric,
            log_metrics=log_metrics,
            output_directory=os.path.join(self.path, self.name),
            logger=logger,
            base_config=None,
            search_space=search_space)
        return column_properties
class TextPredictionV1Model(AbstractModel):
    nn_model_name = 'text_nn'

    def __init__(self, **kwargs):
        """Construct a TextPredictionV1Model.

        Feature columns may mix text, categorical and numerical data;
        labels may be categorical or numerical.

        All keyword arguments are passed unchanged to the parent constructor.

        Parameters
        ----------
        path
            Directory in which modeling outputs are stored.
        name
            Sub-directory of ``path`` in which the model is saved.
        problem_type
            Kind of problem handled by the model.
            Valid options: ['binary', 'multiclass', 'regression'].
        eval_metric
            Metric used for evaluation.
        num_classes
            Number of classes.
        stopping_metric
            Metric used for stopping.
        model
            The internal model object.
        hyperparameters
            Hyperparameters of the model.
        features
            Feature names.
        feature_metadata
            Metadata describing the features.
        debug
            Whether debug mode is enabled.
        """
        super().__init__(**kwargs)
        # Label column name; assigned when the network is built.
        self._label_column_name = None
        # Numeric / categorical column lists; assigned by _preprocess(fit=True).
        self._numeric_columns = self._cat_columns = None

    def _preprocess(self, X: pd.DataFrame, fit=False, **kwargs):
        """Fill missing numeric values and cast categorical columns to object.

        When ``fit`` is true, the numeric (int/float) and categorical column
        lists are first read from ``self.feature_metadata`` and remembered on
        the instance for later calls.

        The caller's DataFrame is never modified: if any column has to be
        rewritten, the work is done on a copy. When nothing needs rewriting
        the input object itself is returned.

        Parameters
        ----------
        X
            Features to preprocess.
        fit
            Whether to (re)compute the numeric / categorical column lists.
        kwargs
            Ignored.

        Returns
        -------
        X
            The preprocessed features.
        """
        if fit:
            self._numeric_columns = self.feature_metadata.get_features(
                valid_raw_types=[R_INT, R_FLOAT])
            self._cat_columns = self.feature_metadata.get_features(
                valid_raw_types=[R_CATEGORY])
        if self._numeric_columns or self._cat_columns:
            # Column assignment below would otherwise write into the frame
            # owned by the caller.
            X = X.copy()
        if self._numeric_columns:
            X[self._numeric_columns] = X[self._numeric_columns].fillna(
                -1)  # FIXME v0.1: Make this more sophisticated, such as mean.
        if self._cat_columns:
            X[self._cat_columns] = X[self._cat_columns].astype(
                'object')  # FIXME v0.1: Avoid this unnecessary conversion.
            # FIXME v0.1: This will crash if NaNs are present at test time.
            # X[self._cat_columns] = X[self._cat_columns].fillna(0)  # FIXME v0.1: Make this more sophisticated. This is not correct.
        return X

    def _build_model(self, X_train, y_train, X_val, y_val, hyperparameters):
        """Create the text prediction network and describe the data columns.

        Picks a label column name that does not clash with any column of
        ``X_train``, derives column properties from the training data (stacked
        with the validation data when it is given), infers the problem type
        and metrics, and stores a new ``BertForTextPredictionBasic`` in
        ``self.model``.

        Parameters
        ----------
        X_train
            Training features.
        y_train
            Training labels.
        X_val
            Validation features, or None.
        y_val
            Validation labels; only read when ``X_val`` is not None.
        hyperparameters
            Nested mapping; the search space is read from
            ``hyperparameters['models']['BertForTextPredictionBasic']['search_space']``.

        Returns
        -------
        column_properties
            Ordered mapping holding the feature column properties followed by
            the label column property.

        Raises
        ------
        ImportError
            If the autogluon.text modules cannot be imported.
        """
        try:
            from autogluon.text.text_prediction.text_prediction \
                import ag_text_prediction_params, merge_params, get_column_properties, \
                infer_problem_type, infer_eval_stop_log_metrics
            from autogluon.text.text_prediction.models.basic_v1 import BertForTextPredictionBasic
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)

        # Use 'label' unless a feature already has that name; otherwise take the
        # first free 'label0', 'label1', ...
        taken_names = X_train.columns
        if 'label' in taken_names:
            suffix = 0
            self._label_column_name = 'label{}'.format(suffix)
            while self._label_column_name in taken_names:
                suffix += 1
                self._label_column_name = 'label{}'.format(suffix)
        else:
            self._label_column_name = 'label'
        label_name = self._label_column_name

        # Column properties are computed over train + validation rows together.
        if X_val is None:
            all_features = X_train
            all_labels = pd.DataFrame({label_name: y_train})
        else:
            all_features = pd.concat([X_train, X_val])
            all_features.reset_index(drop=True, inplace=True)
            stacked_labels = pd.concat([y_train, y_val])
            all_labels = pd.DataFrame({label_name: stacked_labels})
            all_labels.reset_index(drop=True, inplace=True)

        feature_props = get_column_properties(df=all_features,
                                              metadata=None,
                                              label_columns=None,
                                              provided_column_properties=None)
        label_props = get_column_properties(df=all_labels,
                                            metadata=None,
                                            label_columns=None,
                                            provided_column_properties=None)
        column_properties = collections.OrderedDict(
            [*feature_props.items(), *label_props.items()])

        problem_type, label_shape = infer_problem_type(
            column_properties=column_properties,
            label_col_name=label_name)
        # Only the stopping and logging metrics are used below.
        _eval_metric, stopping_metric, log_metrics = infer_eval_stop_log_metrics(
            problem_type,
            label_shape=label_shape,
            eval_metric=self.eval_metric,
            stopping_metric=self.stopping_metric)

        network_hparams = hyperparameters['models']['BertForTextPredictionBasic']
        search_space = network_hparams['search_space']
        self.model = BertForTextPredictionBasic(
            column_properties=column_properties,
            feature_columns=list(X_train.columns),
            label_columns=[label_name],
            problem_types=[problem_type],
            label_shapes=[label_shape],
            stopping_metric=stopping_metric,
            log_metrics=log_metrics,
            output_directory=os.path.join(self.path, self.name),
            logger=logger,
            base_config=None,
            search_space=search_space)
        return column_properties

    def _get_default_auxiliary_params(self) -> dict:
        """Return the parent's auxiliary params with this model's feature filter.

        The ``get_features_kwargs`` entry is set so that int, float and object
        raw types are accepted while the text n-gram, text-as-category and
        text-special special types are rejected.
        """
        aux_params = super()._get_default_auxiliary_params()
        accepted_raw_types = [
            R_INT,
            R_FLOAT,
            # R_CATEGORY,  # FIXME: Add R_CATEGORY features
            R_OBJECT,
        ]
        rejected_special_types = [
            S_TEXT_NGRAM,
            S_TEXT_AS_CATEGORY,
            S_TEXT_SPECIAL,
        ]
        aux_params.update(
            get_features_kwargs={
                'valid_raw_types': accepted_raw_types,
                'invalid_special_types': rejected_special_types,
            })
        return aux_params

    @classmethod
    def _get_default_ag_args(cls) -> dict:
        """Return the parent's default ag_args with ``valid_stacker`` set to False."""
        ag_args = super()._get_default_ag_args()
        ag_args.update(valid_stacker=False)
        return ag_args

    def _set_default_params(self):
        """Run the parent's default setup, then install the 'default_no_hpo' preset.

        ``self.params`` is replaced by the object returned from
        ``ag_text_prediction_params.create('default_no_hpo')``.

        Raises
        ------
        ImportError
            If the autogluon.text modules cannot be imported.
        """
        try:
            from autogluon.text.text_prediction.dataset import TabularDataset
            from autogluon.text.text_prediction.text_prediction import ag_text_prediction_params
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)
        super()._set_default_params()
        preset_params = ag_text_prediction_params.create('default_no_hpo')
        self.params = preset_params

    def _fit(self,
             X_train: pd.DataFrame,
             y_train: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             **kwargs):
        """The internal fit function

        Preprocesses the features, builds the network, appends the label
        column to the preprocessed frames and trains the network.

        Parameters
        ----------
        X_train
            Features of the training dataset
        y_train
            Labels of the training dataset
        X_val
            Features of the validation dataset
        y_val
            Labels of the validation dataset
        time_limit
            The time limits for the fit function
        kwargs
            Other keyword arguments. ``verbosity`` (default 2), ``num_cpus``
            and ``num_gpus`` (default None) are read; the rest are ignored.

        Raises
        ------
        ImportError
            If mxnet or the autogluon.text modules cannot be imported.
        """
        try:
            import mxnet as mx
            from autogluon.text.text_prediction.dataset import TabularDataset
            from autogluon.text.text_prediction.text_prediction import get_recommended_resource
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)

        # Get arguments from kwargs
        verbosity = kwargs.get('verbosity', 2)
        num_cpus = kwargs.get('num_cpus', None)
        num_gpus = kwargs.get('num_gpus', None)

        # Infer resource
        resource = get_recommended_resource(nthreads_per_trial=num_cpus,
                                            ngpus_per_trial=num_gpus)

        # Set seed
        seed = self.params.get('seed')
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)
            mx.random.seed(seed)

        X_train = self.preprocess(X_train, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)
        column_properties = self._build_model(X_train=X_train,
                                              y_train=y_train,
                                              X_val=X_val,
                                              y_val=y_val,
                                              hyperparameters=self.params)
        # Insert the label column (DataFrame.insert works in place on the
        # preprocessed frames).
        X_train.insert(len(X_train.columns), self._label_column_name, y_train)
        if X_val is not None:
            X_val.insert(len(X_val.columns), self._label_column_name, y_val)

        hpo_params = self.params['hpo_params']
        search_strategy = hpo_params['search_strategy']
        # Work on a private copy so the hyperband defaults below are not
        # written back into the shared hyperparameter dictionary.
        scheduler_options = hpo_params['scheduler_options']
        scheduler_options = {} if scheduler_options is None else dict(scheduler_options)
        if search_strategy.endswith('hyperband'):
            # Specific defaults for hyperband scheduling
            scheduler_options.setdefault('reduction_factor', 4)
            scheduler_options.setdefault('grace_period', 10)
            scheduler_options.setdefault('max_t', 50)

        train_data = TabularDataset(X_train,
                                    column_properties=column_properties,
                                    label_columns=self._label_column_name)
        logger.info('Train Dataset:')
        logger.info(train_data)
        if X_val is not None:
            tuning_data = TabularDataset(X_val,
                                         column_properties=column_properties,
                                         label_columns=self._label_column_name)
            logger.info('Tuning Dataset:')
            logger.info(tuning_data)
        else:
            tuning_data = None
        self.model.train(
            train_data=train_data,
            tuning_data=tuning_data,
            resource=resource,
            time_limits=time_limit,
            search_strategy=search_strategy,
            search_options=hpo_params['search_options'],
            scheduler_options=scheduler_options,
            num_trials=hpo_params['num_trials'],
            console_log=verbosity >= 2,
            ignore_warning=verbosity < 2)

    def save(self, path: str = None, verbose=True) -> str:
        """Save the wrapper object and the text network separately.

        The wrapper is saved through the parent class' ``save`` to
        ``<path>/<self.model_file_name>`` with ``self.model`` temporarily set
        to None, so the network is not part of that file. The network is saved
        with its own ``save`` to ``<path>/<self.nn_model_name>``.
        ``self.model`` is restored even when the parent ``save`` raises.

        Parameters
        ----------
        path
            Target directory; defaults to ``self.path`` when None.
        verbose
            Forwarded to the parent ``save``.

        Returns
        -------
        path
            The directory the model was saved to.
        """
        if path is None:
            path = self.path
        model_path = os.path.join(path, self.model_file_name)
        text_nn_path = os.path.join(path, self.nn_model_name)
        logger.log(15, f'Save Model Hyperparams to {model_path}.')
        logger.log(15, f'Save Model Text NN weights to {text_nn_path}')
        model = self.model
        self.model = None
        try:
            # save this AbstractModel object without NN weights
            super().save(path=model_path, verbose=verbose)
        finally:
            # Never leave the instance without its network, even on failure.
            self.model = model
        model.save(text_nn_path)
        return path

    def get_memory_size(self) -> int:
        """Return the memory size by calculating the total number of parameters.

        For every parameter returned by ``self.model.net.collect_params()``
        the number of elements (product of its shape) is multiplied by the
        item size of its dtype, and the results are summed.

        Returns
        -------
        memory_size
            The total memory size in bytes, as a built-in ``int``.
        """
        total_size = 0
        for _name, param in self.model.net.collect_params().items():
            # Force a 64-bit product and convert to a Python int so the result
            # is a plain int regardless of platform or an empty shape.
            num_elements = int(np.prod(param.shape, dtype=np.int64))
            total_size += np.dtype(param.dtype).itemsize * num_elements
        return total_size

    @classmethod
    def load(cls, path: str, reset_paths=True, verbose=True):
        """Restore a saved model together with its text neural network.

        The wrapper object is read through the parent class' ``load`` from
        ``<path>/<cls.model_file_name>``; the network is then read by
        ``BertForTextPredictionBasic.load`` from ``<path>/<cls.nn_model_name>``
        and attached to the wrapper as ``model``.

        Parameters
        ----------
        path
            Directory that contains the saved files.
        reset_paths
            Not referenced in this method body.
        verbose
            Not referenced in this method body.

        Returns
        -------
        obj
            The loaded object with ``obj.model`` set to the loaded network.

        Raises
        ------
        ImportError
            If the autogluon.text modules cannot be imported.
        """
        try:
            from autogluon.text.text_prediction.dataset import TabularDataset
            from autogluon.text.text_prediction.models.basic_v1 import BertForTextPredictionBasic
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)

        logger.log(15, f'Load from {path}.')
        wrapper_file = os.path.join(path, cls.model_file_name)
        restored = super().load(wrapper_file)
        # The network weights live in their own sub-directory next to the wrapper file.
        network_dir = os.path.join(path, cls.nn_model_name)
        restored.model = BertForTextPredictionBasic.load(network_dir)
        return restored
Пример #5
0
class TextPredictionV1Model(AbstractModel):
    nn_model_name = 'text_nn'

    def __init__(self, **kwargs):
        """Construct a TextPredictionV1Model.

        Feature columns may mix text, categorical and numerical data;
        labels may be categorical or numerical.

        All keyword arguments are passed unchanged to the parent constructor.

        Parameters
        ----------
        path
            Directory in which modeling outputs are stored.
        name
            Sub-directory of ``path`` in which the model is saved.
        problem_type
            Kind of problem handled by the model.
            Valid options: ['binary', 'multiclass', 'regression'].
        eval_metric
            Metric used for evaluation.
        num_classes
            Number of classes.
        stopping_metric
            Metric used for stopping.
        model
            The internal model object.
        hyperparameters
            Hyperparameters of the model.
        features
            Feature names.
        feature_metadata
            Metadata describing the features.
        debug
            Whether debug mode is enabled.
        """
        super().__init__(**kwargs)
        # Both are filled in later: the label name when the network is built,
        # the feature generator by _preprocess(fit=True).
        self._label_column_name = self._feature_generator = None

    def _preprocess(self, X, fit=False, **kwargs):
        """Transform ``X`` with the model's feature generator.

        When ``fit`` is true a new ``BulkFeatureGenerator`` is created and
        fitted on ``X`` first. It has a single stage with two generators: a
        ``CategoryFeatureGenerator`` (``minimum_cat_count=1``) over the
        category-typed features and an ``IdentityFeatureGenerator`` over all
        remaining features, as reported by ``self.feature_metadata``.

        Parameters
        ----------
        X
            Features to transform.
        fit
            Whether to create and fit the feature generator before transforming.
        kwargs
            Ignored.

        Returns
        -------
        The result of the feature generator's ``transform`` on ``X``.
        """
        if fit:
            from autogluon.features.generators import BulkFeatureGenerator, CategoryFeatureGenerator, IdentityFeatureGenerator
            # TODO: This feature generator improves scores for TextPrediction when rare categories are present. This should be fixed in TextPrediction.
            category_features = self.feature_metadata.get_features(
                valid_raw_types=[R_CATEGORY])
            category_generator = CategoryFeatureGenerator(
                features_in=category_features, minimum_cat_count=1)
            other_features = self.feature_metadata.get_features(
                invalid_raw_types=[R_CATEGORY])
            passthrough_generator = IdentityFeatureGenerator(
                features_in=other_features)
            single_stage = [category_generator, passthrough_generator]
            self._feature_generator = BulkFeatureGenerator(
                generators=[single_stage], verbosity=0)
            self._feature_generator.fit(X)
        transformed = self._feature_generator.transform(X)
        return transformed

    def _build_model(self, X, y, X_val, y_val, hyperparameters):
        """Create the text prediction network and describe the data columns.

        Picks a label column name that does not clash with any column of
        ``X``, derives column properties from the training data (stacked with
        the validation data when it is given), infers the problem type and
        metrics, and stores a new ``BertForTextPredictionBasic`` in
        ``self.model``.

        Parameters
        ----------
        X
            Training features.
        y
            Training labels.
        X_val
            Validation features, or None.
        y_val
            Validation labels; only read when ``X_val`` is not None.
        hyperparameters
            Nested mapping; the search space is read from
            ``hyperparameters['models']['BertForTextPredictionBasic']['search_space']``.

        Returns
        -------
        column_properties
            Ordered mapping holding the feature column properties followed by
            the label column property.

        Raises
        ------
        ImportError
            If the autogluon.text modules cannot be imported.
        """
        try:
            from autogluon.text.text_prediction.text_prediction \
                import ag_text_prediction_params, merge_params, get_column_properties, \
                infer_problem_type, infer_eval_stop_log_metrics
            from autogluon.text.text_prediction.models.basic_v1 import BertForTextPredictionBasic
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)

        # Use 'label' unless a feature already has that name; otherwise take the
        # first free 'label0', 'label1', ...
        taken_names = X.columns
        if 'label' in taken_names:
            suffix = 0
            self._label_column_name = 'label{}'.format(suffix)
            while self._label_column_name in taken_names:
                suffix += 1
                self._label_column_name = 'label{}'.format(suffix)
        else:
            self._label_column_name = 'label'
        label_name = self._label_column_name

        # Column properties are computed over train + validation rows together.
        if X_val is None:
            all_features = X
            all_labels = pd.DataFrame({label_name: y})
        else:
            all_features = pd.concat([X, X_val])
            all_features.reset_index(drop=True, inplace=True)
            stacked_labels = pd.concat([y, y_val])
            all_labels = pd.DataFrame({label_name: stacked_labels})
            all_labels.reset_index(drop=True, inplace=True)

        feature_props = get_column_properties(df=all_features,
                                              metadata=None,
                                              label_columns=None,
                                              provided_column_properties=None)
        label_props = get_column_properties(df=all_labels,
                                            metadata=None,
                                            label_columns=None,
                                            provided_column_properties=None)
        column_properties = collections.OrderedDict(
            [*feature_props.items(), *label_props.items()])

        problem_type, label_shape = infer_problem_type(
            column_properties=column_properties,
            label_col_name=label_name)
        # Only the stopping and logging metrics are used below.
        _eval_metric, stopping_metric, log_metrics = infer_eval_stop_log_metrics(
            problem_type,
            label_shape=label_shape,
            eval_metric=self.eval_metric,
            stopping_metric=self.stopping_metric)

        network_hparams = hyperparameters['models']['BertForTextPredictionBasic']
        search_space = network_hparams['search_space']
        self.model = BertForTextPredictionBasic(
            column_properties=column_properties,
            feature_columns=list(X.columns),
            label_columns=[label_name],
            problem_types=[problem_type],
            label_shapes=[label_shape],
            stopping_metric=stopping_metric,
            log_metrics=log_metrics,
            output_directory=os.path.join(self.path, self.name),
            logger=logger,
            base_config=None,
            search_space=search_space)
        return column_properties

    def _get_default_auxiliary_params(self) -> dict:
        """Return the parent's auxiliary params with this model's feature filter.

        The ``get_features_kwargs`` entry is set so that int, float, category
        and object raw types are accepted while the text n-gram,
        text-as-category and text-special special types are rejected.
        """
        aux_params = super()._get_default_auxiliary_params()
        accepted_raw_types = [R_INT, R_FLOAT, R_CATEGORY, R_OBJECT]
        rejected_special_types = [
            S_TEXT_NGRAM,
            S_TEXT_AS_CATEGORY,
            S_TEXT_SPECIAL,
        ]
        aux_params.update(
            get_features_kwargs={
                'valid_raw_types': accepted_raw_types,
                'invalid_special_types': rejected_special_types,
            })
        return aux_params

    @classmethod
    def _get_default_ag_args(cls) -> dict:
        """Return the parent's default ag_args with ``valid_stacker`` set to False."""
        ag_args = super()._get_default_ag_args()
        ag_args.update(valid_stacker=False)
        return ag_args

    def _set_default_params(self):
        """Run the parent's default setup, then install the 'default_no_hpo' preset.

        ``self.params`` is replaced by the object returned from
        ``ag_text_prediction_params.create('default_no_hpo')``.

        Raises
        ------
        ImportError
            If the autogluon.text modules cannot be imported.
        """
        try:
            from autogluon.text.text_prediction.dataset import TabularDataset
            from autogluon.text.text_prediction.text_prediction import ag_text_prediction_params
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)
        super()._set_default_params()
        preset_params = ag_text_prediction_params.create('default_no_hpo')
        self.params = preset_params

    def _fit(self,
             X: pd.DataFrame,
             y: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             sample_weight=None,
             **kwargs):
        """The internal fit function

        Preprocesses the features, builds the network, appends the label
        column to the preprocessed frames, splits off a holdout set when no
        validation data is given, and trains the network.

        Parameters
        ----------
        X
            Features of the training dataset
        y
            Labels of the training dataset
        X_val
            Features of the validation dataset
        y_val
            Labels of the validation dataset
        time_limit
            The time limits for the fit function. The time already spent in
            this method is subtracted before it is handed to the network.
        sample_weight
            Not supported yet; when given, a message is logged and the
            weights are ignored.
        kwargs
            Other keyword arguments. ``verbosity`` (default 2), ``num_cpus``
            and ``num_gpus`` (default None) are read.

        Raises
        ------
        ImportError
            If mxnet or the autogluon.text modules cannot be imported.
        NoGPUError
            If the recommended resource reports zero GPUs.
        NoValidFeatures
            If the feature metadata lists no 'object' typed features.
        """
        try:
            import mxnet as mx
            from autogluon.text.text_prediction.dataset import TabularDataset, random_split_train_val
            from autogluon.text.text_prediction.text_prediction import get_recommended_resource
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)

        time_start = time.time()

        # Get arguments from kwargs
        verbosity = kwargs.get('verbosity', 2)
        num_cpus = kwargs.get('num_cpus', None)
        num_gpus = kwargs.get('num_gpus', None)
        if sample_weight is not None:  # TODO: support
            logger.log(
                15,
                "sample_weight not yet supported for TextPredictionV1Model, this model will ignore them in training."
            )

        # Infer resource
        resource = get_recommended_resource(nthreads_per_trial=num_cpus,
                                            ngpus_per_trial=num_gpus)

        if resource['num_gpus'] == 0:
            raise NoGPUError(
                f'\tNo GPUs available to train {self.name}. Resources: {resource}'
            )

        # Set seed
        seed = self.params.get('seed')
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)
            mx.random.seed(seed)

        X = self.preprocess(X, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)

        if not self.feature_metadata.get_features(valid_raw_types=['object']):
            raise NoValidFeatures(f'No text features to train {self.name}.')

        column_properties = self._build_model(X=X,
                                              y=y,
                                              X_val=X_val,
                                              y_val=y_val,
                                              hyperparameters=self.params)
        # Insert the label column (DataFrame.insert works in place on the
        # preprocessed frames).
        X.insert(len(X.columns), self._label_column_name, y)
        if X_val is not None:
            X_val.insert(len(X_val.columns), self._label_column_name, y_val)

        hpo_params = self.params['hpo_params']
        search_strategy = hpo_params['search_strategy']
        # Work on a private copy so the hyperband defaults below are not
        # written back into the shared hyperparameter dictionary.
        scheduler_options = hpo_params['scheduler_options']
        scheduler_options = {} if scheduler_options is None else dict(scheduler_options)
        if search_strategy.endswith('hyperband'):
            # Specific defaults for hyperband scheduling
            scheduler_options.setdefault('reduction_factor', 4)
            scheduler_options.setdefault('grace_period', 10)
            scheduler_options.setdefault('max_t', 50)

        if X_val is None:
            # FIXME: v0.1 Update TextPrediction to use all training data in refit_full
            holdout_frac = default_holdout_frac(len(X), True)
            X, X_val = random_split_train_val(X, valid_ratio=holdout_frac)
        train_data = TabularDataset(X,
                                    column_properties=column_properties,
                                    label_columns=self._label_column_name)
        logger.log(15, 'Train Dataset:')
        logger.log(15, train_data)
        tuning_data = TabularDataset(X_val,
                                     column_properties=column_properties,
                                     label_columns=self._label_column_name)
        logger.log(15, 'Tuning Dataset:')
        logger.log(15, tuning_data)

        if time_limit is not None:
            # Only the budget that is left after the setup above goes to training.
            time_limit = time_limit - (time.time() - time_start)

        # FIXME: Inner error message if no text features is not helpful
        self.model.train(
            train_data=train_data,
            tuning_data=tuning_data,
            resource=resource,
            time_limits=time_limit,
            search_strategy=search_strategy,
            search_options=hpo_params['search_options'],
            scheduler_options=scheduler_options,
            num_trials=hpo_params['num_trials'],
            console_log=verbosity >= 3,
            ignore_warning=verbosity < 3,
            verbosity=verbosity - 1)

    def save(self, path: str = None, verbose=True) -> str:
        """Save the wrapper object and the text network separately.

        The wrapper is saved through the parent class' ``save`` with
        ``self.model`` temporarily set to None, so the network is not part of
        that file; ``self.model`` is restored even when the parent ``save``
        raises. The network is then saved with its own ``save`` to
        ``<path>/<self.nn_model_name>``, where ``path`` is the value returned
        by the parent ``save``.

        Parameters
        ----------
        path
            Target location, forwarded to the parent ``save``.
        verbose
            Forwarded to the parent ``save``.

        Returns
        -------
        path
            The path returned by the parent ``save``.
        """
        model = self.model
        self.model = None
        try:
            # save this AbstractModel object without NN weights
            path = super().save(path=path, verbose=verbose)
        finally:
            # Never leave the instance without its network, even on failure.
            self.model = model

        text_nn_path = os.path.join(path, self.nn_model_name)
        model.save(text_nn_path)
        logger.log(
            15,
            f"\tSaved Text NN weights and model hyperparameters to '{text_nn_path}'."
        )

        return path

    @classmethod
    def load(cls, path: str, reset_paths=True, verbose=True):
        """Restore a saved model together with its text neural network.

        The parent class' ``load`` restores the wrapper object; the network is
        then read by ``BertForTextPredictionBasic.load`` from
        ``<path>/<cls.nn_model_name>`` and attached as ``model``.

        Parameters
        ----------
        path
            Directory that contains the saved files.
        reset_paths
            Forwarded unchanged to the parent ``load``.
        verbose
            Forwarded unchanged to the parent ``load``.

        Returns
        -------
        model
            The loaded object with its ``model`` attribute set to the network.

        Raises
        ------
        ImportError
            If the autogluon.text modules cannot be imported.
        """
        try:
            from autogluon.text.text_prediction.dataset import TabularDataset
            from autogluon.text.text_prediction.models.basic_v1 import BertForTextPredictionBasic
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)

        parent_load_kwargs = dict(path=path,
                                  reset_paths=reset_paths,
                                  verbose=verbose)
        restored = super().load(**parent_load_kwargs)
        network_dir = os.path.join(path, cls.nn_model_name)
        restored.model = BertForTextPredictionBasic.load(network_dir)
        return restored

    def get_memory_size(self) -> int:
        """Return the memory size by calculating the total number of parameters.

        For every parameter returned by ``self.model.net.collect_params()``
        the number of elements (product of its shape) is multiplied by the
        item size of its dtype, and the results are summed.

        Returns
        -------
        memory_size
            The total memory size in bytes, as a built-in ``int``.
        """
        total_size = 0
        for _name, param in self.model.net.collect_params().items():
            # Force a 64-bit product and convert to a Python int so the result
            # is a plain int regardless of platform or an empty shape.
            num_elements = int(np.prod(param.shape, dtype=np.int64))
            total_size += np.dtype(param.dtype).itemsize * num_elements
        return total_size

    def _get_default_resources(self):
        """Return ``(num_cpus, num_gpus)`` as reported by ``get_cpu_count`` and ``get_gpu_count``."""
        return get_cpu_count(), get_gpu_count()