Example #1
def create(dataset,
           target,
           features=None,
           max_iterations=10,
           validation_set="auto",
           class_weights=None,
           max_depth=6,
           step_size=0.3,
           min_loss_reduction=0.0,
           min_child_weight=0.1,
           row_subsample=1.0,
           column_subsample=1.0,
           verbose=True,
           random_seed=None,
           metric="auto",
           **kwargs):
    """
    Create a (binary or multi-class) classifier model of type
    :class:`~turicreate.boosted_trees_classifier.BoostedTreesClassifier` using
    gradient boosted trees (sometimes known as GBMs).

    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.

    target : str
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.  String target variables are
        automatically mapped to integers in alphabetical order of the variable values.
        For example, a target variable with 'cat', 'dog', and 'foosa' as possible
        values is mapped to 0, 1, and 2, respectively.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, which uses all columns in the SFrame ``dataset``
        except the target column.

    max_iterations : int, optional
        The maximum number of iterations for boosting. Each iteration results
        in the creation of an extra tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. This is computed once per full iteration. Large
        differences in model accuracy between the training data and validation
        data are indicative of overfitting. The default value is 'auto'.

    class_weights : {dict, `auto`}, optional

        Weights the examples in the training data according to the given class
        weights. If provided, the dictionary must contain a key for each class
        label. The value can be any positive number greater than 1e-20. Weights
        are interpreted as relative to each other. So setting the weights to be
        2.0 for the positive class and 1.0 for the negative class has the same
        effect as setting them to be 20.0 and 10.0, respectively. If set to
        `None`, all classes are taken to have weight 1.0. The `auto` mode sets
        the class weight to be inversely proportional to the number of examples
        in the training data with the given class.

    max_depth : float, optional
        Maximum depth of a tree. Must be at least 1.

    step_size : float, [0,1], optional
        Step size (shrinkage) used in each update to prevent overfitting. It
        shrinks the prediction of each weak learner to make the boosting
        process more conservative. The smaller the step size, the more
        conservative the algorithm will be. Smaller step sizes work well when
        `max_iterations` is large.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split a
        node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, [0,1], optional
        Subsample the ratio of the training set in each iteration of tree
        construction.  This is called the bagging trick and can usually help
        prevent overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the examples (rows) to grow each tree.

    column_subsample : float, [0,1], optional
        Subsample ratio of the columns in each iteration of tree
        construction.  Like row_subsample, this can also help prevent
        model overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the columns to grow each tree.

    verbose : boolean, optional
        If True, print progress information during training.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'accuracy', 'auc', 'log_loss'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``early_stopping_rounds`` : int, default None
            If the validation metric does not improve within ``early_stopping_rounds``
            iterations, stop training and return the best model.
            If multiple metrics are being tracked, the last one is used.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must be trained
            on exactly the same training data as the checkpointed model.

    Returns
    -------
      out : BoostedTreesClassifier
          A trained gradient boosted trees model for classification tasks.

    References
    ----------

    - `Wikipedia - Gradient tree boosting
      <http://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting>`_
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    BoostedTreesClassifier, turicreate.logistic_classifier.LogisticClassifier, turicreate.svm_classifier.SVMClassifier

    Examples
    --------

    .. sourcecode:: python

      >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
      >>> data = turicreate.SFrame.read_csv(url)

      >>> train, test = data.random_split(0.8)
      >>> model = turicreate.boosted_trees_classifier.create(train, target='label')

      >>> predictions = model.classify(test)
      >>> results = model.evaluate(test)
    """
    if random_seed is not None:
        kwargs["random_seed"] = random_seed
    if "model_checkpoint_path" in kwargs:
        kwargs["model_checkpoint_path"] = _make_internal_url(
            kwargs["model_checkpoint_path"])
    if "resume_from_checkpoint" in kwargs:
        kwargs["resume_from_checkpoint"] = _make_internal_url(
            kwargs["resume_from_checkpoint"])

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name="boosted_trees_classifier",
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       class_weights=class_weights,
                       max_depth=max_depth,
                       step_size=step_size,
                       min_loss_reduction=min_loss_reduction,
                       min_child_weight=min_child_weight,
                       row_subsample=row_subsample,
                       column_subsample=column_subsample,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return BoostedTreesClassifier(model.__proxy__)
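
A minimal usage sketch for the function above, assuming the mushroom dataset
and its 'label' column from the docstring example; ``early_stopping_rounds``
and ``random_seed`` are used exactly as documented in the parameter list.

import turicreate as tc

url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
data = tc.SFrame.read_csv(url)
train, valid = data.random_split(0.8, seed=1)

# Track log_loss on an explicit validation set and stop training early
# if it fails to improve for 5 consecutive iterations.
model = tc.boosted_trees_classifier.create(train,
                                           target='label',
                                           validation_set=valid,
                                           max_iterations=100,
                                           metric='log_loss',
                                           early_stopping_rounds=5,
                                           random_seed=1)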
Example #2
def create(dataset,
           target,
           features=None,
           validation_set='auto',
           class_weights=None,
           max_depth=6,
           min_loss_reduction=0.0,
           min_child_weight=0.1,
           verbose=True,
           random_seed=None,
           metric='auto',
           **kwargs):
    """
    Create a (binary or multi-class) classifier model of type
    :class:`~turicreate.decision_tree_classifier.DecisionTreeClassifier`. This
    algorithm is a special case of boosted trees classifier with the number
    of trees set to 1.

    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.

    target : str
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.  String target variables are
        automatically mapped to integers in alphabetical order of the variable values.
        For example, a target variable with 'cat', 'dog', and 'foosa' as possible
        values is mapped to 0, 1, and 2, respectively.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, which uses all columns in the SFrame ``dataset``
        except the target column.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. This is computed once per full iteration. Large
        differences in model accuracy between the training data and validation
        data are indicative of overfitting. The default value is 'auto'.

    class_weights : {dict, `auto`}, optional

        Weights the examples in the training data according to the given class
        weights. If provided, the dictionary must contain a key for each class
        label. The value can be any positive number greater than 1e-20. Weights
        are interpreted as relative to each other. So setting the weights to be
        2.0 for the positive class and 1.0 for the negative class has the same
        effect as setting them to be 20.0 and 10.0, respectively. If set to
        `None`, all classes are taken to have weight 1.0. The `auto` mode sets
        the class weight to be inversely proportional to the number of examples
        in the training data with the given class.

    max_depth : float, optional
        Maximum depth of a tree. Must be at least 1.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split a
        node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    verbose : boolean, optional
        If True, print progress information during training.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'accuracy', 'auc', 'log_loss'}

    Returns
    -------
      out : DecisionTreeClassifier
          A trained decision tree model for classification tasks.

    References
    ----------

    - `Wikipedia - Gradient tree boosting
      <http://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting>`_
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    turicreate.logistic_classifier.LogisticClassifier, turicreate.svm_classifier.SVMClassifier

    Examples
    --------

    .. sourcecode:: python

      >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
      >>> data = turicreate.SFrame.read_csv(url)

      >>> train, test = data.random_split(0.8)
      >>> model = turicreate.decision_tree_classifier.create(train, target='label')

      >>> predictions = model.classify(test)
      >>> results = model.evaluate(test)
    """
    if random_seed is not None:
        kwargs['random_seed'] = random_seed

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name='decision_tree_classifier',
                       validation_set=validation_set,
                       class_weights=class_weights,
                       max_depth=max_depth,
                       min_loss_reduction=min_loss_reduction,
                       min_child_weight=min_child_weight,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return DecisionTreeClassifier(model.__proxy__)
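
A short sketch of the ``class_weights`` modes described above, assuming an
SFrame ``train`` with an imbalanced integer 'label' column (the labels 0 and
1 and the weight values are illustrative):

import turicreate as tc

# 'auto' weights each class inversely proportional to its frequency in
# the training data.
model_auto = tc.decision_tree_classifier.create(train,
                                                target='label',
                                                class_weights='auto',
                                                metric=['accuracy', 'auc'])

# Equivalently, an explicit dictionary with one key per class label;
# per the docs, only the relative magnitudes of the weights matter.
model_dict = tc.decision_tree_classifier.create(train,
                                                target='label',
                                                class_weights={0: 1.0, 1: 5.0})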
Example #3
def create(dataset,
           target,
           features=None,
           max_iterations=10,
           validation_set='auto',
           verbose=True,
           random_seed=None,
           metric='auto',
           **kwargs):
    """
    Create a :class:`~turicreate.random_forest_regression.RandomForestRegression` to predict
    a scalar target variable using one or more features. In addition to standard
    numeric and categorical types, features can also be extracted automatically
    from list- or dictionary-type SFrame columns.


    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.
        The target column must be of numeric type (int or float).

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, which uses all columns except the target column.

    max_iterations : int, optional
        The number of iterations to perform.

    max_depth : float, optional
        Maximum depth of a tree.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split a
        node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, optional
        Subsample the ratio of the training set in each iteration of tree
        construction.  This is called the bagging trick and can usually help
        prevent overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the examples (rows) to grow each tree.

    column_subsample : float, optional
        Subsample ratio of the columns in each iteration of tree
        construction.  Like row_subsample, this can also help prevent
        model overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the columns to grow each tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. This is computed once per full iteration. Large
        differences in model accuracy between the training data and validation
        data are indicative of overfitting. The default value is 'auto'.


    verbose : boolean, optional
        If True, print progress information during training.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'rmse', 'max_error'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must be trained
            on exactly the same training data as the checkpointed model.

    Returns
    -------
      out : RandomForestRegression
          A trained random forest model for regression tasks.

    References
    ----------
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    RandomForestRegression, turicreate.linear_regression.LinearRegression, turicreate.regression.create

    Examples
    --------

    Setup the data:

    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = turicreate.SFrame.read_csv(url)
    >>> data['label'] = data['label'] == 'p'

    Split the data into training and test data:

    >>> train, test = data.random_split(0.8)

    Create the model:

    >>> model = turicreate.random_forest_regression.create(train, target='label')

    Make predictions and evaluate the model:

    >>> predictions = model.predict(test)
    >>> results = model.evaluate(test)

    """
    if random_seed is not None:
        kwargs['random_seed'] = random_seed
    if 'model_checkpoint_path' in kwargs:
        kwargs['model_checkpoint_path'] = _make_internal_url(
            kwargs['model_checkpoint_path'])
    if 'resume_from_checkpoint' in kwargs:
        kwargs['resume_from_checkpoint'] = _make_internal_url(
            kwargs['resume_from_checkpoint'])
    if 'num_trees' in kwargs:
        logger = _logging.getLogger(__name__)
        logger.warning(
            "The `num_trees` keyword argument is deprecated. Please "
            "use the `max_iterations` argument instead. Any value provided "
            "for `num_trees` will be used in place of `max_iterations`.")
        max_iterations = kwargs['num_trees']
        del kwargs['num_trees']

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name='random_forest_regression',
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return RandomForestRegression(model.__proxy__)
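
A sketch of the checkpoint/resume kwargs handled by the code above; the
``/tmp/model_tmp`` path is an illustrative choice and ``train`` is the split
from the docstring example:

import turicreate as tc

# Save a checkpoint every 5 iterations, producing
# /tmp/model_tmp/model_checkpoint_5, /tmp/model_tmp/model_checkpoint_10, ...
model = tc.random_forest_regression.create(train,
                                           target='label',
                                           max_iterations=20,
                                           model_checkpoint_path='/tmp/model_tmp',
                                           model_checkpoint_interval=5)

# Resume training later from one of those checkpoints; per the docs, the
# training data must be exactly the same as for the checkpointed model.
model = tc.random_forest_regression.create(
    train,
    target='label',
    max_iterations=40,
    resume_from_checkpoint='/tmp/model_tmp/model_checkpoint_10')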
Example #4
def create(dataset, target, features=None,
    l2_penalty=0.01, l1_penalty=0.0,
    solver='auto', feature_rescaling=True,
    convergence_threshold = _DEFAULT_SOLVER_OPTIONS['convergence_threshold'],
    step_size = _DEFAULT_SOLVER_OPTIONS['step_size'],
    lbfgs_memory_level = _DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'],
    max_iterations = _DEFAULT_SOLVER_OPTIONS['max_iterations'],
    class_weights = None,
    validation_set = 'auto',
    verbose=True,
    seed=None):
    """
    Create a :class:`~turicreate.logistic_classifier.LogisticClassifier` (using
    logistic regression as a classifier) to predict the class of a discrete
    target variable (binary or multiclass) based on a model of class probability
    as a logistic function of a linear combination of the features.  In addition
    to standard numeric and categorical types, features can also be extracted
    automatically from list or dictionary-type SFrame columns.

    This model can be regularized with an l1 penalty, an l2 penalty, or both. By
    default this model has an l2 regularization weight of 0.01.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string or int
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. String target variables are
        automatically mapped to integers in the order in which they are provided.
        For example, a target variable with 'cat' and 'dog' as possible
        values is mapped to 0 and 1 respectively with 0 being the base class
        and 1 being the reference class. Use `model.classes` to retrieve
        the order in which the classes are mapped.

    features : list[string], optional
        Names of the columns containing features. 'None' (the default) indicates
        that all columns except the target variable should be used as features.

        The features are columns in the input SFrame that can be of the
        following types:

        - *Numeric*: values of numeric type integer or float.

        - *Categorical*: values of type string.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a separate feature in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key of a dictionary is treated as a separate feature and the
          value in the dictionary corresponds to the value of the feature.
          Dictionaries are ideal for representing sparse data.

        Columns of type *list* are not supported. Convert such feature
        columns to type array if all entries in the list are of numeric
        types. If the lists contain data of mixed types, separate
        them out into different columns.

    l2_penalty : float, optional
        Weight on l2 regularization of the model. The larger this weight, the
        more the model coefficients shrink toward 0. This introduces bias into
        the model but decreases variance, potentially leading to better
        predictions. The default value is 0.01; setting this parameter to 0
        corresponds to unregularized logistic regression. See the ridge
        regression reference for more detail.

    l1_penalty : float, optional
        Weight on l1 regularization of the model. Like the l2 penalty, the
        higher the l1 penalty, the more the estimated coefficients shrink toward
        0. The l1 penalty, however, completely zeros out sufficiently small
        coefficients, automatically indicating features that are not useful
        for the model. The default weight of 0 prevents any features from
        being discarded. See the LASSO regression reference for more detail.

    solver : string, optional
        Name of the solver to be used to solve the regression. See the
        references for more detail on each solver. Available solvers are:

        - *auto (default)*: automatically chooses the best solver for the data
          and model parameters.
        - *newton*: Newton-Raphson
        - *lbfgs*: limited memory BFGS
        - *fista*: accelerated gradient descent

        For this model, the Newton-Raphson method is equivalent to the
        iteratively re-weighted least squares algorithm. If the l1_penalty is
        greater than 0, use the 'fista' solver.

        The model is trained using a carefully engineered collection of methods
        that are automatically picked based on the input data. The ``newton``
        method  works best for datasets with plenty of examples and few features
        (long datasets). Limited memory BFGS (``lbfgs``) is a robust solver for
        wide datasets (i.e. datasets with many coefficients).  ``fista`` is the
        default solver for l1-regularized linear regression. The solvers are all
        automatically tuned and the default options should function well. See
        the solver options guide for setting additional parameters for each of
        the solvers.

        See the user guide for additional details on how the solver is chosen.
        (see `here
        <https://apple.github.io/turicreate/docs/userguide/supervised-learning/linear-regression.html>`_)



    feature_rescaling : boolean, optional

        Feature rescaling is an important pre-processing step that ensures that
        all features are on the same scale. An l2-norm rescaling is performed
        to make sure that all features are of the same norm. Categorical
        features are also rescaled by rescaling the dummy variables that are
        used to represent them. The coefficients are returned in original scale
        of the problem. This process is particularly useful when features
        vary widely in their ranges.


    convergence_threshold : float, optional

        Convergence is tested using variation in the training objective. The
        variation in the training objective is calculated using the difference
        between the objective values between two steps. Consider reducing this
        below the default value (0.01) for a more accurately trained model.
        Beware of overfitting (i.e. a model that works well only on the training
        data) if this parameter is set to a very low value.

    lbfgs_memory_level : float, optional

        The L-BFGS algorithm keeps track of gradient information from the
        previous ``lbfgs_memory_level`` iterations. The storage requirement for
        each of these gradients is the ``num_coefficients`` in the problem.
        Increasing the ``lbfgs_memory_level`` can help improve the quality of
        the model trained. Setting this to more than ``max_iterations`` has the
        same effect as setting it to ``max_iterations``.

    max_iterations : float, optional

        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model. Consider
        increasing this (the default value is 10) if the training accuracy is
        low and the *Grad-Norm* in the display is large.

    step_size : float, optional

        The starting step size to use for the ``fista`` solver. The default is
        set to 1.0, this is an aggressive setting. If the first iteration takes
        a considerable amount of time, reducing this parameter may speed up
        model training.

    class_weights : {dict, `auto`}, optional

        Weights the examples in the training data according to the given class
        weights. If set to `None`, all classes are assumed to have weight one. The
        `auto` mode sets the class weight to be inversely proportional to the number
        of examples in the training data with the given class.

    validation_set : SFrame, optional

        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    verbose : bool, optional
        If True, print progress updates.

    seed : int, optional
        Seed for random number generation. Set this value to ensure that the
        same model is created every time.

    Returns
    -------
    out : LogisticClassifier
        A trained model of type
        :class:`~turicreate.logistic_classifier.LogisticClassifier`.

    See Also
    --------
    LogisticClassifier, turicreate.boosted_trees_classifier.BoostedTreesClassifier,
    turicreate.svm_classifier.SVMClassifier, turicreate.classifier.create

    Notes
    -----
    - Categorical variables are encoded by creating dummy variables. For a
      variable with :math:`K` categories, the encoding creates :math:`K-1` dummy
      variables, while the first category encountered in the data is used as the
      baseline.

    - For prediction and evaluation of logistic regression models with sparse
      dictionary inputs, new keys/columns that were not seen during training
      are silently ignored.

    - During model creation, 'None' values in the data will result in an error
      being thrown.

    - A constant term is automatically added for the model intercept. This term
      is not regularized.

    - Standard errors on coefficients are only available when `solver=newton`
      or when the default `auto` solver option chooses the newton method and
      the number of examples in the training data is more than the number of
      coefficients. If standard errors cannot be estimated, a column of `None`
      values is returned.


    References
    ----------
    - `Wikipedia - logistic regression
      <http://en.wikipedia.org/wiki/Logistic_regression>`_

    - Hoerl, A.E. and Kennard, R.W. (1970) `Ridge regression: Biased Estimation
      for Nonorthogonal Problems
      <http://amstat.tandfonline.com/doi/abs/10.1080/00401706.1970.10488634>`_.
      Technometrics 12(1) pp.55-67

    - Tibshirani, R. (1996) `Regression Shrinkage and Selection via the Lasso
      <http://www.jstor.org/discover/10.2307/2346178?uid=3739256&uid=2&uid=4&sid=21104169934983>`_.
      Journal of the Royal Statistical Society. Series B (Methodological)
      58(1) pp.267-288.

    - Zhu, C., et al. (1997) `Algorithm 778: L-BFGS-B: Fortran subroutines for
      large-scale bound-constrained optimization
      <https://dl.acm.org/citation.cfm?id=279236>`_. ACM Transactions on
      Mathematical Software 23(4) pp.550-560.

    - Beck, A. and Teboulle, M. (2009) `A Fast Iterative Shrinkage-Thresholding
      Algorithm for Linear Inverse Problems
      <http://epubs.siam.org/doi/abs/10.1137/080716542>`_. SIAM Journal on
      Imaging Sciences 2(1) pp.183-202.


    Examples
    --------

    Given an :class:`~turicreate.SFrame` ``sf``, a list of feature columns
    [``feature_1`` ... ``feature_K``], and a target column ``target`` with 0 and
    1 values, create a
    :class:`~turicreate.logistic_classifier.LogisticClassifier` as follows:

    >>> data =  turicreate.SFrame('https://static.turi.com/datasets/regression/houses.csv')
    >>> data['is_expensive'] = data['price'] > 30000
    >>> model = turicreate.logistic_classifier.create(data, 'is_expensive')

    By default all columns of ``data`` except the target are used as features, but
    specific feature columns can be specified manually.

    >>> model = turicreate.logistic_classifier.create(data, 'is_expensive', ['bedroom', 'size'])


    .. sourcecode:: python

      # L2 regularizer
      >>> model_ridge = turicreate.logistic_classifier.create(data, 'is_expensive', l2_penalty=0.1)

      # L1 regularizer
      >>> model_lasso = turicreate.logistic_classifier.create(data, 'is_expensive', l2_penalty=0.,
                                                                   l1_penalty=1.0)

      # Both L1 and L2 regularizer
      >>> model_enet  = turicreate.logistic_classifier.create(data, 'is_expensive', l2_penalty=0.5, l1_penalty=0.5)

    """


    # Classifier model name.
    model_name = "classifier_logistic_regression"
    solver = solver.lower()

    model = _sl.create(dataset, target, model_name, features=features,
                        validation_set = validation_set, verbose = verbose,
                        l2_penalty=l2_penalty, l1_penalty = l1_penalty,
                        feature_rescaling = feature_rescaling,
                        convergence_threshold = convergence_threshold,
                        step_size = step_size,
                        solver = solver,
                        lbfgs_memory_level = lbfgs_memory_level,
                        max_iterations = max_iterations,
                        class_weights = class_weights,
                        seed=seed)

    return LogisticClassifier(model.__proxy__)
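
A sketch of l1-regularized training with the ``fista`` solver, as the solver
notes above recommend when ``l1_penalty`` is greater than 0; inspecting the
trained model's ``coefficients`` field assumes the standard Turi Create
model API:

import turicreate as tc

data = tc.SFrame('https://static.turi.com/datasets/regression/houses.csv')
data['is_expensive'] = data['price'] > 30000

# Pure l1 regularization zeros out sufficiently small coefficients,
# effectively performing feature selection.
model = tc.logistic_classifier.create(data, 'is_expensive',
                                      l2_penalty=0.0,
                                      l1_penalty=1.0,
                                      solver='fista')

# Coefficients driven exactly to zero mark features the model discarded.
print(model.coefficients)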
Example #5
def create(dataset,
           target,
           features=None,
           max_iterations=10,
           validation_set='auto',
           max_depth=6,
           step_size=0.3,
           min_loss_reduction=0.0,
           min_child_weight=0.1,
           row_subsample=1.0,
           column_subsample=1.0,
           verbose=True,
           random_seed=None,
           metric='auto',
           **kwargs):
    """
    Create a :class:`~turicreate.boosted_trees_regression.BoostedTreesRegression` to predict
    a scalar target variable using one or more features. In addition to standard
    numeric and categorical types, features can also be extracted automatically
    from list- or dictionary-type SFrame columns.


    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.
        The target column must be of numeric type (int or float).

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, which uses all columns except the target column.

    max_iterations : int, optional
        The number of iterations for boosting. It is also the number of trees
        in the model.

    validation_set : SFrame, optional
        The validation set that is used to monitor the validation result as
        boosting progresses.

    max_depth : float, optional
        Maximum depth of a tree. Must be at least 1.

    step_size : float, [0,1],  optional
        Step size (shrinkage) used in each update to prevent overfitting. It
        shrinks the prediction of each weak learner to make the boosting
        process more conservative. The smaller the step size, the more
        conservative the algorithm will be. Smaller step sizes are usually
        used together with larger values of max_iterations.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split a
        node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, [0,1], optional
        Subsample the ratio of the training set in each iteration of tree
        construction.  This is called the bagging trick and can usually help
        prevent overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the examples (rows) to grow each tree.

    column_subsample : float, [0,1], optional
        Subsample ratio of the columns in each iteration of tree
        construction.  Like row_subsample, this can also help prevent
        model overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the columns to grow each tree.

    verbose : boolean, optional
        If True, print progress information during training.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'rmse', 'max_error'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``early_stopping_rounds`` : int, default None
            If the validation metric does not improve within ``early_stopping_rounds``
            iterations, stop training and return the best model.
            If multiple metrics are being tracked, the last one is used.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must be trained
            on exactly the same training data as the checkpointed model.

    Returns
    -------
      out : BoostedTreesRegression
          A trained gradient boosted trees model

    References
    ----------
    - `Wikipedia - Gradient tree boosting
      <http://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting>`_
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    BoostedTreesRegression, turicreate.linear_regression.LinearRegression, turicreate.regression.create

    Examples
    --------

    Setup the data:

    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = turicreate.SFrame.read_csv(url)
    >>> data['label'] = data['label'] == 'p'

    Split the data into training and test data:

    >>> train, test = data.random_split(0.8)

    Create the model:

    >>> model = turicreate.boosted_trees_regression.create(train, target='label')

    Make predictions and evaluate the model:

    >>> predictions = model.predict(test)
    >>> results = model.evaluate(test)

    """
    if random_seed is not None:
        kwargs['random_seed'] = random_seed
    if 'model_checkpoint_path' in kwargs:
        kwargs['model_checkpoint_path'] = _make_internal_url(
            kwargs['model_checkpoint_path'])
    if 'resume_from_checkpoint' in kwargs:
        kwargs['resume_from_checkpoint'] = _make_internal_url(
            kwargs['resume_from_checkpoint'])

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name='boosted_trees_regression',
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       max_depth=max_depth,
                       step_size=step_size,
                       min_loss_reduction=min_loss_reduction,
                       min_child_weight=min_child_weight,
                       row_subsample=row_subsample,
                       column_subsample=column_subsample,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return BoostedTreesRegression(model.__proxy__)
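
A sketch of the step_size/max_iterations trade-off described above, reusing
the ``train`` split from the docstring example; the specific values are
illustrative:

import turicreate as tc

# A smaller step size makes each tree's contribution more conservative,
# so it is paired with a larger iteration budget; early_stopping_rounds
# (a documented **kwargs option) cuts training short once validation
# rmse stops improving.
model = tc.boosted_trees_regression.create(train,
                                           target='label',
                                           max_iterations=200,
                                           step_size=0.05,
                                           metric='rmse',
                                           early_stopping_rounds=10,
                                           random_seed=1)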
Example #6
def create(
        dataset,
        target,
        features=None,
        penalty=1.0,
        solver='auto',
        feature_rescaling=True,
        convergence_threshold=_DEFAULT_SOLVER_OPTIONS['convergence_threshold'],
        lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'],
        max_iterations=_DEFAULT_SOLVER_OPTIONS['max_iterations'],
        class_weights=None,
        validation_set='auto',
        verbose=True):
    """
    Create a :class:`~turicreate.svm_classifier.SVMClassifier` to predict the class of a binary
    target variable based on a model of which side of a hyperplane the example
    falls on. In addition to standard numeric and categorical types, features
    can also be extracted automatically from list- or dictionary-type SFrame
    columns.

    The loss function for the SVM model is the sum of an L1 mis-classification
    loss (multiplied by the 'penalty' term) and an l2-norm on the weight vectors.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. String target variables are
        automatically mapped to integers in alphabetical order of the variable
        values. For example, a target variable with 'cat' and 'dog' as possible
        values is mapped to 0 and 1 respectively with 0 being the base class
        and 1 being the reference class.

    features : list[string], optional
        Names of the columns containing features. 'None' (the default) indicates
        that all columns except the target variable should be used as features.

        The features are columns in the input SFrame that can be of the
        following types:

        - *Numeric*: values of numeric type integer or float.

        - *Categorical*: values of type string.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a separate feature in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key of a dictionary is treated as a separate feature and the
          value in the dictionary corresponds to the value of the feature.
          Dictionaries are ideal for representing sparse data.

        Columns of type *list* are not supported. Convert such feature
        columns to type array if all entries in the list are of numeric
        types; if the lists contain data of mixed types, separate them
        out into different columns.

    penalty : float, optional
        Penalty term on the mis-classification loss of the model. The larger
        this weight, the more the model coefficients shrink toward 0.  The
        larger the penalty, the lower the emphasis placed on misclassified
        examples, and the more time the classifier spends maximizing the
        margin for correctly classified examples. The default value is 1.0;
        this parameter must be set to a value of at least 1e-10.


    solver : string, optional
        Name of the solver to be used to solve the problem. See the
        references for more detail on each solver. Available solvers are:

        - *auto (default)*: automatically chooses the best solver (from the ones
          listed below) for the data and model parameters.
        - *lbfgs*: limited memory BFGS (``lbfgs``), a robust solver for wide
          datasets (i.e. datasets with many coefficients).

        The solvers are all automatically tuned and the default options should
        function well. See the solver options guide for setting additional
        parameters for each of the solvers.

    feature_rescaling : bool, default = True

        Feature rescaling is an important pre-processing step that ensures
        that all features are on the same scale. An l2-norm rescaling is
        performed to make sure that all features are of the same norm. Categorical
        features are also rescaled by rescaling the dummy variables that
        are used to represent them. The coefficients are returned in original
        scale of the problem.

    convergence_threshold : float, optional

        Convergence is tested using variation in the training objective. The
        variation in the training objective is calculated using the difference
        between the objective values between two steps. Consider reducing this
        below the default value (0.01) for a more accurately trained model.
        Beware of overfitting (i.e. a model that works well only on the training
        data) if this parameter is set to a very low value.

    max_iterations : int, optional

        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model. Consider
        increasing this (the default value is 10) if the training accuracy is
        low and the *Grad-Norm* in the display is large.

    lbfgs_memory_level : int, optional

        The L-BFGS algorithm keeps track of gradient information from the
        previous ``lbfgs_memory_level`` iterations. The storage requirement for
        each of these gradients is the ``num_coefficients`` in the problem.
        Increasing the ``lbfgs_memory_level`` can help improve the quality of
        the model trained. Setting this to more than ``max_iterations`` has the
        same effect as setting it to ``max_iterations``.

    class_weights : {dict, `auto`}, optional

        Weights the examples in the training data according to the given class
        weights. If set to `None`, all classes are assumed to have weight one. The
        `auto` mode sets the class weight to be inversely proportional to the number
        of examples in the training data with the given class.

    validation_set : SFrame, optional

        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : SVMClassifier
        A trained model of type
        :class:`~turicreate.svm_classifier.SVMClassifier`.

    See Also
    --------
    SVMClassifier

    Notes
    -----
    - Categorical variables are encoded by creating dummy variables. For
      a variable with :math:`K` categories, the encoding creates :math:`K-1`
      dummy variables, while the first category encountered in the data is used
      as the baseline.

    - For prediction and evaluation of SVM models with sparse dictionary
      inputs, new keys/columns that were not seen during training are silently
      ignored.

    - The penalty parameter is analogous to the 'C' term in the C-SVM. See the
      reference on training SVMs for more details.

    - Any 'None' values in the data will result in an error being thrown.

    - A constant term of '1' is automatically added for the model intercept to
      model the bias term.

    - Note that the hinge loss is approximated by the scaled logistic loss
      function. (See user guide for details)

    References
    ----------
    - `Wikipedia - Support Vector Machines
      <http://en.wikipedia.org/wiki/svm>`_

    - Zhang et al. - Modified Logistic Regression: An Approximation to
      SVM and its Applications in Large-Scale Text Categorization (ICML 2003)


    Examples
    --------

    Given an :class:`~turicreate.SFrame` ``sf``, a list of feature columns
    [``feature_1`` ... ``feature_K``], and a target column ``target`` with 0 and
    1 values, create a
    :class:`~turicreate.svm.SVMClassifier` as follows:

    >>> data =  turicreate.SFrame('https://static.turi.com/datasets/regression/houses.csv')
    >>> data['is_expensive'] = data['price'] > 30000
    >>> model = turicreate.svm_classifier.create(data, 'is_expensive')
    """

    # Classifier model name.
    model_name = "classifier_svm"
    solver = solver.lower()

    model = _sl.create(dataset,
                       target,
                       model_name,
                       features=features,
                       validation_set=validation_set,
                       verbose=verbose,
                       penalty=penalty,
                       feature_rescaling=feature_rescaling,
                       convergence_threshold=convergence_threshold,
                       lbfgs_memory_level=lbfgs_memory_level,
                       max_iterations=max_iterations,
                       class_weights=class_weights)

    return SVMClassifier(model.__proxy__)
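
A sketch of sweeping the ``penalty`` parameter (analogous to the 'C' term in
C-SVM, per the notes above) and keeping the model with the best held-out
accuracy; the split and candidate values are assumptions:

import turicreate as tc

data = tc.SFrame('https://static.turi.com/datasets/regression/houses.csv')
data['is_expensive'] = data['price'] > 30000
train, valid = data.random_split(0.8, seed=1)

best_acc, best_model = None, None
for penalty in (0.1, 1.0, 10.0):
    model = tc.svm_classifier.create(train, 'is_expensive',
                                     penalty=penalty, verbose=False)
    # evaluate() reports accuracy among its metrics.
    acc = model.evaluate(valid)['accuracy']
    if best_acc is None or acc > best_acc:
        best_acc, best_model = acc, model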
Example #7
def create(dataset, target,
           features=None,
           max_iterations=10,
           validation_set='auto',
           verbose=True, class_weights=None,
           random_seed=None,
           metric='auto',
           **kwargs):
    """
    Create a (binary or multi-class) classifier model of type
    :class:`~turicreate.random_forest_classifier.RandomForestClassifier` using
    an ensemble of decision trees trained on subsets of the data.

    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.

    target : str
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.  String target variables are
        automatically mapped to integers in alphabetical order of the variable values.
        For example, a target variable with 'cat', 'dog', and 'foosa' as possible
        values is mapped to 0, 1, and 2, respectively.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, which uses all columns in the SFrame ``dataset``
        except the target column.

    max_iterations : int, optional
        The maximum number of iterations to perform. For multi-class
        classification with K classes, each iteration will create K-1 trees.

    max_depth : float, optional
        Maximum depth of a tree.

    class_weights : {dict, `auto`}, optional
        Weights the examples in the training data according to the given class
        weights. If set to `None`, all classes are assumed to have weight one. The
        `auto` mode sets the class weight to be inversely proportional to the number
        of examples in the training data with the given class.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition on a
        leaf node of the tree. The larger it is, the more conservative the
        algorithm will be. Must be non-negative.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, optional
        Subsample the ratio of the training set in each iteration of tree
        construction.  This is called the bagging trick and can usually help
        prevent overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the examples (rows) to grow each tree.

    column_subsample : float, optional
        Subsample ratio of the columns in each iteration of tree
        construction.  Like row_subsample, this can also help prevent
        model overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the columns to grow each tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. This is computed once per full iteration. Large
        differences in model accuracy between the training data and validation
        data are indicative of overfitting. The default value is 'auto'.

    verbose : boolean, optional
        If True, print progress information during training.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'accuracy', 'auc', 'log_loss'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must be trained
            on exactly the same training data as the checkpointed model.


    Returns
    -------
    out : RandomForestClassifier
        A trained random forest model for classification tasks.

    References
    ----------
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    RandomForestClassifier, turicreate.logistic_classifier.LogisticClassifier, turicreate.svm_classifier.SVMClassifier


    Examples
    --------

    .. sourcecode:: python

      >>> import turicreate
      >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
      >>> data = turicreate.SFrame.read_csv(url)

      >>> train, test = data.random_split(0.8)
      >>> model = turicreate.random_forest_classifier.create(train, target='label')

      >>> predictions = model.classify(test)
      >>> results = model.evaluate(test)
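
      # A sketch combining the class weighting, subsampling, and checkpointing
      # options documented above (the checkpoint path mirrors the one used in
      # the parameter description; adjust the values to your data)
      >>> model = turicreate.random_forest_classifier.create(train, target='label',
      ...                                                    class_weights='auto',
      ...                                                    row_subsample=0.5,
      ...                                                    column_subsample=0.5,
      ...                                                    model_checkpoint_path='/tmp/model_tmp',
      ...                                                    model_checkpoint_interval=5)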
    """

    if random_seed is not None:
        kwargs['random_seed'] = random_seed
    if 'model_checkpoint_path' in kwargs:
        kwargs['model_checkpoint_path'] = _make_internal_url(kwargs['model_checkpoint_path'])
    if 'resume_from_checkpoint' in kwargs:
        kwargs['resume_from_checkpoint'] = _make_internal_url(kwargs['resume_from_checkpoint'])
    if 'num_trees' in kwargs:
        logger = _logging.getLogger(__name__)
        logger.warning("The `num_trees` keyword argument is deprecated. Please "
              "use the `max_iterations` argument instead. Any value provided "
              "for `num_trees` will be used in place of `max_iterations`.")
        max_iterations = kwargs['num_trees']
        del kwargs['num_trees']

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name='random_forest_classifier',
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       class_weights=class_weights,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return RandomForestClassifier(model.__proxy__)
Example #8
def create(
        dataset,
        target,
        features=None,
        l2_penalty=1e-2,
        l1_penalty=0.0,
        solver='auto',
        feature_rescaling=True,
        convergence_threshold=_DEFAULT_SOLVER_OPTIONS['convergence_threshold'],
        step_size=_DEFAULT_SOLVER_OPTIONS['step_size'],
        lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'],
        max_iterations=_DEFAULT_SOLVER_OPTIONS['max_iterations'],
        validation_set="auto",
        verbose=True):
    """
    Create a :class:`~turicreate.linear_regression.LinearRegression` to
    predict a scalar target variable as a linear function of one or more
    features. In addition to standard numeric and categorical types, features
    can also be extracted automatically from list- or dictionary-type SFrame
    columns.

    The linear regression module can be used for ridge regression, Lasso, and
    elastic net regression (see References for more detail on these methods). By
    default, this model has an l2 regularization weight of 0.01.

    Parameters
    ----------
    dataset : SFrame
        The dataset to use for training the model.

    target : string
        Name of the column containing the target variable.

    features : list[string], optional
        Names of the columns containing features. 'None' (the default) indicates
        that all columns except the target variable should be used as features.

        The features are columns in the input SFrame that can be of the
        following types:

        - *Numeric*: values of numeric type integer or float.

        - *Categorical*: values of type string.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a separate feature in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key of a dictionary is treated as a separate feature and the
          value in the dictionary corresponds to the value of the feature.
          Dictionaries are ideal for representing sparse data.

        Columns of type *list* are not supported. Convert such feature
        columns to type array if all entries in the list are of numeric
        types. If the lists contain data of mixed types, separate
        them out into different columns.

    l2_penalty : float, optional
        Weight on the l2-regularizer of the model. The larger this weight, the
        more the model coefficients shrink toward 0. This introduces bias into
        the model but decreases variance, potentially leading to better
        predictions. The default value is 0.01; setting this parameter to 0
        corresponds to unregularized linear regression. See the ridge
        regression reference for more detail.

    l1_penalty : float, optional
        Weight on l1 regularization of the model. Like the l2 penalty, the
        higher the l1 penalty, the more the estimated coefficients shrink toward
        0. The l1 penalty, however, completely zeros out sufficiently small
        coefficients, automatically indicating features that are not useful for
        the model. The default weight of 0 prevents any features from being
        discarded. See the LASSO regression reference for more detail.

    solver : string, optional
        Solver to use for training the model. See the references for more detail
        on each solver.

        - *auto (default)*: automatically chooses the best solver for the data
          and model parameters.
        - *newton*: Newton-Raphson
        - *lbfgs*: limited memory BFGS
        - *fista*: accelerated gradient descent

        The model is trained using a carefully engineered collection of methods
        that are automatically picked based on the input data. The ``newton``
        method works best for datasets with plenty of examples and few features
        (long datasets). Limited memory BFGS (``lbfgs``) is a robust solver for
        wide datasets (i.e., datasets with many coefficients). ``fista`` is the
        default solver for l1-regularized linear regression. The solvers are
        all automatically tuned and the default options should function well.
        See the solver options guide for setting additional parameters for each
        of the solvers.

        See the user guide for additional details on how the solver is chosen.

    feature_rescaling : boolean, optional
        Feature rescaling is an important pre-processing step that ensures that
        all features are on the same scale. An l2-norm rescaling is performed
        to make sure that all features are of the same norm. Categorical
        features are also rescaled by rescaling the dummy variables that are
        used to represent them. The coefficients are returned in the original
        scale of the problem. This process is particularly useful when features
        vary widely in their ranges.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    convergence_threshold : float, optional
        Convergence is tested using variation in the training objective. The
        variation in the training objective is calculated using the difference
        between the objective values between two steps. Consider reducing this
        below the default value (0.01) for a more accurately trained model.
        Beware of overfitting (i.e., a model that works well only on the
        training data) if this parameter is set to a very low value.

    lbfgs_memory_level : int, optional
        The L-BFGS algorithm keeps track of gradient information from the
        previous ``lbfgs_memory_level`` iterations. The storage requirement for
        each of these gradients is the ``num_coefficients`` in the problem.
        Increasing the ``lbfgs_memory_level`` can help improve the quality of
        the model trained. Setting this to more than ``max_iterations`` has the
        same effect as setting it to ``max_iterations``.

    max_iterations : int, optional
        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model. Consider
        increasing this (the default value is 10) if the training accuracy is
        low and the *Grad-Norm* in the display is large.

    step_size : float, optional (fista only)
        The starting step size to use for the ``fista`` solver. The default is
        set to 1.0; this is an aggressive setting. If the first iteration takes
        a considerable amount of time, reducing this parameter may speed up
        model training.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : LinearRegression
        A trained model of type
        :class:`~turicreate.linear_regression.LinearRegression`.

    See Also
    --------
    LinearRegression, turicreate.boosted_trees_regression.BoostedTreesRegression, turicreate.regression.create

    Notes
    -----
    - Categorical variables are encoded by creating dummy variables. For a
      variable with :math:`K` categories, the encoding creates :math:`K-1` dummy
      variables, while the first category encountered in the data is used as the
      baseline.

    - For prediction and evaluation of linear regression models with sparse
      dictionary inputs, new keys/columns that were not seen during training
      are silently ignored.

    - Any 'None' values in the data will result in an error being raised.

    - A constant term is automatically added for the model intercept. This term
      is not regularized.

    - Standard errors on coefficients are only available when `solver=newton`
      or when the default `auto` solver option chooses the newton method and
      the number of examples in the training data is more than the number of
      coefficients. If standard errors cannot be estimated, a column of `None`
      values is returned.


    References
    ----------
    - Hoerl, A.E. and Kennard, R.W. (1970) `Ridge regression: Biased Estimation
      for Nonorthogonal Problems
      <http://amstat.tandfonline.com/doi/abs/10.1080/00401706.1970.10488634>`_.
      Technometrics 12(1) pp.55-67

    - Tibshirani, R. (1996) `Regression Shrinkage and Selection via the Lasso
      <http://www.jstor.org/discover/10.2307/2346178?uid=3739256&uid=2&uid=4&sid=21104169934983>`_.
      Journal of the Royal Statistical Society. Series B (Methodological)
      58(1) pp.267-288.

    - Zhu, C., et al. (1997) `Algorithm 778: L-BFGS-B: Fortran subroutines for
      large-scale bound-constrained optimization
      <https://dl.acm.org/citation.cfm?id=279236>`_. ACM Transactions on
      Mathematical Software 23(4) pp.550-560.

    - Barzilai, J. and Borwein, J. `Two-Point Step Size Gradient Methods
      <http://imajna.oxfordjournals.org/content/8/1/141.short>`_. IMA Journal of
      Numerical Analysis 8(1) pp.141-148.

    - Beck, A. and Teboulle, M. (2009) `A Fast Iterative Shrinkage-Thresholding
      Algorithm for Linear Inverse Problems
      <http://epubs.siam.org/doi/abs/10.1137/080716542>`_. SIAM Journal on
      Imaging Sciences 2(1) pp.183-202.

    - Zhang, T. (2004) `Solving large scale linear prediction problems using
      stochastic gradient descent algorithms
      <https://dl.acm.org/citation.cfm?id=1015332>`_. ICML '04: Proceedings of
      the twenty-first international conference on Machine learning p.116.


    Examples
    --------

    Given an :class:`~turicreate.SFrame` ``data`` with a list of columns
    [``feature_1`` ... ``feature_K``] denoting features and a target column
    ``target``, we can create a
    :class:`~turicreate.linear_regression.LinearRegression` as follows:

    >>> import turicreate
    >>> data = turicreate.SFrame('https://static.turi.com/datasets/regression/houses.csv')

    >>> model = turicreate.linear_regression.create(data, target='price',
    ...                                  features=['bath', 'bedroom', 'size'])


    For ridge regression, we can set the ``l2_penalty`` parameter higher (the
    default is 0.01). For Lasso regression, we set the l1_penalty higher, and
    for elastic net, we set both to be higher.

    .. sourcecode:: python

      # Ridge regression
      >>> model_ridge = turicreate.linear_regression.create(data, 'price', l2_penalty=0.1)

      # Lasso regression
      >>> model_lasso = turicreate.linear_regression.create(data, 'price',
      ...                                                   l2_penalty=0.,
      ...                                                   l1_penalty=1.0)

      # Elastic net regression
      >>> model_enet = turicreate.linear_regression.create(data, 'price',
      ...                                                  l2_penalty=0.5,
      ...                                                  l1_penalty=0.5)
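
      # A sketch that explicitly selects the L-BFGS solver for a wide dataset
      # and allows more passes over the data; the values are illustrative,
      # not recommendations
      >>> model_lbfgs = turicreate.linear_regression.create(data, 'price',
      ...                                                   solver='lbfgs',
      ...                                                   max_iterations=25,
      ...                                                   convergence_threshold=0.001)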

    """

    # Regression model names.
    model_name = "regression_linear_regression"
    solver = solver.lower()

    model = _sl.create(dataset,
                       target,
                       model_name,
                       features=features,
                       validation_set=validation_set,
                       solver=solver,
                       verbose=verbose,
                       l2_penalty=l2_penalty,
                       l1_penalty=l1_penalty,
                       feature_rescaling=feature_rescaling,
                       convergence_threshold=convergence_threshold,
                       step_size=step_size,
                       lbfgs_memory_level=lbfgs_memory_level,
                       max_iterations=max_iterations)

    return LinearRegression(model.__proxy__)
Example #9
def create(dataset,
           target,
           features=None,
           validation_set='auto',
           max_depth=6,
           min_loss_reduction=0.0,
           min_child_weight=0.1,
           verbose=True,
           random_seed=None,
           metric='auto',
           **kwargs):
    """
    Create a :class:`~turicreate.decision_tree_regression.DecisionTreeRegression` to predict
    a scalar target variable using one or more features. In addition to standard
    numeric and categorical types, features can also be extracted automatically
    from list- or dictionary-type SFrame columns.


    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.
        The target column must be of numeric type (int or float).

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, which uses all columns except the target column.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance as
        tree learning progresses.

    max_depth : float, optional
        Maximum depth of a tree. Must be at least 1.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split a
        node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is the minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    verbose : boolean, optional
        If True, print progress information during training.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on the training
        and validation sets.
        Supported metrics are: {'rmse', 'max_error'}

    Returns
    -------
    out : DecisionTreeRegression
        A trained decision tree model.

    References
    ----------
    - `Wikipedia - Gradient tree boosting
      <http://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting>`_
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    DecisionTreeRegression, turicreate.linear_regression.LinearRegression, turicreate.regression.create

    Examples
    --------

    Setup the data:

    >>> import turicreate
    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = turicreate.SFrame.read_csv(url)
    >>> data['label'] = data['label'] == 'p'

    Split the data into training and test data:

    >>> train, test = data.random_split(0.8)

    Create the model:

    >>> model = turicreate.decision_tree_regression.create(train, target='label')

    Make predictions and evaluate the model:

    >>> predictions = model.predict(test)
    >>> results = model.evaluate(test)
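
    The tree can be made more conservative, and a specific metric tracked
    during training (a sketch; the values shown are illustrative):

    >>> model = turicreate.decision_tree_regression.create(train, target='label',
    ...                                                    metric='rmse',
    ...                                                    min_loss_reduction=1.0)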

    """

    if random_seed is not None:
        kwargs['random_seed'] = random_seed

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name='decision_tree_regression',
                       validation_set=validation_set,
                       max_depth=max_depth,
                       min_loss_reduction=min_loss_reduction,
                       min_child_weight=min_child_weight,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return DecisionTreeRegression(model.__proxy__)