Example #1
class LDA(AmazonAlgorithmEstimatorBase):
    """An unsupervised learning algorithm attempting to describe data as distinct categories.

    LDA is most commonly used to discover a
    user-specified number of topics shared by documents within a text corpus. Here each
    observation is a document, the features are the presence (or occurrence count) of each
    word, and the categories are the topics.
    """

    repo_name = "lda"
    repo_version = 1

    num_topics = hp("num_topics", gt(0), "An integer greater than zero", int)
    alpha0 = hp("alpha0", gt(0), "A positive float", float)
    max_restarts = hp("max_restarts", gt(0), "An integer greater than zero",
                      int)
    max_iterations = hp("max_iterations", gt(0),
                        "An integer greater than zero", int)
    tol = hp("tol", gt(0), "A positive float", float)

    def __init__(self,
                 role,
                 instance_type=None,
                 num_topics=None,
                 alpha0=None,
                 max_restarts=None,
                 max_iterations=None,
                 tol=None,
                 **kwargs):
        """Latent Dirichlet Allocation (LDA) is :class:`Estimator` used for unsupervised learning.

        Amazon SageMaker Latent Dirichlet Allocation is an unsupervised
        learning algorithm that attempts to describe a set of observations as a
        mixture of distinct categories. LDA is most commonly used to discover a
        user-specified number of topics shared by documents within a text
        corpus. Here each observation is a document, the features are the
        presence (or occurrence count) of each word, and the categories are the
        topics.

        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`.
        It requires Amazon :class:`~sagemaker.amazon.record_pb2.Record` protobuf
        serialized data to be stored in S3. There is a utility
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set`
        that can be used to upload data to S3 and create a
        :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be passed to
        the `fit` call.

        To learn more about the Amazon protobuf Record class and how to
        prepare bulk data in this format, please consult AWS technical
        documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        After this Estimator is fit, model data is stored in S3. The model
        may be deployed to an Amazon SageMaker Endpoint by invoking
        :meth:`~sagemaker.estimator.EstimatorBase.deploy`. As well as
        deploying an Endpoint, deploy returns a
        :class:`~sagemaker.amazon.lda.LDAPredictor` object that can be used for
        inference calls using the trained model hosted in the SageMaker
        Endpoint.

        LDA Estimators can be configured by setting hyperparameters. The
        available hyperparameters for LDA are documented below.

        For further information on the AWS LDA algorithm, please consult AWS
        technical documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/lda.html

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon
                SageMaker training jobs and APIs that create Amazon SageMaker
                endpoints use this role to access training data and model
                artifacts. After the endpoint is created, the inference code
                might use the IAM role, if it needs to access an AWS resource.
            instance_type (str): Type of EC2 instance to use for training,
                for example, 'ml.c4.xlarge'.
            num_topics (int): The number of topics for LDA to find within the
                data.
            alpha0 (float): Optional. Initial guess for the concentration
                parameter
            max_restarts (int): Optional. The number of restarts to perform
                during the Alternating Least Squares (ALS) spectral
                decomposition phase of the algorithm.
            max_iterations (int): Optional. The maximum number of iterations to
                perform during the ALS phase of the algorithm.
            tol (float): Optional. Target error tolerance for the ALS phase of
                the algorithm.
            **kwargs: base class keyword argument values.

        .. tip::

            You can find additional parameters for initializing this class at
            :class:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase` and
            :class:`~sagemaker.estimator.EstimatorBase`.
        """
        # this algorithm only supports single instance training
        if kwargs.pop("instance_count", 1) != 1:
            print(
                "LDA only supports single instance training. Defaulting to 1 {}."
                .format(instance_type))

        super(LDA, self).__init__(role, 1, instance_type, **kwargs)
        self.num_topics = num_topics
        self.alpha0 = alpha0
        self.max_restarts = max_restarts
        self.max_iterations = max_iterations
        self.tol = tol

    def create_model(self, vpc_config_override=VPC_CONFIG_DEFAULT, **kwargs):
        """Return a :class:`~sagemaker.amazon.LDAModel`.

        It references the latest s3 model data produced by this Estimator.

        Args:
            vpc_config_override (dict[str, list[str]]): Optional override for
                VpcConfig set on the model.
                Default: use subnets and security groups from this Estimator.
                * 'Subnets' (list[str]): List of subnet ids.
                * 'SecurityGroupIds' (list[str]): List of security group ids.
            **kwargs: Additional kwargs passed to the LDAModel constructor.
        """
        return LDAModel(self.model_data,
                        self.role,
                        sagemaker_session=self.sagemaker_session,
                        vpc_config=self.get_vpc_config(vpc_config_override),
                        **kwargs)

    def _prepare_for_training(  # pylint: disable=signature-differs
            self,
            records,
            mini_batch_size,
            job_name=None):
        """Set hyperparameters needed for training.

        mini_batch_size is required for LDA, so explicit calls with ``None`` are rejected.
        """
        if mini_batch_size is None:
            raise ValueError("mini_batch_size must be set")

        super(LDA, self)._prepare_for_training(records,
                                               mini_batch_size=mini_batch_size,
                                               job_name=job_name)
Example #2
class FactorizationMachines(AmazonAlgorithmEstimatorBase):
    """Placeholder docstring"""

    repo_name = "factorization-machines"
    repo_version = 1

    num_factors = hp("num_factors", gt(0), "An integer greater than zero", int)
    predictor_type = hp(
        "predictor_type",
        isin("binary_classifier", "regressor"),
        'Value "binary_classifier" or "regressor"',
        str,
    )
    epochs = hp("epochs", gt(0), "An integer greater than 0", int)
    clip_gradient = hp("clip_gradient", (), "A float value", float)
    eps = hp("eps", (), "A float value", float)
    rescale_grad = hp("rescale_grad", (), "A float value", float)
    bias_lr = hp("bias_lr", ge(0), "A non-negative float", float)
    linear_lr = hp("linear_lr", ge(0), "A non-negative float", float)
    factors_lr = hp("factors_lr", ge(0), "A non-negative float", float)
    bias_wd = hp("bias_wd", ge(0), "A non-negative float", float)
    linear_wd = hp("linear_wd", ge(0), "A non-negative float", float)
    factors_wd = hp("factors_wd", ge(0), "A non-negative float", float)
    bias_init_method = hp(
        "bias_init_method",
        isin("normal", "uniform", "constant"),
        'Value "normal", "uniform" or "constant"',
        str,
    )
    bias_init_scale = hp("bias_init_scale", ge(0), "A non-negative float",
                         float)
    bias_init_sigma = hp("bias_init_sigma", ge(0), "A non-negative float",
                         float)
    bias_init_value = hp("bias_init_value", (), "A float value", float)
    linear_init_method = hp(
        "linear_init_method",
        isin("normal", "uniform", "constant"),
        'Value "normal", "uniform" or "constant"',
        str,
    )
    linear_init_scale = hp("linear_init_scale", ge(0), "A non-negative float",
                           float)
    linear_init_sigma = hp("linear_init_sigma", ge(0), "A non-negative float",
                           float)
    linear_init_value = hp("linear_init_value", (), "A float value", float)
    factors_init_method = hp(
        "factors_init_method",
        isin("normal", "uniform", "constant"),
        'Value "normal", "uniform" or "constant"',
        str,
    )
    factors_init_scale = hp("factors_init_scale", ge(0),
                            "A non-negative float", float)
    factors_init_sigma = hp("factors_init_sigma", ge(0),
                            "A non-negative float", float)
    factors_init_value = hp("factors_init_value", (), "A float value", float)

    def __init__(self,
                 role,
                 instance_count,
                 instance_type,
                 num_factors,
                 predictor_type,
                 epochs=None,
                 clip_gradient=None,
                 eps=None,
                 rescale_grad=None,
                 bias_lr=None,
                 linear_lr=None,
                 factors_lr=None,
                 bias_wd=None,
                 linear_wd=None,
                 factors_wd=None,
                 bias_init_method=None,
                 bias_init_scale=None,
                 bias_init_sigma=None,
                 bias_init_value=None,
                 linear_init_method=None,
                 linear_init_scale=None,
                 linear_init_sigma=None,
                 linear_init_value=None,
                 factors_init_method=None,
                 factors_init_scale=None,
                 factors_init_sigma=None,
                 factors_init_value=None,
                 **kwargs):
        """Factorization Machines is :class:`Estimator` for general-purpose
        supervised learning.

        Amazon SageMaker Factorization Machines is a general-purpose
        supervised learning algorithm that you can use for both classification
        and regression tasks. It is an extension of a linear model that is
        designed to parsimoniously capture interactions between features within
        high dimensional sparse datasets.

        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`.
        It requires Amazon :class:`~sagemaker.amazon.record_pb2.Record` protobuf
        serialized data to be stored in S3. There is a utility
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set`
        that can be used to upload data to S3 and create a
        :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be passed to
        the `fit` call.

        To learn more about the Amazon protobuf Record class and how to
        prepare bulk data in this format, please consult AWS technical
        documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        After this Estimator is fit, model data is stored in S3. The model
        may be deployed to an Amazon SageMaker Endpoint by invoking
        :meth:`~sagemaker.estimator.EstimatorBase.deploy`. As well as
        deploying an Endpoint, deploy returns a
        :class:`~sagemaker.amazon.factorization_machines.FactorizationMachinesPredictor` object
        that can be used for inference calls using the trained model hosted in
        the SageMaker Endpoint.

        FactorizationMachines Estimators can be configured by setting
        hyperparameters. The available hyperparameters for FactorizationMachines
        are documented below.

        For further information on the AWS FactorizationMachines algorithm,
        please consult AWS technical documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/fact-machines.html

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon
                SageMaker training jobs and APIs that create Amazon SageMaker
                endpoints use this role to access training data and model
                artifacts. After the endpoint is created, the inference code
                might use the IAM role, if it needs to access an AWS resource.
            instance_count (int): Number of Amazon EC2 instances to use
                for training.
            instance_type (str): Type of EC2 instance to use for training,
                for example, 'ml.c4.xlarge'.
            num_factors (int): Dimensionality of factorization.
            predictor_type (str): Type of predictor 'binary_classifier' or
                'regressor'.
            epochs (int): Number of training epochs to run.
            clip_gradient (float): Optimizer parameter. Clip the gradient by
                projecting onto the box [-clip_gradient, +clip_gradient]
            eps (float): Optimizer parameter. Small value to avoid division by
                0.
            rescale_grad (float): Optimizer parameter. If set, multiplies the
                gradient with rescale_grad before updating. Often choose to be
                1.0/batch_size.
            bias_lr (float): Non-negative learning rate for the bias term.
            linear_lr (float): Non-negative learning rate for linear terms.
            factors_lr (float): Non-negative learning rate for factorization
                terms.
            bias_wd (float): Non-negative weight decay for the bias term.
            linear_wd (float): Non-negative weight decay for linear terms.
            factors_wd (float): Non-negative weight decay for factorization
                terms.
            bias_init_method (string): Initialization method for the bias term:
                'normal', 'uniform' or 'constant'.
            bias_init_scale (float): Non-negative range for initialization of
                the bias term that takes effect when bias_init_method parameter
                is 'uniform'.
            bias_init_sigma (float): Non-negative standard deviation for
                initialization of the bias term that takes effect when
                bias_init_method parameter is 'normal'.
            bias_init_value (float): Initial value of the bias term that takes
                effect when bias_init_method parameter is 'constant'.
            linear_init_method (string): Initialization method for linear term:
                'normal', 'uniform' or 'constant'.
            linear_init_scale (float): Non-negative range for initialization of
                linear terms that takes effect when linear_init_method parameter
                is 'uniform'.
            linear_init_sigma (float): Non-negative standard deviation for
                initialization of linear terms that takes effect when
                linear_init_method parameter is 'normal'.
            linear_init_value (float): Initial value of linear terms that takes
                effect when linear_init_method parameter is 'constant'.
            factors_init_method (string): Initialization method for
                factorization term: 'normal', 'uniform' or 'constant'.
            factors_init_scale (float): Non-negative range for initialization of
                factorization terms that takes effect when factors_init_method
                parameter is 'uniform'.
            factors_init_sigma (float): Non-negative standard deviation for
                initialization of factorization terms that takes effect when
                factors_init_method parameter is 'normal'.
            factors_init_value (float): Initial value of factorization terms
                that takes effect when factors_init_method parameter is
                'constant'.
            **kwargs: base class keyword argument values.

        .. tip::

            You can find additional parameters for initializing this class at
            :class:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase` and
            :class:`~sagemaker.estimator.EstimatorBase`.
        """
        super(FactorizationMachines, self).__init__(role, instance_count,
                                                    instance_type, **kwargs)

        self.num_factors = num_factors
        self.predictor_type = predictor_type
        self.epochs = epochs
        self.clip_gradient = clip_gradient
        self.eps = eps
        self.rescale_grad = rescale_grad
        self.bias_lr = bias_lr
        self.linear_lr = linear_lr
        self.factors_lr = factors_lr
        self.bias_wd = bias_wd
        self.linear_wd = linear_wd
        self.factors_wd = factors_wd
        self.bias_init_method = bias_init_method
        self.bias_init_scale = bias_init_scale
        self.bias_init_sigma = bias_init_sigma
        self.bias_init_value = bias_init_value
        self.linear_init_method = linear_init_method
        self.linear_init_scale = linear_init_scale
        self.linear_init_sigma = linear_init_sigma
        self.linear_init_value = linear_init_value
        self.factors_init_method = factors_init_method
        self.factors_init_scale = factors_init_scale
        self.factors_init_sigma = factors_init_sigma
        self.factors_init_value = factors_init_value

    def create_model(self, vpc_config_override=VPC_CONFIG_DEFAULT, **kwargs):
        """Return a :class:`~sagemaker.amazon.FactorizationMachinesModel`
        referencing the latest s3 model data produced by this Estimator.

        Args:
            vpc_config_override (dict[str, list[str]]): Optional override for VpcConfig set on
                the model. Default: use subnets and security groups from this Estimator.
                * 'Subnets' (list[str]): List of subnet ids.
                * 'SecurityGroupIds' (list[str]): List of security group ids.
            **kwargs: Additional kwargs passed to the FactorizationMachinesModel constructor.
        """
        return FactorizationMachinesModel(
            self.model_data,
            self.role,
            sagemaker_session=self.sagemaker_session,
            vpc_config=self.get_vpc_config(vpc_config_override),
            **kwargs)
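In the same spirit, a hedged sketch of fitting a binary classifier with the class above; the role ARN, features, and labels are placeholder data, not a recommended configuration:

import numpy as np
from sagemaker.amazon.factorization_machines import FactorizationMachines

role_arn = "arn:aws:iam::123456789012:role/SageMakerRole"  # hypothetical role
features = np.random.rand(1000, 20).astype("float32")      # 1000 examples, 20 features
labels = np.random.randint(0, 2, size=1000).astype("float32")

fm = FactorizationMachines(
    role=role_arn,
    instance_count=1,
    instance_type="ml.c4.xlarge",
    num_factors=10,
    predictor_type="binary_classifier",
)

# Labels are stored in the "values" entry of each Record's label property.
train_records = fm.record_set(features, labels=labels, channel="train")
fm.fit(train_records)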
Example #3
class Object2Vec(AmazonAlgorithmEstimatorBase):
    """A general-purpose neural embedding algorithm that is highly customizable.

    It can learn low-dimensional dense embeddings of high-dimensional objects. The embeddings
    are learned in a way that preserves the semantics of the relationship between pairs of
    objects in the original space in the embedding space.
    """

    repo_name = "object2vec"
    repo_version = 1
    MINI_BATCH_SIZE = 32

    enc_dim = hp("enc_dim", (ge(4), le(10000)), "An integer in [4, 10000]", int)
    mini_batch_size = hp("mini_batch_size", (ge(1), le(10000)), "An integer in [1, 10000]", int)
    epochs = hp("epochs", (ge(1), le(100)), "An integer in [1, 100]", int)
    early_stopping_patience = hp(
        "early_stopping_patience", (ge(1), le(5)), "An integer in [1, 5]", int
    )
    early_stopping_tolerance = hp(
        "early_stopping_tolerance", (ge(1e-06), le(0.1)), "A float in [1e-06, 0.1]", float
    )
    dropout = hp("dropout", (ge(0.0), le(1.0)), "A float in [0.0, 1.0]", float)
    weight_decay = hp("weight_decay", (ge(0.0), le(10000.0)), "A float in [0.0, 10000.0]", float)
    bucket_width = hp("bucket_width", (ge(0), le(100)), "An integer in [0, 100]", int)
    num_classes = hp("num_classes", (ge(2), le(30)), "An integer in [2, 30]", int)
    mlp_layers = hp("mlp_layers", (ge(1), le(10)), "An integer in [1, 10]", int)
    mlp_dim = hp("mlp_dim", (ge(2), le(10000)), "An integer in [2, 10000]", int)
    mlp_activation = hp(
        "mlp_activation", isin("tanh", "relu", "linear"), 'One of "tanh", "relu", "linear"', str
    )
    output_layer = hp(
        "output_layer",
        isin("softmax", "mean_squared_error"),
        'One of "softmax", "mean_squared_error"',
        str,
    )
    optimizer = hp(
        "optimizer",
        isin("adagrad", "adam", "rmsprop", "sgd", "adadelta"),
        'One of "adagrad", "adam", "rmsprop", "sgd", "adadelta"',
        str,
    )
    learning_rate = hp("learning_rate", (ge(1e-06), le(1.0)), "A float in [1e-06, 1.0]", float)

    negative_sampling_rate = hp(
        "negative_sampling_rate", (ge(0), le(100)), "An integer in [0, 100]", int
    )
    comparator_list = hp(
        "comparator_list",
        _list_check_subset(["hadamard", "concat", "abs_diff"]),
        'Comma-separated of hadamard, concat, abs_diff. E.g. "hadamard,abs_diff"',
        str,
    )
    tied_token_embedding_weight = hp(
        "tied_token_embedding_weight", (), "Either True or False", bool
    )
    token_embedding_storage_type = hp(
        "token_embedding_storage_type",
        isin("dense", "row_sparse"),
        'One of "dense", "row_sparse"',
        str,
    )

    enc0_network = hp(
        "enc0_network",
        isin("hcnn", "bilstm", "pooled_embedding"),
        'One of "hcnn", "bilstm", "pooled_embedding"',
        str,
    )
    enc1_network = hp(
        "enc1_network",
        isin("hcnn", "bilstm", "pooled_embedding", "enc0"),
        'One of "hcnn", "bilstm", "pooled_embedding", "enc0"',
        str,
    )
    enc0_cnn_filter_width = hp("enc0_cnn_filter_width", (ge(1), le(9)), "An integer in [1, 9]", int)
    enc1_cnn_filter_width = hp("enc1_cnn_filter_width", (ge(1), le(9)), "An integer in [1, 9]", int)
    enc0_max_seq_len = hp("enc0_max_seq_len", (ge(1), le(5000)), "An integer in [1, 5000]", int)
    enc1_max_seq_len = hp("enc1_max_seq_len", (ge(1), le(5000)), "An integer in [1, 5000]", int)
    enc0_token_embedding_dim = hp(
        "enc0_token_embedding_dim", (ge(2), le(1000)), "An integer in [2, 1000]", int
    )
    enc1_token_embedding_dim = hp(
        "enc1_token_embedding_dim", (ge(2), le(1000)), "An integer in [2, 1000]", int
    )
    enc0_vocab_size = hp("enc0_vocab_size", (ge(2), le(3000000)), "An integer in [2, 3000000]", int)
    enc1_vocab_size = hp("enc1_vocab_size", (ge(2), le(3000000)), "An integer in [2, 3000000]", int)
    enc0_layers = hp("enc0_layers", (ge(1), le(4)), "An integer in [1, 4]", int)
    enc1_layers = hp("enc1_layers", (ge(1), le(4)), "An integer in [1, 4]", int)
    enc0_freeze_pretrained_embedding = hp(
        "enc0_freeze_pretrained_embedding", (), "Either True or False", bool
    )
    enc1_freeze_pretrained_embedding = hp(
        "enc1_freeze_pretrained_embedding", (), "Either True or False", bool
    )

    def __init__(
        self,
        role,
        instance_count,
        instance_type,
        epochs,
        enc0_max_seq_len,
        enc0_vocab_size,
        enc_dim=None,
        mini_batch_size=None,
        early_stopping_patience=None,
        early_stopping_tolerance=None,
        dropout=None,
        weight_decay=None,
        bucket_width=None,
        num_classes=None,
        mlp_layers=None,
        mlp_dim=None,
        mlp_activation=None,
        output_layer=None,
        optimizer=None,
        learning_rate=None,
        negative_sampling_rate=None,
        comparator_list=None,
        tied_token_embedding_weight=None,
        token_embedding_storage_type=None,
        enc0_network=None,
        enc1_network=None,
        enc0_cnn_filter_width=None,
        enc1_cnn_filter_width=None,
        enc1_max_seq_len=None,
        enc0_token_embedding_dim=None,
        enc1_token_embedding_dim=None,
        enc1_vocab_size=None,
        enc0_layers=None,
        enc1_layers=None,
        enc0_freeze_pretrained_embedding=None,
        enc1_freeze_pretrained_embedding=None,
        **kwargs
    ):
        """Object2Vec is :class:`Estimator` used for anomaly detection.

        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`.
        There is a utility
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set`
        that can be used to upload data to S3 and create a
        :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be passed to
        the `fit` call.

        After this Estimator is fit, model data is stored in S3. The model
        may be deployed to an Amazon SageMaker Endpoint by invoking
        :meth:`~sagemaker.estimator.EstimatorBase.deploy`. As well as
        deploying an Endpoint, deploy returns a
        :class:`~sagemaker.amazon.Predictor` object that can be used for
        inference calls using the trained model hosted in the SageMaker
        Endpoint.

        Object2Vec Estimators can be configured by setting hyperparameters.
        The available hyperparameters for Object2Vec are documented below.

        For further information on the AWS Object2Vec algorithm, please
        consult AWS technical documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/object2vec.html

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon
                SageMaker training jobs and APIs that create Amazon SageMaker
                endpoints use this role to access training data and model
                artifacts. After the endpoint is created, the inference code
                might use the IAM role, if it needs to access an AWS resource.
            instance_count (int): Number of Amazon EC2 instances to use
                for training.
            instance_type (str): Type of EC2 instance to use for training,
                for example, 'ml.c4.xlarge'.
            epochs (int): Total number of epochs for SGD training
            enc0_max_seq_len (int): Maximum sequence length
            enc0_vocab_size (int): Vocabulary size of tokens
            enc_dim (int): Optional. Dimension of the output of the embedding
                layer
            mini_batch_size (int): Optional. mini batch size for SGD training
            early_stopping_patience (int): Optional. The allowed number of
                consecutive epochs without improvement before early stopping is
                applied
            early_stopping_tolerance (float): Optional. The value used to
                determine whether the algorithm has made improvement between two
                consecutive epochs for early stopping
            dropout (float): Optional. Dropout probability on network layers
            weight_decay (float): Optional. Weight decay parameter during
                optimization
            bucket_width (int): Optional. The allowed difference between data
                sequence length when bucketing is enabled
            num_classes (int): Optional. Number of classes for classification
                training (ignored for regression problems)
            mlp_layers (int): Optional. Number of MLP layers in the network
            mlp_dim (int): Optional. Dimension of the output of MLP layer
            mlp_activation (str): Optional. Type of activation function for the
                MLP layer
            output_layer (str): Optional. Type of output layer
            optimizer (str): Optional. Type of optimizer for training
            learning_rate (float): Optional. Learning rate for SGD training
            negative_sampling_rate (int): Optional. Negative sampling rate
            comparator_list (str): Optional. Customization of comparator
                operator
            tied_token_embedding_weight (bool): Optional. Tying of token
                embedding layer weight
            token_embedding_storage_type (str): Optional. Type of token
                embedding storage
            enc0_network (str): Optional. Network model of encoder "enc0"
            enc1_network (str): Optional. Network model of encoder "enc1"
            enc0_cnn_filter_width (int): Optional. CNN filter width
            enc1_cnn_filter_width (int): Optional. CNN filter width
            enc1_max_seq_len (int): Optional. Maximum sequence length
            enc0_token_embedding_dim (int): Optional. Output dimension of token
                embedding layer
            enc1_token_embedding_dim (int): Optional. Output dimension of token
                embedding layer
            enc1_vocab_size (int): Optional. Vocabulary size of tokens
            enc0_layers (int): Optional. Number of layers in encoder
            enc1_layers (int): Optional. Number of layers in encoder
            enc0_freeze_pretrained_embedding (bool): Optional. Freeze pretrained
                embedding weights
            enc1_freeze_pretrained_embedding (bool): Optional. Freeze pretrained
                embedding weights
            **kwargs: base class keyword argument values.

        .. tip::

            You can find additional parameters for initializing this class at
            :class:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase` and
            :class:`~sagemaker.estimator.EstimatorBase`.
        """

        super(Object2Vec, self).__init__(role, instance_count, instance_type, **kwargs)

        self.enc_dim = enc_dim
        self.mini_batch_size = mini_batch_size
        self.epochs = epochs
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_tolerance = early_stopping_tolerance
        self.dropout = dropout
        self.weight_decay = weight_decay
        self.bucket_width = bucket_width
        self.num_classes = num_classes
        self.mlp_layers = mlp_layers
        self.mlp_dim = mlp_dim
        self.mlp_activation = mlp_activation
        self.output_layer = output_layer
        self.optimizer = optimizer
        self.learning_rate = learning_rate

        self.negative_sampling_rate = negative_sampling_rate
        self.comparator_list = comparator_list
        self.tied_token_embedding_weight = tied_token_embedding_weight
        self.token_embedding_storage_type = token_embedding_storage_type

        self.enc0_network = enc0_network
        self.enc1_network = enc1_network
        self.enc0_cnn_filter_width = enc0_cnn_filter_width
        self.enc1_cnn_filter_width = enc1_cnn_filter_width
        self.enc0_max_seq_len = enc0_max_seq_len
        self.enc1_max_seq_len = enc1_max_seq_len
        self.enc0_token_embedding_dim = enc0_token_embedding_dim
        self.enc1_token_embedding_dim = enc1_token_embedding_dim
        self.enc0_vocab_size = enc0_vocab_size
        self.enc1_vocab_size = enc1_vocab_size
        self.enc0_layers = enc0_layers
        self.enc1_layers = enc1_layers
        self.enc0_freeze_pretrained_embedding = enc0_freeze_pretrained_embedding
        self.enc1_freeze_pretrained_embedding = enc1_freeze_pretrained_embedding

    def create_model(self, vpc_config_override=VPC_CONFIG_DEFAULT, **kwargs):
        """Return a :class:`~sagemaker.amazon.Object2VecModel` referencing the
        latest s3 model data produced by this Estimator.

        Args:
            vpc_config_override (dict[str, list[str]]): Optional override for VpcConfig set on
                the model. Default: use subnets and security groups from this Estimator.
                * 'Subnets' (list[str]): List of subnet ids.
                * 'SecurityGroupIds' (list[str]): List of security group ids.
            **kwargs: Additional kwargs passed to the Object2VecModel constructor.
        """
        return Object2VecModel(
            self.model_data,
            self.role,
            sagemaker_session=self.sagemaker_session,
            vpc_config=self.get_vpc_config(vpc_config_override),
            **kwargs
        )

    def _prepare_for_training(self, records, mini_batch_size=None, job_name=None):
        """
        Args:
            records:
            mini_batch_size:
            job_name:
        """
        if mini_batch_size is None:
            mini_batch_size = self.MINI_BATCH_SIZE

        super(Object2Vec, self)._prepare_for_training(
            records, mini_batch_size=mini_batch_size, job_name=job_name
        )
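A constructor-level sketch for Object2Vec showing its three required hyperparameters (epochs, enc0_max_seq_len, enc0_vocab_size); the role ARN and all values are placeholders, and a default SageMaker session is assumed to be available:

from sagemaker.amazon.object2vec import Object2Vec

o2v = Object2Vec(
    role="arn:aws:iam::123456789012:role/SageMakerRole",  # hypothetical role
    instance_count=1,
    instance_type="ml.p2.xlarge",
    epochs=20,
    enc0_max_seq_len=100,
    enc0_vocab_size=10000,
    enc0_network="bilstm",
    optimizer="adam",
    learning_rate=0.001,
)

# Hyperparameters are validated by their hp descriptors and serialized for the job.
print(o2v.hyperparameters())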
class AmazonAlgorithmEstimatorBase(EstimatorBase):
    """Base class for Amazon first-party Estimator implementations. This class
    isn't intended to be instantiated directly.
    """

    feature_dim = hp("feature_dim", validation.gt(0), data_type=int)
    mini_batch_size = hp("mini_batch_size", validation.gt(0), data_type=int)
    repo_name = None
    repo_version = None

    def __init__(
        self,
        role,
        instance_count,
        instance_type,
        data_location=None,
        enable_network_isolation=False,
        **kwargs
    ):
        """Initialize an AmazonAlgorithmEstimatorBase.

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon
                SageMaker training jobs and APIs that create Amazon SageMaker
                endpoints use this role to access training data and model
                artifacts. After the endpoint is created, the inference code
                might use the IAM role, if it needs to access an AWS resource.
            instance_count (int): Number of Amazon EC2 instances to use
                for training.
            instance_type (str): Type of EC2 instance to use for training,
                for example, 'ml.c4.xlarge'.
            data_location (str or None): The s3 prefix to upload RecordSet
                objects to, expressed as an S3 url. For example
                "s3://example-bucket/some-key-prefix/". Objects will be saved in
                a unique sub-directory of the specified location. If None, a
                default data location will be used.
            enable_network_isolation (bool): Specifies whether container will
                run in network isolation mode. Network isolation mode restricts
                the container access to outside networks (such as the internet).
                Also known as internet-free mode (default: ``False``).
            **kwargs: Additional parameters passed to
                :class:`~sagemaker.estimator.EstimatorBase`.

        .. tip::

            You can find additional parameters for initializing this class at
            :class:`~sagemaker.estimator.EstimatorBase`.
        """
        super(AmazonAlgorithmEstimatorBase, self).__init__(
            role,
            instance_count,
            instance_type,
            enable_network_isolation=enable_network_isolation,
            **kwargs
        )

        data_location = data_location or "s3://{}/sagemaker-record-sets/".format(
            self.sagemaker_session.default_bucket()
        )
        self._data_location = data_location

    def training_image_uri(self):
        """Placeholder docstring"""
        return image_uris.retrieve(
            self.repo_name,
            self.sagemaker_session.boto_region_name,
            version=self.repo_version,
        )

    def hyperparameters(self):
        """Placeholder docstring"""
        return hp.serialize_all(self)

    @property
    def data_location(self):
        """Placeholder docstring"""
        return self._data_location

    @data_location.setter
    def data_location(self, data_location):
        """
        Args:
            data_location:
        """
        if not data_location.startswith("s3://"):
            raise ValueError(
                'Expecting an S3 URL beginning with "s3://". Got "{}"'.format(data_location)
            )
        if data_location[-1] != "/":
            data_location = data_location + "/"
        self._data_location = data_location

    @classmethod
    def _prepare_init_params_from_job_description(cls, job_details, model_channel_name=None):
        """Convert the job description to init params that can be handled by the
        class constructor.

        Args:
            job_details: the returned job details from a describe_training_job
                API call.
            model_channel_name (str): Name of the channel where pre-trained
                model data will be downloaded.

        Returns:
            dictionary: The transformed init_params
        """
        init_params = super(
            AmazonAlgorithmEstimatorBase, cls
        )._prepare_init_params_from_job_description(job_details, model_channel_name)

        # The hyperparam names may not be the same as the class attribute that holds them,
        # for instance: local_lloyd_init_method is called local_init_method. We need to map these
        # and pass the correct name to the constructor.
        for attribute, value in cls.__dict__.items():
            if isinstance(value, hp):
                if value.name in init_params["hyperparameters"]:
                    init_params[attribute] = init_params["hyperparameters"][value.name]

        del init_params["hyperparameters"]
        del init_params["image_uri"]
        return init_params

    def prepare_workflow_for_training(self, records=None, mini_batch_size=None, job_name=None):
        """Calls _prepare_for_training. Used when setting up a workflow.

        Args:
            records (:class:`~RecordSet`): The records to train this ``Estimator`` on.
            mini_batch_size (int or None): The size of each mini-batch to use when
                training. If ``None``, a default value will be used.
            job_name (str): Name of the training job to be created. If not
                specified, one is generated, using the base name given to the
                constructor if applicable.
        """
        self._prepare_for_training(
            records=records, mini_batch_size=mini_batch_size, job_name=job_name
        )

    def _prepare_for_training(self, records, mini_batch_size=None, job_name=None):
        """Set hyperparameters needed for training.

        Args:
            records (:class:`~RecordSet`): The records to train this ``Estimator`` on.
            mini_batch_size (int or None): The size of each mini-batch to use when
                training. If ``None``, a default value will be used.
            job_name (str): Name of the training job to be created. If not
                specified, one is generated, using the base name given to the
                constructor if applicable.
        """
        super(AmazonAlgorithmEstimatorBase, self)._prepare_for_training(job_name=job_name)

        feature_dim = None

        if isinstance(records, list):
            for record in records:
                if record.channel == "train":
                    feature_dim = record.feature_dim
                    break
            if feature_dim is None:
                raise ValueError("Must provide train channel.")
        else:
            feature_dim = records.feature_dim

        self.feature_dim = feature_dim
        self.mini_batch_size = mini_batch_size

    def fit(
        self,
        records,
        mini_batch_size=None,
        wait=True,
        logs=True,
        job_name=None,
        experiment_config=None,
    ):
        """Fit this Estimator on serialized Record objects, stored in S3.

        ``records`` should be an instance of :class:`~RecordSet`. This
        defines a collection of S3 data files to train this ``Estimator`` on.

        Training data is expected to be encoded as dense or sparse vectors in
        the "values" feature on each Record. If the data is labeled, the label
        is expected to be encoded as a list of scalars in the "values" feature of
        the Record label.

        More information on the Amazon Record format is available at:
        https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        See :meth:`~AmazonAlgorithmEstimatorBase.record_set` to construct a
        ``RecordSet`` object from :class:`~numpy.ndarray` arrays.

        Args:
            records (:class:`~RecordSet`): The records to train this ``Estimator`` on
            mini_batch_size (int or None): The size of each mini-batch to use
                when training. If ``None``, a default value will be used.
            wait (bool): Whether the call should wait until the job completes
                (default: True).
            logs (bool): Whether to show the logs produced by the job. Only
                meaningful when wait is True (default: True).
            job_name (str): Training job name. If not specified, the estimator
                generates a default job name, based on the training image name
                and current timestamp.
            experiment_config (dict[str, str]): Experiment management configuration.
                Dictionary contains three optional keys, 'ExperimentName',
                'TrialName', and 'TrialComponentName'
                (default: ``None``).
        """
        self._prepare_for_training(records, job_name=job_name, mini_batch_size=mini_batch_size)

        self.latest_training_job = _TrainingJob.start_new(
            self, records, experiment_config=experiment_config
        )
        if wait:
            self.latest_training_job.wait(logs=logs)

    def record_set(self, train, labels=None, channel="train", encrypt=False):
        """Build a :class:`~RecordSet` from a numpy :class:`~ndarray` matrix and
        label vector.

        For the 2D ``ndarray`` ``train``, each row is converted to a
        :class:`~Record` object. The vector is stored in the "values" entry of
        the ``features`` property of each Record. If ``labels`` is not None,
        each corresponding label is assigned to the "values" entry of the
        ``labels`` property of each Record.

        The collection of ``Record`` objects are protobuf serialized and
        uploaded to new S3 locations. A manifest file is generated containing
        the list of objects created and also stored in S3.

        The number of S3 objects created is controlled by the
        ``instance_count`` property on this Estimator. One S3 object is
        created per training instance.

        Args:
            train (numpy.ndarray): A 2D numpy array of training data.
            labels (numpy.ndarray): A 1D numpy array of labels. Its length must
                be equal to the number of rows in ``train``.
            channel (str): The SageMaker TrainingJob channel this RecordSet
                should be assigned to.
            encrypt (bool): Specifies whether the objects uploaded to S3 are
                encrypted on the server side using AES-256 (default: ``False``).

        Returns:
            RecordSet: A RecordSet referencing the encoded, uploaded training
            and label data.
        """
        s3 = self.sagemaker_session.boto_session.resource(
            "s3", region_name=self.sagemaker_session.boto_region_name
        )
        parsed_s3_url = urlparse(self.data_location)
        bucket, key_prefix = parsed_s3_url.netloc, parsed_s3_url.path
        key_prefix = key_prefix + "{}-{}/".format(type(self).__name__, sagemaker_timestamp())
        key_prefix = key_prefix.lstrip("/")
        logger.debug("Uploading to bucket %s and key_prefix %s", bucket, key_prefix)
        manifest_s3_file = upload_numpy_to_s3_shards(
            self.instance_count, s3, bucket, key_prefix, train, labels, encrypt
        )
        logger.debug("Created manifest file %s", manifest_s3_file)
        return RecordSet(
            manifest_s3_file,
            num_records=train.shape[0],
            feature_dim=train.shape[1],
            channel=channel,
        )
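To make the hp descriptor pattern used throughout these classes concrete, a small self-contained sketch; it assumes, per the validation messages above, that assigning an invalid value raises a ValueError citing that message:

from sagemaker.amazon.hyperparameter import Hyperparameter as hp
from sagemaker.amazon.validation import gt

class Toy(object):
    # Same descriptor form as the estimators above.
    num_topics = hp("num_topics", gt(0), "An integer greater than zero", int)

toy = Toy()
toy.num_topics = 5       # passes the gt(0) validation
try:
    toy.num_topics = -1  # fails validation
except ValueError as err:
    print(err)           # message cites "An integer greater than zero"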
class AmazonAlgorithmEstimatorBase(EstimatorBase):
    """Base class for Amazon first-party Estimator implementations. This class isn't intended
    to be instantiated directly."""

    feature_dim = hp('feature_dim', validation.gt(0), data_type=int)
    mini_batch_size = hp('mini_batch_size', validation.gt(0), data_type=int)

    def __init__(self,
                 role,
                 train_instance_count,
                 train_instance_type,
                 data_location=None,
                 **kwargs):
        """Initialize an AmazonAlgorithmEstimatorBase.

        Args:
            data_location (str or None): The s3 prefix to upload RecordSet objects to, expressed as an
                S3 url. For example "s3://example-bucket/some-key-prefix/". Objects will be
                saved in a unique sub-directory of the specified location. If None, a default
                data location will be used."""
        super(AmazonAlgorithmEstimatorBase,
              self).__init__(role, train_instance_count, train_instance_type,
                             **kwargs)

        data_location = data_location or "s3://{}/sagemaker-record-sets/".format(
            self.sagemaker_session.default_bucket())
        self.data_location = data_location

    def train_image(self):
        repo = '{}:{}'.format(type(self).repo_name, type(self).repo_version)
        return '{}/{}'.format(
            registry(self.sagemaker_session.boto_region_name,
                     type(self).repo_name), repo)

    def hyperparameters(self):
        return hp.serialize_all(self)

    @property
    def data_location(self):
        return self._data_location

    @data_location.setter
    def data_location(self, data_location):
        if not data_location.startswith('s3://'):
            raise ValueError(
                'Expecting an S3 URL beginning with "s3://". Got "{}"'.format(
                    data_location))
        if data_location[-1] != '/':
            data_location = data_location + '/'
        self._data_location = data_location

    @classmethod
    def _prepare_init_params_from_job_description(cls, job_details):
        """Convert the job description to init params that can be handled by the class constructor

        Args:
            job_details: the returned job details from a describe_training_job API call.

        Returns:
             dictionary: The transformed init_params

        """
        init_params = super(
            AmazonAlgorithmEstimatorBase,
            cls)._prepare_init_params_from_job_description(job_details)

        # The hyperparam names may not be the same as the class attribute that holds them,
        # for instance: local_lloyd_init_method is called local_init_method. We need to map these
        # and pass the correct name to the constructor.
        for attribute, value in cls.__dict__.items():
            if isinstance(value, hp):
                if value.name in init_params['hyperparameters']:
                    init_params[attribute] = init_params['hyperparameters'][
                        value.name]

        del init_params['hyperparameters']
        del init_params['image']
        return init_params

    def _prepare_for_training(self,
                              records,
                              mini_batch_size=None,
                              job_name=None):
        """Set hyperparameters needed for training.

        Args:
            records (:class:`~RecordSet`): The records to train this ``Estimator`` on.
            mini_batch_size (int or None): The size of each mini-batch to use when training.
                If ``None``, a default value will be used.
            job_name (str): Name of the training job to be created. If not specified,
                one is generated, using the base name given to the constructor if applicable.
        """
        super(AmazonAlgorithmEstimatorBase,
              self)._prepare_for_training(job_name=job_name)

        feature_dim = None

        if isinstance(records, list):
            for record in records:
                if record.channel == 'train':
                    feature_dim = record.feature_dim
                    break
            if feature_dim is None:
                raise ValueError('Must provide train channel.')
        else:
            feature_dim = records.feature_dim

        self.feature_dim = feature_dim
        self.mini_batch_size = mini_batch_size

    def fit(self,
            records,
            mini_batch_size=None,
            wait=True,
            logs=True,
            job_name=None):
        """Fit this Estimator on serialized Record objects, stored in S3.

        ``records`` should be an instance of :class:`~RecordSet`. This defines a collection of
        S3 data files to train this ``Estimator`` on.

        Training data is expected to be encoded as dense or sparse vectors in the "values" feature
        on each Record. If the data is labeled, the label is expected to be encoded as a list of
        scalars in the "values" feature of the Record label.

        More information on the Amazon Record format is available at:
        https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        See :meth:`~AmazonAlgorithmEstimatorBase.record_set` to construct a ``RecordSet`` object
        from :class:`~numpy.ndarray` arrays.

        Args:
            records (:class:`~RecordSet`): The records to train this ``Estimator`` on
            mini_batch_size (int or None): The size of each mini-batch to use when training. If ``None``, a
                default value will be used.
            wait (bool): Whether the call should wait until the job completes (default: True).
            logs (bool): Whether to show the logs produced by the job.
                Only meaningful when wait is True (default: True).
            job_name (str): Training job name. If not specified, the estimator generates a default job name,
                based on the training image name and current timestamp.
        """
        self._prepare_for_training(records,
                                   job_name=job_name,
                                   mini_batch_size=mini_batch_size)

        self.latest_training_job = _TrainingJob.start_new(self, records)
        if wait:
            self.latest_training_job.wait(logs=logs)

    def record_set(self, train, labels=None, channel="train"):
        """Build a :class:`~RecordSet` from a numpy :class:`~ndarray` matrix and label vector.

        For the 2D ``ndarray`` ``train``, each row is converted to a :class:`~Record` object.
        The vector is stored in the "values" entry of the ``features`` property of each Record.
        If ``labels`` is not None, each corresponding label is assigned to the "values" entry
        of the ``labels`` property of each Record.

        The collection of ``Record`` objects are protobuf serialized and uploaded to new
        S3 locations. A manifest file is generated containing the list of objects created and
        also stored in S3.

        The number of S3 objects created is controlled by the ``train_instance_count`` property
        on this Estimator. One S3 object is created per training instance.

        Args:
            train (numpy.ndarray): A 2D numpy array of training data.
            labels (numpy.ndarray): A 1D numpy array of labels. Its length must be equal to the
               number of rows in ``train``.
            channel (str): The SageMaker TrainingJob channel this RecordSet should be assigned to.
        Returns:
            RecordSet: A RecordSet referencing the encoded, uploaded training and label data.
        """
        s3 = self.sagemaker_session.boto_session.resource('s3')
        parsed_s3_url = urlparse(self.data_location)
        bucket, key_prefix = parsed_s3_url.netloc, parsed_s3_url.path
        key_prefix = key_prefix + '{}-{}/'.format(
            type(self).__name__, sagemaker_timestamp())
        key_prefix = key_prefix.lstrip('/')
        logger.debug('Uploading to bucket {} and key_prefix {}'.format(
            bucket, key_prefix))
        manifest_s3_file = upload_numpy_to_s3_shards(self.train_instance_count,
                                                     s3, bucket, key_prefix,
                                                     train, labels)
        logger.debug("Created manifest file {}".format(manifest_s3_file))
        return RecordSet(manifest_s3_file,
                         num_records=train.shape[0],
                         feature_dim=train.shape[1],
                         channel=channel)
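The data_location setter above validates and normalizes the S3 prefix. A self-contained reproduction of just that logic, for illustration only (DataLocationDemo is not part of the SDK):

class DataLocationDemo(object):
    """Reproduces the data_location setter behavior documented above."""

    @property
    def data_location(self):
        return self._data_location

    @data_location.setter
    def data_location(self, data_location):
        if not data_location.startswith("s3://"):
            raise ValueError(
                'Expecting an S3 URL beginning with "s3://". Got "{}"'.format(data_location))
        if data_location[-1] != "/":
            data_location = data_location + "/"
        self._data_location = data_location

demo = DataLocationDemo()
demo.data_location = "s3://example-bucket/prefix"
assert demo.data_location == "s3://example-bucket/prefix/"  # trailing "/" appended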
Example #6
class RandomCutForest(AmazonAlgorithmEstimatorBase):
    """An unsupervised algorithm for detecting anomalous data points within a dataset."""

    repo_name = 'randomcutforest'
    repo_version = 1
    MINI_BATCH_SIZE = 1000

    eval_metrics = hp(
        name='eval_metrics',
        validation_message='A comma separated list of "accuracy" or "precision_recall_fscore"',
        data_type=list)

    num_trees = hp('num_trees', (ge(50), le(1000)), 'An integer in [50, 1000]',
                   int)
    num_samples_per_tree = hp('num_samples_per_tree', (ge(1), le(2048)),
                              'An integer in [1, 2048]', int)
    feature_dim = hp("feature_dim", (ge(1), le(10000)),
                     'An integer in [1, 10000]', int)

    def __init__(self,
                 role,
                 train_instance_count,
                 train_instance_type,
                 num_samples_per_tree=None,
                 num_trees=None,
                 eval_metrics=None,
                 **kwargs):
        """RandomCutForest is :class:`Estimator` used for anomaly detection.

        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. It requires Amazon
        :class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3.
        There is a utility :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set` that
        can be used to upload data to S3 and create a :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be passed
        to the `fit` call.

        To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please
        consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
        Endpoint by invoking :meth:`~sagemaker.estimator.EstimatorBase.deploy`. As well as deploying an
        Endpoint, deploy returns a :class:`~sagemaker.amazon.randomcutforest.RandomCutForestPredictor` object that
        can be used for inference calls using the trained model hosted in the SageMaker Endpoint.

        RandomCutForest Estimators can be configured by setting hyperparameters. The available hyperparameters for
        RandomCutForest are documented below.

        For further information on the AWS Random Cut Forest algorithm,
        please consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/randomcutforest.html

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
                APIs that create Amazon SageMaker endpoints use this role to access
                training data and model artifacts. After the endpoint is created,
                the inference code might use the IAM role, if it needs to access an AWS resource.
            train_instance_count (int): Number of Amazon EC2 instances to use for training.
            train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
            num_samples_per_tree (int): Optional. The number of samples used to build each tree in the forest.
                The total number of samples drawn from the train dataset is num_trees * num_samples_per_tree.
            num_trees (int): Optional. The number of trees used in the forest.
            eval_metrics (list): Optional. JSON list of metrics types to be used for reporting the score for the model.
                Allowed values are "accuracy", "precision_recall_fscore": positive and negative precision, recall,
                and f1 scores. If test data is provided, the score shall be reported in terms of all requested metrics.
            **kwargs: base class keyword argument values.
        """

        super(RandomCutForest, self).__init__(role, train_instance_count,
                                              train_instance_type, **kwargs)
        self.num_samples_per_tree = num_samples_per_tree
        self.num_trees = num_trees
        self.eval_metrics = eval_metrics

    def create_model(self):
        """Return a :class:`~sagemaker.amazon.RandomCutForestModel` referencing the latest
        s3 model data produced by this Estimator."""

        return RandomCutForestModel(self.model_data,
                                    self.role,
                                    sagemaker_session=self.sagemaker_session)

    def fit(self, records, mini_batch_size=None, **kwargs):
        """Fit this Estimator, enforcing the fixed mini_batch_size that Random Cut Forest requires."""
        if mini_batch_size is None:
            mini_batch_size = RandomCutForest.MINI_BATCH_SIZE
        elif mini_batch_size != RandomCutForest.MINI_BATCH_SIZE:
            raise ValueError(
                "Random Cut Forest uses a fixed mini_batch_size of {}".format(
                    RandomCutForest.MINI_BATCH_SIZE))
        super(RandomCutForest, self).fit(records, mini_batch_size, **kwargs)
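
A minimal usage sketch for the RandomCutForest estimator above (an editor's addition, not part of the original example). It assumes a configured SageMaker session and the base class's `record_set` and `deploy` helpers; the role ARN and the `train_data` array are hypothetical.

import numpy as np

role = 'arn:aws:iam::123456789012:role/SageMakerRole'  # hypothetical role ARN
rcf = RandomCutForest(role=role,
                      train_instance_count=1,
                      train_instance_type='ml.m4.xlarge',
                      num_samples_per_tree=256,
                      num_trees=50)
# record_set uploads the array to S3 as protobuf Records and returns a RecordSet.
train_data = np.random.rand(10000, 4).astype('float32')
rcf.fit(rcf.record_set(train_data))  # mini_batch_size is fixed by the algorithm
predictor = rcf.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')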
Example #7
class KMeans(AmazonAlgorithmEstimatorBase):
    """An unsupervised learning algorithm that attempts to find discrete groupings within data.

    As a result of KMeans, members of a group are as similar as possible to one another and as
    different as possible from members of other groups. You define the attributes that you want
    the algorithm to use to determine similarity.
    """

    repo_name = "kmeans"
    repo_version = 1

    k = hp("k", gt(1), "An integer greater-than 1", int)
    init_method = hp("init_method", isin("random", "kmeans++"),
                     'One of "random", "kmeans++"', str)
    max_iterations = hp("local_lloyd_max_iter", gt(0),
                        "An integer greater-than 0", int)
    tol = hp("local_lloyd_tol", (ge(0), le(1)), "A float in [0, 1]", float)
    num_trials = hp("local_lloyd_num_trials", gt(0),
                    "An integer greater-than 0", int)
    local_init_method = hp("local_lloyd_init_method",
                           isin("random", "kmeans++"),
                           'One of "random", "kmeans++"', str)
    half_life_time_size = hp("half_life_time_size", ge(0),
                             "An integer greater-than-or-equal-to 0", int)
    epochs = hp("epochs", gt(0), "An integer greater-than 0", int)
    center_factor = hp("extra_center_factor", gt(0),
                       "An integer greater-than 0", int)
    eval_metrics = hp(
        name="eval_metrics",
        validation_message='A comma separated list of "msd" or "ssd"',
        data_type=list,
    )

    def __init__(self,
                 role,
                 instance_count,
                 instance_type,
                 k,
                 init_method=None,
                 max_iterations=None,
                 tol=None,
                 num_trials=None,
                 local_init_method=None,
                 half_life_time_size=None,
                 epochs=None,
                 center_factor=None,
                 eval_metrics=None,
                 **kwargs):
        """A k-means clustering
        :class:`~sagemaker.amazon.AmazonAlgorithmEstimatorBase`. Finds k
        clusters of data in an unlabeled dataset.

        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit_ndarray`
        or
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`.
        The former allows a KMeans model to be fit on a 2-dimensional numpy
        array. The latter requires Amazon
        :class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to
        be stored in S3.

        To learn more about the Amazon protobuf Record class and how to
        prepare bulk data in this format, please consult AWS technical
        documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html.

        After this Estimator is fit, model data is stored in S3. The model
        may be deployed to an Amazon SageMaker Endpoint by invoking
        :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as
        deploying an Endpoint, ``deploy`` returns a
        :class:`~sagemaker.amazon.kmeans.KMeansPredictor` object that can be
        used to determine k-means cluster assignments, using the trained k-means
        model hosted in the SageMaker Endpoint.

        KMeans Estimators can be configured by setting hyperparameters. The
        available hyperparameters for KMeans are documented below. For further
        information on the AWS KMeans algorithm, please consult AWS technical
        documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/k-means.html.

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon
                SageMaker training jobs and APIs that create Amazon SageMaker
                endpoints use this role to access training data and model
                artifacts. After the endpoint is created, the inference code
                might use the IAM role, if accessing AWS resources.
            instance_count (int): Number of Amazon EC2 instances to use
                for training.
            instance_type (str): Type of EC2 instance to use for training,
                for example, 'ml.c4.xlarge'.
            k (int): The number of clusters to produce.
            init_method (str): How to initialize cluster locations. One of
                'random' or 'kmeans++'.
            max_iterations (int): Maximum iterations for Lloyd's EM procedure in
                the local kmeans used in the finalize stage.
            tol (float): Tolerance for change in ssd for early stopping in local
                kmeans.
            num_trials (int): Local version is run multiple times and the one
                with the best loss is chosen. This determines how many times.
            local_init_method (str): Initialization method for local version.
                One of 'random', 'kmeans++'
            half_life_time_size (int): The points can have a decayed weight.
                When a point is observed its weight, with regard to the
                computation of the cluster mean is 1. This weight will decay
                exponentially as we observe more points. The exponent
                coefficient is chosen such that after observing
                ``half_life_time_size`` points after the mentioned point, its
                weight will become 1/2. If set to 0, there will be no decay.
            epochs (int): Number of passes done over the training data.
            center_factor (int): The algorithm will create
                ``num_clusters * extra_center_factor`` centers as it runs and
                reduce the number of centers to ``k`` when finalizing.
            eval_metrics (list): JSON list of metric types used to report the
                score for the model. Allowed values are "msd" (mean squared
                distance) and "ssd" (sum of squared distances). If test data is
                provided, the score will be reported for all requested metrics.
            **kwargs: base class keyword argument values.

        .. tip::

            You can find additional parameters for initializing this class at
            :class:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase` and
            :class:`~sagemaker.estimator.EstimatorBase`.
        """
        super(KMeans, self).__init__(role, instance_count, instance_type,
                                     **kwargs)
        self.k = k
        self.init_method = init_method
        self.max_iterations = max_iterations
        self.tol = tol
        self.num_trials = num_trials
        self.local_init_method = local_init_method
        self.half_life_time_size = half_life_time_size
        self.epochs = epochs
        self.center_factor = center_factor
        self.eval_metrics = eval_metrics

    def create_model(self, vpc_config_override=VPC_CONFIG_DEFAULT, **kwargs):
        """Return a :class:`~sagemaker.amazon.kmeans.KMeansModel` referencing
        the latest s3 model data produced by this Estimator.

        Args:
            vpc_config_override (dict[str, list[str]]): Optional override for
                VpcConfig set on the model.
                Default: use subnets and security groups from this Estimator.
                * 'Subnets' (list[str]): List of subnet ids.
                * 'SecurityGroupIds' (list[str]): List of security group ids.
            **kwargs: Additional kwargs passed to the KMeansModel constructor.
        """
        return KMeansModel(self.model_data,
                           self.role,
                           self.sagemaker_session,
                           vpc_config=self.get_vpc_config(vpc_config_override),
                           **kwargs)

    def _prepare_for_training(self,
                              records,
                              mini_batch_size=5000,
                              job_name=None):
        """
        Args:
            records:
            mini_batch_size:
            job_name:
        """
        super(KMeans,
              self)._prepare_for_training(records,
                                          mini_batch_size=mini_batch_size,
                                          job_name=job_name)

    def hyperparameters(self):
        """Return the SageMaker hyperparameters for training this KMeans
        Estimator
        """
        # KMeans requires this hyperparameter in order to fit on Record objects
        hp_dict = dict(force_dense="True")
        hp_dict.update(super(KMeans, self).hyperparameters())
        return hp_dict
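
A minimal usage sketch for the KMeans estimator above (editor's addition). It assumes a configured SageMaker session; the role ARN and the synthetic `train_data` are hypothetical, and `record_set`/`deploy` come from the base classes.

import numpy as np

role = 'arn:aws:iam::123456789012:role/SageMakerRole'  # hypothetical role ARN
kmeans = KMeans(role=role,
                instance_count=1,
                instance_type='ml.c4.xlarge',
                k=10)
train_data = np.random.rand(5000, 16).astype('float32')
kmeans.fit(kmeans.record_set(train_data))  # defaults to mini_batch_size=5000
# hyperparameters() now includes force_dense='True' alongside k and the rest.
predictor = kmeans.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')
result = predictor.predict(train_data[:5])  # one cluster assignment per record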
Example #8
class LinearLearner(AmazonAlgorithmEstimatorBase):
    """A supervised learning algorithms used for solving classification or regression problems.

    For input, you give the model labeled examples (x, y). x is a high-dimensional vector and
    y is a numeric label. For binary classification problems, the label must be either 0 or 1.
    For multiclass classification problems, the labels must be from 0 to num_classes - 1. For
    regression problems, y is a real number. The algorithm learns a linear function, or, for
    classification problems, a linear threshold function, and maps a vector x to an approximation
    of the label y.
    """

    repo_name = "linear-learner"
    repo_version = 1

    DEFAULT_MINI_BATCH_SIZE = 1000

    binary_classifier_model_selection_criteria = hp(
        "binary_classifier_model_selection_criteria",
        isin(
            "accuracy",
            "f1",
            "f_beta",
            "precision_at_target_recall",
            "recall_at_target_precision",
            "cross_entropy_loss",
            "loss_function",
        ),
        data_type=str,
    )
    target_recall = hp("target_recall", (gt(0), lt(1)), "A float in (0,1)",
                       float)
    target_precision = hp("target_precision", (gt(0), lt(1)),
                          "A float in (0,1)", float)
    positive_example_weight_mult = hp(
        "positive_example_weight_mult", (),
        "A float greater than 0 or 'auto' or 'balanced'", str)
    epochs = hp("epochs", gt(0), "An integer greater-than 0", int)
    predictor_type = hp(
        "predictor_type",
        isin("binary_classifier", "regressor", "multiclass_classifier"),
        'One of "binary_classifier" or "multiclass_classifier" or "regressor"',
        str,
    )
    use_bias = hp("use_bias", (), "Either True or False", bool)
    num_models = hp("num_models", gt(0), "An integer greater-than 0", int)
    num_calibration_samples = hp("num_calibration_samples", gt(0),
                                 "An integer greater-than 0", int)
    init_method = hp("init_method", isin("uniform", "normal"),
                     'One of "uniform" or "normal"', str)
    init_scale = hp("init_scale", gt(0), "A float greater-than 0", float)
    init_sigma = hp("init_sigma", gt(0), "A float greater-than 0", float)
    init_bias = hp("init_bias", (), "A number", float)
    optimizer = hp(
        "optimizer",
        isin("sgd", "adam", "rmsprop", "auto"),
        'One of "sgd", "adam", "rmsprop" or "auto',
        str,
    )
    loss = hp(
        "loss",
        isin(
            "logistic",
            "squared_loss",
            "absolute_loss",
            "hinge_loss",
            "eps_insensitive_squared_loss",
            "eps_insensitive_absolute_loss",
            "quantile_loss",
            "huber_loss",
            "softmax_loss",
            "auto",
        ),
        '"logistic", "squared_loss", "absolute_loss", "hinge_loss", "eps_insensitive_squared_loss",'
        ' "eps_insensitive_absolute_loss", "quantile_loss", "huber_loss", "softmax_loss" or "auto"',
        str,
    )
    wd = hp("wd", ge(0), "A float greater-than or equal to 0", float)
    l1 = hp("l1", ge(0), "A float greater-than or equal to 0", float)
    momentum = hp("momentum", (ge(0), lt(1)), "A float in [0,1)", float)
    learning_rate = hp("learning_rate", gt(0), "A float greater-than 0", float)
    beta_1 = hp("beta_1", (ge(0), lt(1)), "A float in [0,1)", float)
    beta_2 = hp("beta_2", (ge(0), lt(1)), "A float in [0,1)", float)
    bias_lr_mult = hp("bias_lr_mult", gt(0), "A float greater-than 0", float)
    bias_wd_mult = hp("bias_wd_mult", ge(0),
                      "A float greater-than or equal to 0", float)
    use_lr_scheduler = hp("use_lr_scheduler", (), "A boolean", bool)
    lr_scheduler_step = hp("lr_scheduler_step", gt(0),
                           "An integer greater-than 0", int)
    lr_scheduler_factor = hp("lr_scheduler_factor", (gt(0), lt(1)),
                             "A float in (0,1)", float)
    lr_scheduler_minimum_lr = hp("lr_scheduler_minimum_lr", gt(0),
                                 "A float greater-than 0", float)
    normalize_data = hp("normalize_data", (), "A boolean", bool)
    normalize_label = hp("normalize_label", (), "A boolean", bool)
    unbias_data = hp("unbias_data", (), "A boolean", bool)
    unbias_label = hp("unbias_label", (), "A boolean", bool)
    num_point_for_scaler = hp("num_point_for_scaler", gt(0),
                              "An integer greater-than 0", int)
    margin = hp("margin", ge(0), "A float greater-than or equal to 0", float)
    quantile = hp("quantile", (gt(0), lt(1)), "A float in (0,1)", float)
    loss_insensitivity = hp("loss_insensitivity", gt(0),
                            "A float greater-than 0", float)
    huber_delta = hp("huber_delta", ge(0),
                     "A float greater-than or equal to 0", float)
    early_stopping_patience = hp("early_stopping_patience", gt(0),
                                 "An integer greater-than 0", int)
    early_stopping_tolerance = hp("early_stopping_tolerance", gt(0),
                                  "A float greater-than 0", float)
    num_classes = hp("num_classes", (gt(0), le(1000000)),
                     "An integer in [1,1000000]", int)
    accuracy_top_k = hp("accuracy_top_k", (gt(0), le(1000000)),
                        "An integer in [1,1000000]", int)
    f_beta = hp("f_beta", gt(0), "A float greater-than 0", float)
    balance_multiclass_weights = hp("balance_multiclass_weights", (),
                                    "A boolean", bool)

    def __init__(self,
                 role,
                 instance_count=None,
                 instance_type=None,
                 predictor_type=None,
                 binary_classifier_model_selection_criteria=None,
                 target_recall=None,
                 target_precision=None,
                 positive_example_weight_mult=None,
                 epochs=None,
                 use_bias=None,
                 num_models=None,
                 num_calibration_samples=None,
                 init_method=None,
                 init_scale=None,
                 init_sigma=None,
                 init_bias=None,
                 optimizer=None,
                 loss=None,
                 wd=None,
                 l1=None,
                 momentum=None,
                 learning_rate=None,
                 beta_1=None,
                 beta_2=None,
                 bias_lr_mult=None,
                 bias_wd_mult=None,
                 use_lr_scheduler=None,
                 lr_scheduler_step=None,
                 lr_scheduler_factor=None,
                 lr_scheduler_minimum_lr=None,
                 normalize_data=None,
                 normalize_label=None,
                 unbias_data=None,
                 unbias_label=None,
                 num_point_for_scaler=None,
                 margin=None,
                 quantile=None,
                 loss_insensitivity=None,
                 huber_delta=None,
                 early_stopping_patience=None,
                 early_stopping_tolerance=None,
                 num_classes=None,
                 accuracy_top_k=None,
                 f_beta=None,
                 balance_multiclass_weights=None,
                 **kwargs):
        """An :class:`Estimator` for binary classification and regression.

        Amazon SageMaker Linear Learner provides a solution for both
        classification and regression problems, allowing for exploring different
        training objectives simultaneously and choosing the best solution from a
        validation set. It allows the user to explore a large number of models
        and choose the best, optimizing either continuous objectives, such as
        mean square error, cross entropy loss, and absolute error, or discrete
        objectives suited for classification, such as F1 measure,
        precision@recall, and accuracy. The implementation provides a
        significant speedup over naive hyperparameter optimization techniques,
        and added convenience, compared with solutions that address only
        continuous objectives.

        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit_ndarray`
        or
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`.
        The former allows a LinearLearner model to be fit on a 2-dimensional
        numpy array. The latter requires Amazon
        :class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to
        be stored in S3.

        To learn more about the Amazon protobuf Record class and how to
        prepare bulk data in this format, please consult AWS technical
        documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        After this Estimator is fit, model data is stored in S3. The model
        may be deployed to an Amazon SageMaker Endpoint by invoking
        :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as
        deploying an Endpoint, ``deploy`` returns a
        :class:`~sagemaker.amazon.linear_learner.LinearLearnerPredictor` object
        that can be used to make class or regression predictions, using the
        trained model.

        LinearLearner Estimators can be configured by setting
        hyperparameters. The available hyperparameters for LinearLearner are
        documented below. For further information on the AWS LinearLearner
        algorithm, please consult AWS technical documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/linear-learner.html

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon
                SageMaker training jobs and APIs that create Amazon SageMaker
                endpoints use this role to access training data and model
                artifacts. After the endpoint is created, the inference code
                might use the IAM role, if accessing AWS resources.
            instance_count (int): Number of Amazon EC2 instances to use
                for training.
            instance_type (str): Type of EC2 instance to use for training,
                for example, 'ml.c4.xlarge'.
            predictor_type (str): The type of predictor to learn. Either
                "binary_classifier" or "multiclass_classifier" or "regressor".
            binary_classifier_model_selection_criteria (str): One of 'accuracy',
                'f1', 'f_beta', 'precision_at_target_recall', 'recall_at_target_precision',
                'cross_entropy_loss', 'loss_function'
            target_recall (float): Target recall. Only applicable if
                binary_classifier_model_selection_criteria is
                precision_at_target_recall.
            target_precision (float): Target precision. Only applicable if
                binary_classifier_model_selection_criteria is
                recall_at_target_precision.
            positive_example_weight_mult (float): The importance weight of
                positive examples is multiplied by this constant. Useful for
                skewed datasets. Only applies for classification tasks.
            epochs (int): The maximum number of passes to make over the training
                data.
            use_bias (bool): Whether to include a bias field.
            num_models (int): Number of models to train in parallel. If not set,
                the number of parallel models to train will be decided by the
                algorithm itself. One model will be trained according to the
                given training parameters (regularization, optimizer, loss) and
                the rest by close-by parameters.
            num_calibration_samples (int): Number of observations to use from
                validation dataset for doing model calibration (finding the best threshold).
            init_method (str): Function to use to set the initial model weights.
                One of "uniform" or "normal"
            init_scale (float): For "uniform" init, the range of values.
            init_sigma (float): For "normal" init, the standard-deviation.
            init_bias (float): Initial weight for bias term
            optimizer (str): One of 'sgd', 'adam', 'rmsprop' or 'auto'
            loss (str): One of 'logistic', 'squared_loss', 'absolute_loss',
                'hinge_loss', 'eps_insensitive_squared_loss',
                'eps_insensitive_absolute_loss', 'quantile_loss', 'huber_loss',
                'softmax_loss' or 'auto'.
            wd (float): L2 regularization parameter i.e. the weight decay
                parameter. Use 0 for no L2 regularization.
            l1 (float): L1 regularization parameter. Use 0 for no L1
                regularization.
            momentum (float): Momentum parameter of sgd optimizer.
            learning_rate (float): The SGD learning rate
            beta_1 (float): Exponential decay rate for first moment estimates.
                Only applies for adam optimizer.
            beta_2 (float): Exponential decay rate for second moment estimates.
                Only applies for adam optimizer.
            bias_lr_mult (float): Allows different learning rate for the bias
                term. The actual learning rate for the bias is learning rate times bias_lr_mult.
            bias_wd_mult (float): Allows different regularization for the bias
                term. The actual L2 regularization weight for the bias is wd times bias_wd_mult.
                By default there is no regularization on the bias term.
            use_lr_scheduler (bool): If true, we use a scheduler for the
                learning rate.
            lr_scheduler_step (int): The number of steps between decreases of
                the learning rate. Only applies to learning rate scheduler.
            lr_scheduler_factor (float): Every lr_scheduler_step the learning
                rate will decrease by this quantity. Only applies for learning
                rate scheduler.
            lr_scheduler_minimum_lr (float): The learning rate will never
                decrease to a value lower than this. Only applies for learning rate scheduler.
            normalize_data (bool): Normalizes the features before training to
                have standard deviation of 1.0.
            normalize_label (bool): Normalizes the regression label to have a
                standard deviation of 1.0. If set for classification, it will be
                ignored.
            unbias_data (bool): If true, features are modified to have mean 0.0.
            unbias_label (bool): If true, labels are modified to have mean 0.0.
            num_point_for_scaler (int): The number of data points to use for
                calculating the normalizing and unbiasing terms.
            margin (float): the margin for hinge_loss.
            quantile (float): Quantile for quantile loss. For quantile q, the
                model will attempt to produce predictions such that true_label < prediction with
                probability q.
            loss_insensitivity (float): Parameter for epsilon insensitive loss
                type. During training and metric evaluation, any error smaller than this is
                considered to be zero.
            huber_delta (float): Parameter for Huber loss. During training and
                metric evaluation, compute L2 loss for errors smaller than delta and L1 loss for
                errors larger than delta.
            early_stopping_patience (int): The number of epochs to wait before
                ending training if no improvement is made. The improvement is
                training loss if validation data is not provided, or else it is
                the validation loss or the binary classification model selection
                criteria like accuracy, f1-score etc. To disable early stopping,
                set early_stopping_patience to a value larger than epochs.
            early_stopping_tolerance (float): Relative tolerance to measure an
                improvement in loss. If the ratio of the improvement in loss
                divided by the previous best loss is smaller than this value,
                early stopping will consider the improvement to be zero.
            num_classes (int): The number of classes for the response variable.
                Required when predictor_type is multiclass_classifier and ignored otherwise. The
                classes are assumed to be labeled 0, ..., num_classes - 1.
            accuracy_top_k (int): The value of k when computing the Top K
                Accuracy metric for multiclass classification. An example is
                scored as correct if the model assigns one of the top k scores
                to the true label.
            f_beta (float): The value of beta to use when calculating F score
                metrics for binary or multiclass classification. Also used if
                binary_classifier_model_selection_criteria is f_beta.
            balance_multiclass_weights (bool): Whether to use class weights
                which give each class equal importance in the loss function. Only used when
                predictor_type is multiclass_classifier.
            **kwargs: base class keyword argument values.

        .. tip::

            You can find additional parameters for initializing this class at
            :class:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase` and
            :class:`~sagemaker.estimator.EstimatorBase`.
        """
        super(LinearLearner, self).__init__(role, instance_count,
                                            instance_type, **kwargs)
        self.predictor_type = predictor_type
        self.binary_classifier_model_selection_criteria = binary_classifier_model_selection_criteria
        self.target_recall = target_recall
        self.target_precision = target_precision
        self.positive_example_weight_mult = positive_example_weight_mult
        self.epochs = epochs
        self.use_bias = use_bias
        self.num_models = num_models
        self.num_calibration_samples = num_calibration_samples
        self.init_method = init_method
        self.init_scale = init_scale
        self.init_sigma = init_sigma
        self.init_bias = init_bias
        self.optimizer = optimizer
        self.loss = loss
        self.wd = wd
        self.l1 = l1
        self.momentum = momentum
        self.learning_rate = learning_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.bias_lr_mult = bias_lr_mult
        self.bias_wd_mult = bias_wd_mult
        self.use_lr_scheduler = use_lr_scheduler
        self.lr_scheduler_step = lr_scheduler_step
        self.lr_scheduler_factor = lr_scheduler_factor
        self.lr_scheduler_minimum_lr = lr_scheduler_minimum_lr
        self.normalize_data = normalize_data
        self.normalize_label = normalize_label
        self.unbias_data = unbias_data
        self.unbias_label = unbias_label
        self.num_point_for_scaler = num_point_for_scaler
        self.margin = margin
        self.quantile = quantile
        self.loss_insensitivity = loss_insensitivity
        self.huber_delta = huber_delta
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_tolerance = early_stopping_tolerance
        self.num_classes = num_classes
        self.accuracy_top_k = accuracy_top_k
        self.f_beta = f_beta
        self.balance_multiclass_weights = balance_multiclass_weights

        if self.predictor_type == "multiclass_classifier" and (
                num_classes is None or int(num_classes) < 3):
            raise ValueError(
                "For predictor_type 'multiclass_classifier', 'num_classes' should be set to a "
                "value greater than 2.")

    def create_model(self, vpc_config_override=VPC_CONFIG_DEFAULT, **kwargs):
        """Return a :class:`~sagemaker.amazon.LinearLearnerModel`.

        It references the latest s3 model data produced by this Estimator.

        Args:
            vpc_config_override (dict[str, list[str]]): Optional override for VpcConfig set on
                the model. Default: use subnets and security groups from this Estimator.
                * 'Subnets' (list[str]): List of subnet ids.
                * 'SecurityGroupIds' (list[str]): List of security group ids.
            **kwargs: Additional kwargs passed to the LinearLearnerModel constructor.
        """
        return LinearLearnerModel(
            self.model_data,
            self.role,
            self.sagemaker_session,
            vpc_config=self.get_vpc_config(vpc_config_override),
            **kwargs)

    def _prepare_for_training(self,
                              records,
                              mini_batch_size=None,
                              job_name=None):
        """Placeholder docstring"""
        num_records = None
        if isinstance(records, list):
            for record in records:
                if record.channel == "train":
                    num_records = record.num_records
                    break
            if num_records is None:
                raise ValueError("Must provide train channel.")
        else:
            num_records = records.num_records

        # mini_batch_size can't be greater than number of records or training job fails
        default_mini_batch_size = min(
            self.DEFAULT_MINI_BATCH_SIZE,
            max(1, int(num_records / self.instance_count)))
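        # e.g. 10,000 train records on 2 instances -> min(1000, max(1, 5000)) = 1000,
        # while 500 records on 1 instance -> min(1000, max(1, 500)) = 500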
        mini_batch_size = mini_batch_size or default_mini_batch_size
        super(LinearLearner,
              self)._prepare_for_training(records,
                                          mini_batch_size=mini_batch_size,
                                          job_name=job_name)
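
A usage sketch for LinearLearner (editor's addition), showing a binary classifier tuned for precision at a target recall. The role ARN and the arrays are hypothetical; `record_set` is the base class upload helper.

import numpy as np

role = 'arn:aws:iam::123456789012:role/SageMakerRole'  # hypothetical role ARN
ll = LinearLearner(role=role,
                   instance_count=1,
                   instance_type='ml.c4.xlarge',
                   predictor_type='binary_classifier',
                   binary_classifier_model_selection_criteria='precision_at_target_recall',
                   target_recall=0.9)
train_x = np.random.rand(2000, 10).astype('float32')
train_y = (np.random.rand(2000) > 0.5).astype('float32')  # binary 0/1 labels
ll.fit(ll.record_set(train_x, labels=train_y))
# Note: predictor_type='multiclass_classifier' without num_classes >= 3 raises
# ValueError in __init__, before any training job is started.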
Example #9
class KMeans(AmazonAlgorithmEstimatorBase):

    repo = 'kmeans:1'

    k = hp('k', gt(1), 'An integer greater-than 1', int)
    init_method = hp('init_method', isin('random', 'kmeans++'),
                     'One of "random", "kmeans++"', str)
    max_iterations = hp('local_lloyd_max_iterations', gt(0),
                        'An integer greater-than 0', int)
    tol = hp('local_lloyd_tol', gt(0), 'An integer greater-than 0', int)
    num_trials = hp('local_lloyd_num_trials', gt(0),
                    'An integer greater-than 0', int)
    local_init_method = hp('local_lloyd_init_method',
                           isin('random', 'kmeans++'),
                           'One of "random", "kmeans++"', str)
    half_life_time_size = hp('half_life_time_size', ge(0),
                             'An integer greater-than-or-equal-to 0', int)
    epochs = hp('epochs', gt(0), 'An integer greater-than 0', int)
    center_factor = hp('extra_center_factor', gt(0),
                       'An integer greater-than 0', int)

    def __init__(self,
                 role,
                 train_instance_count,
                 train_instance_type,
                 k,
                 init_method=None,
                 max_iterations=None,
                 tol=None,
                 num_trials=None,
                 local_init_method=None,
                 half_life_time_size=None,
                 epochs=None,
                 center_factor=None,
                 **kwargs):
        """
        A k-means clustering :class:`~sagemaker.amazon.AmazonAlgorithmEstimatorBase`. Finds k clusters of data in an
        unlabeled dataset.

        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit_ndarray`
        or :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. The former allows a KMeans model
        to be fit on a 2-dimensional numpy array. The latter requires Amazon
        :class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3.

        To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please
        consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
        Endpoint by invoking :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as deploying an Endpoint,
        ``deploy`` returns a :class:`~sagemaker.amazon.kmeans.KMeansPredictor` object that can be used to determine
        k-means cluster assignments, using the trained k-means model hosted in the SageMaker Endpoint.

        KMeans Estimators can be configured by setting hyperparameters. The available hyperparameters for KMeans
        are documented below. For further information on the AWS KMeans algorithm, please consult AWS technical
        documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/k-means.html

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
                APIs that create Amazon SageMaker endpoints use this role to access
                training data and model artifacts. After the endpoint is created,
                the inference code might use the IAM role, if accessing AWS resources.
                For more information, see <link>???.
            train_instance_count (int): Number of Amazon EC2 instances to use for training.
            train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
            k (int): The number of clusters to produce.
            init_method (str): How to initialize cluster locations. One of 'random' or 'kmeans++'.
            max_iterations (int): Maximum iterations for Lloyd's EM procedure in the local kmeans used in the
                finalize stage.
            tol (int): Tolerance for change in ssd for early stopping in local kmeans.
            num_trials (int): Local version is run multiple times and the one with the best loss is chosen. This
                              determines how many times.
            local_init_method (str): Initialization method for local version. One of 'random', 'kmeans++'
            half_life_time_size (int): The points can have a decayed weight. When a point is observed its weight,
                with regard to the computation of the cluster mean is 1. This weight will decay exponentially as we
                observe more points. The exponent coefficient is chosen such that after observing
                ``half_life_time_size`` points after the mentioned point, its weight will become 1/2. If set to 0,
                there will be no decay.
            epochs (int): Number of passes done over the training data.
            center_factor (int): The algorithm will create ``num_clusters * extra_center_factor`` centers as it
                runs and reduce the number of centers to ``k`` when finalizing.
            **kwargs: base class keyword argument values.
        """
        super(KMeans, self).__init__(role, train_instance_count,
                                     train_instance_type, **kwargs)
        self.k = k
        self.init_method = init_method
        self.max_iterations = max_iterations
        self.tol = tol
        self.num_trials = num_trials
        self.local_init_method = local_init_method
        self.half_life_time_size = half_life_time_size
        self.epochs = epochs
        self.center_factor = center_factor

    def create_model(self):
        """Return a :class:`~sagemaker.amazon.kmeans.KMeansModel` referencing the latest
        s3 model data produced by this Estimator."""
        return KMeansModel(self.model_data, self.role, self.sagemaker_session)

    def fit(self, records, mini_batch_size=5000, **kwargs):
        super(KMeans, self).fit(records, mini_batch_size, **kwargs)

    def hyperparameters(self):
        """Return the SageMaker hyperparameters for training this KMeans Estimator"""
        # KMeans requires this hyperparameter in order to fit on Record objects.
        # Named hp_dict to avoid shadowing the module-level `hp` helper.
        hp_dict = dict(force_dense='True')
        hp_dict.update(super(KMeans, self).hyperparameters())
        return hp_dict
Example #10
class IPInsights(AmazonAlgorithmEstimatorBase):
    """Placeholder docstring"""

    repo_name = "ipinsights"
    repo_version = 1
    MINI_BATCH_SIZE = 10000

    num_entity_vectors = hp("num_entity_vectors", (ge(1), le(250000000)),
                            "An integer in [1, 250000000]", int)
    vector_dim = hp("vector_dim", (ge(4), le(4096)), "An integer in [4, 4096]",
                    int)

    batch_metrics_publish_interval = hp("batch_metrics_publish_interval",
                                        (ge(1)), "An integer greater than 0",
                                        int)
    epochs = hp("epochs", (ge(1)), "An integer greater than 0", int)
    learning_rate = hp("learning_rate", (ge(1e-6), le(10.0)),
                       "A float in [1e-6, 10.0]", float)
    num_ip_encoder_layers = hp("num_ip_encoder_layers", (ge(0), le(100)),
                               "An integer in [0, 100]", int)
    random_negative_sampling_rate = hp("random_negative_sampling_rate",
                                       (ge(0), le(500)),
                                       "An integer in [0, 500]", int)
    shuffled_negative_sampling_rate = hp("shuffled_negative_sampling_rate",
                                         (ge(0), le(500)),
                                         "An integer in [0, 500]", int)
    weight_decay = hp("weight_decay", (ge(0.0), le(10.0)),
                      "A float in [0.0, 10.0]", float)

    def __init__(self,
                 role,
                 train_instance_count,
                 train_instance_type,
                 num_entity_vectors,
                 vector_dim,
                 batch_metrics_publish_interval=None,
                 epochs=None,
                 learning_rate=None,
                 num_ip_encoder_layers=None,
                 random_negative_sampling_rate=None,
                 shuffled_negative_sampling_rate=None,
                 weight_decay=None,
                 **kwargs):
        """This estimator is for IP Insights, an unsupervised algorithm that
        learns usage patterns of IP addresses.

        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`.
        It requires CSV data to be stored in S3.

        After this Estimator is fit, model data is stored in S3. The model
        may be deployed to an Amazon SageMaker Endpoint by invoking
        :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as
        deploying an Endpoint, deploy returns a
        :class:`~sagemaker.amazon.IPInsightPredictor` object that can be used
        for inference calls using the trained model hosted in the SageMaker
        Endpoint.

        IPInsights Estimators can be configured by setting hyperparameters.
        The available hyperparameters are documented below.

        For further information on the AWS IPInsights algorithm, please
        consult AWS technical documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/ip-insights-hyperparameters.html

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon
                SageMaker training jobs and APIs that create Amazon SageMaker
                endpoints use this role to access training data and model
                artifacts. After the endpoint is created, the inference code
                might use the IAM role, if accessing AWS resources.
            train_instance_count (int): Number of Amazon EC2 instances to use
                for training.
            train_instance_type (str): Type of EC2 instance to use for training,
                for example, 'ml.m5.xlarge'.
            num_entity_vectors (int): Required. The number of embeddings to
                train for entities accessing online resources. We recommend 2x
                the total number of unique entity IDs.
            vector_dim (int): Required. The size of the embedding vectors for
                both entity and IP addresses.
            batch_metrics_publish_interval (int): Optional. The period at which
                to publish metrics (batches).
            epochs (int): Optional. Maximum number of passes over the training
                data.
            learning_rate (float): Optional. Learning rate for the optimizer.
            num_ip_encoder_layers (int): Optional. The number of fully-connected
                layers to encode IP address embedding.
            random_negative_sampling_rate (int): Optional. The ratio of random
                negative samples to draw during training. Random negative
                samples are randomly drawn IPv4 addresses.
            shuffled_negative_sampling_rate (int): Optional. The ratio of
                shuffled negative samples to draw during training. Shuffled
                negative samples are IP addresses picked from within a batch.
            weight_decay (float): Optional. Weight decay coefficient. Adds L2
                regularization.
            **kwargs: base class keyword argument values.

        .. tip::

            You can find additional parameters for initializing this class at
            :class:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase` and
            :class:`~sagemaker.estimator.EstimatorBase`.
        """
        super(IPInsights, self).__init__(role, train_instance_count,
                                         train_instance_type, **kwargs)
        self.num_entity_vectors = num_entity_vectors
        self.vector_dim = vector_dim
        self.batch_metrics_publish_interval = batch_metrics_publish_interval
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.num_ip_encoder_layers = num_ip_encoder_layers
        self.random_negative_sampling_rate = random_negative_sampling_rate
        self.shuffled_negative_sampling_rate = shuffled_negative_sampling_rate
        self.weight_decay = weight_decay

    def create_model(self, vpc_config_override=VPC_CONFIG_DEFAULT, **kwargs):
        """Create a model for the latest s3 model produced by this estimator.

        Args:
            vpc_config_override (dict[str, list[str]]): Optional override for VpcConfig set on
                the model.
                Default: use subnets and security groups from this Estimator.
                * 'Subnets' (list[str]): List of subnet ids.
                * 'SecurityGroupIds' (list[str]): List of security group ids.
            **kwargs: Additional kwargs passed to the IPInsightsModel constructor.
        Returns:
            :class:`~sagemaker.amazon.IPInsightsModel`: references the latest s3 model
            data produced by this estimator.
        """
        return IPInsightsModel(
            self.model_data,
            self.role,
            sagemaker_session=self.sagemaker_session,
            vpc_config=self.get_vpc_config(vpc_config_override),
            **kwargs)

    def _prepare_for_training(self,
                              records,
                              mini_batch_size=None,
                              job_name=None):
        """
        Args:
            records:
            mini_batch_size:
            job_name:
        """
        if mini_batch_size is not None and (mini_batch_size < 1
                                            or mini_batch_size > 500000):
            raise ValueError("mini_batch_size must be in [1, 500000]")
        super(IPInsights,
              self)._prepare_for_training(records,
                                          mini_batch_size=mini_batch_size,
                                          job_name=job_name)
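
A usage sketch for IPInsights (editor's addition). Only construction and the mini-batch guard are shown, since training additionally needs CSV data staged in S3; the role ARN and hyperparameter values are hypothetical.

role = 'arn:aws:iam::123456789012:role/SageMakerRole'  # hypothetical role ARN
ip_insights = IPInsights(role=role,
                         train_instance_count=1,
                         train_instance_type='ml.m5.xlarge',
                         num_entity_vectors=20000,  # ~2x the unique entity IDs
                         vector_dim=128,
                         epochs=5)
# Batch sizes outside [1, 500000] are rejected before a training job is created,
# e.g. ip_insights.fit(records, mini_batch_size=600000) raises ValueError.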
Example #11
class PCA(AmazonAlgorithmEstimatorBase):

    repo = 'pca:1'

    num_components = hp(
        name='num_components',
        validate=lambda x: x > 0 and isinstance(x, int),
        validation_message='Value must be an integer greater than zero')
    algorithm_mode = hp(
        name='algorithm_mode',
        validate=lambda x: x in ['regular', 'stable', 'randomized'],
        validation_message=
        'Value must be one of "regular", "stable", "randomized"')
    subtract_mean = hp(name='subtract_mean',
                       validate=lambda x: isinstance(x, bool),
                       validation_message='Value must be a boolean')
    extra_components = hp(
        name='extra_components',
        validate=lambda x: x >= 0 and isinstance(x, int),
        validation_message="Value must be an integer greater than or equal to 0"
    )

    def __init__(self,
                 role,
                 train_instance_count,
                 train_instance_type,
                 num_components,
                 algorithm_mode=None,
                 subtract_mean=None,
                 extra_components=None,
                 **kwargs):
        """A Principal Components Analysis (PCA) :class:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase`.

        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit_ndarray`
        or :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. The former allows a PCA model
        to be fit on a 2-dimensional numpy array. The latter requires Amazon
        :class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3.

        To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please
        consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
        Endpoint by invoking :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as deploying an Endpoint,
        deploy returns a :class:`~sagemaker.amazon.pca.PCAPredictor` object that can be used to project
        input vectors to the learned lower-dimensional representation, using the trained PCA model hosted in the
        SageMaker Endpoint.

        PCA Estimators can be configured by setting hyperparameters. The available hyperparameters for PCA
        are documented below. For further information on the AWS PCA algorithm, please consult AWS technical
        documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/pca.html

        This Estimator uses Amazon SageMaker PCA to perform training and host deployed models. To
        learn more about Amazon SageMaker PCA, please read:
        https://docs.aws.amazon.com/sagemaker/latest/dg/how-pca-works.html

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
                APIs that create Amazon SageMaker endpoints use this role to access
                training data and model artifacts. After the endpoint is created,
                the inference code might use the IAM role, if accessing AWS resources.
            train_instance_count (int): Number of Amazon EC2 instances to use for training.
            train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
            num_components (int): The number of principal components. Must be greater than zero.
            algorithm_mode (str): Mode for computing the principal components. One of 'regular', 'stable' or
                'randomized'.
            subtract_mean (bool): Whether the data should be unbiased both during train and at inference.
            extra_components (int): As the value grows larger, the solution becomes more accurate but the
                runtime and memory consumption increase linearly. If this value is unset, then a default value equal
                to the maximum of 10 and num_components will be used. Valid for randomized mode only.
            **kwargs: base class keyword argument values.
        """
        super(PCA, self).__init__(role, train_instance_count,
                                  train_instance_type, **kwargs)
        self.num_components = num_components
        self.algorithm_mode = algorithm_mode
        self.subtract_mean = subtract_mean
        self.extra_components = extra_components

    def create_model(self):
        """Return a :class:`~sagemaker.amazon.pca.PCAModel` referencing the latest
        s3 model data produced by this Estimator."""

        return PCAModel(self.model_data,
                        self.role,
                        sagemaker_session=self.sagemaker_session)
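
A usage sketch for this older-style PCA estimator (editor's addition), assuming the base class's `record_set`, `fit`, and `deploy` helpers from the same SDK vintage; the role ARN and data are hypothetical.

import numpy as np

role = 'arn:aws:iam::123456789012:role/SageMakerRole'  # hypothetical role ARN
pca = PCA(role=role,
          train_instance_count=1,
          train_instance_type='ml.c4.xlarge',
          num_components=2,
          algorithm_mode='randomized',
          subtract_mean=True)
train_data = np.random.rand(1000, 50).astype('float32')
pca.fit(pca.record_set(train_data))
predictor = pca.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
projection = predictor.predict(train_data[:3])  # 2-dimensional projections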
Example #12
class AmazonS3AlgorithmEstimatorBase(EstimatorBase):
    """Base class for Amazon first-party Estimator implementations. This class isn't
    intended to be instantiated directly. This is difference from the base class
    because this class handles S3 data"""

    mini_batch_size = hp('mini_batch_size', validation.gt(0))

    def __init__(self, role, train_instance_count, train_instance_type,
                 algorithm, **kwargs):
        """Initialize an AmazonAlgorithmEstimatorBase.

        Args:
            algorithm (str): Use one of the supported algorithms
                """
        super(AmazonS3AlgorithmEstimatorBase,
              self).__init__(role, train_instance_count, train_instance_type,
                             **kwargs)
        self.algorithm = algorithm

    def train_image(self):
        return registry(self.sagemaker_session.boto_region_name,
                        algorithm=self.algorithm) + "/" + type(self).repo

    def hyperparameters(self):
        return hp.serialize_all(self)

    def fit(self,
            s3set,
            mini_batch_size=None,
            distribution='ShardedByS3Key',
            **kwargs):
        """Fit this Estimator on serialized Record objects, stored in S3.

        ``s3set`` should be a list of instances of :class:`~S3Set`. This defines a collection of
        s3 data files to train this ``Estimator`` on.

        More information on the Amazon Record format is available at:
        https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        See :meth:`~AmazonS3AlgorithmEstimatorBase.s3_record_set` to construct a ``RecordSet`` object
        from :class:`~numpy.ndarray` arrays.

        Args:
            s3set (list): A list of :class:`~S3Set` items. The records used to train
                    this ``Estimator`` depend on the algorithm and the type of input data.
            distribution (str): The S3 distribution of data.
            mini_batch_size (int or None): The size of each mini-batch to use when training. If None, a
                    default value will be used.
        """
        default_mini_batch_size = 32
        self.mini_batch_size = mini_batch_size or default_mini_batch_size
        data = {}
        for item in s3set:
            data[item.channel] = s3_input(item.s3_location,
                                          distribution=item.distribution,
                                          content_type=item.content_type,
                                          s3_data_type=item.s3_data_type)
        super(AmazonS3AlgorithmEstimatorBase, self).fit(data, **kwargs)

    def s3_record_set(self, s3_loc, content_type, channel="train"):
        """Build a  :class:`~RecordSet` from a S3 location with data in it.

        Args:
            s3_loc (str): A s3 bucket where data is located
            channel (str): The SageMaker TrainingJob channel this RecordSet should be assigned to.
            content_type (str): Content type of the data.
        Returns:
            RecordSet: A RecordSet referencing the encoded, uploading training and label data.
        """
        return S3Set(s3_loc, content_type=content_type, channel=channel)
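
A sketch of how a concrete algorithm might build on this S3-based base class (editor's addition). The subclass name, repo tag, role ARN, and bucket are illustrative, and `S3Set` is assumed to default `distribution` and `s3_data_type`.

class FactorizationMachines(AmazonS3AlgorithmEstimatorBase):
    repo = 'factorization-machines:1'  # illustrative repo tag

role = 'arn:aws:iam::123456789012:role/SageMakerRole'  # hypothetical role ARN
estimator = FactorizationMachines(role=role,
                                  train_instance_count=1,
                                  train_instance_type='ml.c4.xlarge',
                                  algorithm='factorization-machines')
# s3_record_set wraps an existing S3 prefix as a named training channel.
train = estimator.s3_record_set('s3://my-bucket/train/', content_type='text/csv')
estimator.fit([train], mini_batch_size=64)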
Example #13
class NTM(AmazonAlgorithmEstimatorBase):

    repo_name = 'ntm'
    repo_version = 1

    num_topics = hp('num_topics', (ge(2), le(1000)), 'An integer in [2, 1000]',
                    int)
    encoder_layers = hp(name='encoder_layers',
                        validation_message='A comma separated list of '
                        'positive integers',
                        data_type=list)
    epochs = hp('epochs', (ge(1), le(100)), 'An integer in [1, 100]', int)
    encoder_layers_activation = hp('encoder_layers_activation',
                                   isin('sigmoid', 'tanh', 'relu'),
                                   'One of "sigmoid", "tanh" or "relu"', str)
    optimizer = hp(
        'optimizer', isin('adagrad', 'adam', 'rmsprop', 'sgd', 'adadelta'),
        'One of "adagrad", "adam", "rmsprop", "sgd" and "adadelta"', str)
    tolerance = hp('tolerance', (ge(1e-6), le(0.1)), 'A float in [1e-6, 0.1]',
                   float)
    num_patience_epochs = hp('num_patience_epochs', (ge(1), le(10)),
                             'An integer in [1, 10]', int)
    batch_norm = hp(name='batch_norm',
                    validation_message='Value must be a boolean',
                    data_type=bool)
    rescale_gradient = hp('rescale_gradient', (ge(1e-3), le(1.0)),
                          'A float in [1e-3, 1.0]', float)
    clip_gradient = hp('clip_gradient', ge(1e-3),
                       'A float greater than or equal to 1e-3', float)
    weight_decay = hp('weight_decay', (ge(0.0), le(1.0)),
                      'A float in [0.0, 1.0]', float)
    learning_rate = hp('learning_rate', (ge(1e-6), le(1.0)),
                       'A float in [1e-6, 1.0]', float)

    def __init__(self,
                 role,
                 train_instance_count,
                 train_instance_type,
                 num_topics,
                 encoder_layers=None,
                 epochs=None,
                 encoder_layers_activation=None,
                 optimizer=None,
                 tolerance=None,
                 num_patience_epochs=None,
                 batch_norm=None,
                 rescale_gradient=None,
                 clip_gradient=None,
                 weight_decay=None,
                 learning_rate=None,
                 **kwargs):
        """Neural Topic Model (NTM) is :class:`Estimator` used for unsupervised learning.

        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. It requires Amazon
        :class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3.
        There is a utility :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set` that
        can be used to upload data to S3 and create a :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be
        passed to the `fit` call.

        To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please
        consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
        Endpoint by invoking :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as deploying an Endpoint,
        deploy returns a :class:`~sagemaker.amazon.ntm.NTMPredictor` object that can be used
        for inference calls using the trained model hosted in the SageMaker Endpoint.

        NTM Estimators can be configured by setting hyperparameters. The available hyperparameters for
        NTM are documented below.

        For further information on the AWS NTM algorithm,
        please consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/ntm.html

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
                APIs that create Amazon SageMaker endpoints use this role to access
                training data and model artifacts. After the endpoint is created,
                the inference code might use the IAM role, if accessing AWS resources.
            train_instance_count (int): Number of Amazon EC2 instances to use for training.
            train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
            num_topics (int): Required. The number of topics for NTM to find within the data.
            encoder_layers (list): Optional. Represents number of layers in the encoder and the output size of
                each layer.
            epochs (int): Optional. Maximum number of passes over the training data.
            encoder_layers_activation (str): Optional. Activation function to use in the encoder layers.
            optimizer (str): Optional. Optimizer to use for training.
            tolerance (float): Optional. Maximum relative change in the loss function within the last
                num_patience_epochs number of epochs below which early stopping is triggered.
            num_patience_epochs (int): Optional. Number of successive epochs over which early stopping criterion
                is evaluated.
            batch_norm (bool): Optional. Whether to use batch normalization during training.
            rescale_gradient (float): Optional. Rescale factor for gradient.
            clip_gradient (float): Optional. Maximum magnitude for each gradient component.
            weight_decay (float): Optional. Weight decay coefficient. Adds L2 regularization.
            learning_rate (float): Optional. Learning rate for the optimizer.
            **kwargs: base class keyword argument values.
        """

        super(NTM, self).__init__(role, train_instance_count,
                                  train_instance_type, **kwargs)
        self.num_topics = num_topics
        self.encoder_layers = encoder_layers
        self.epochs = epochs
        self.encoder_layers_activation = encoder_layers_activation
        self.optimizer = optimizer
        self.tolerance = tolerance
        self.num_patience_epochs = num_patience_epochs
        self.batch_norm = batch_norm
        self.rescale_gradient = rescale_gradient
        self.clip_gradient = clip_gradient
        self.weight_decay = weight_decay
        self.learning_rate = learning_rate

    def create_model(self):
        """Return a :class:`~sagemaker.amazon.NTMModel` referencing the latest
        s3 model data produced by this Estimator."""

        return NTMModel(self.model_data,
                        self.role,
                        sagemaker_session=self.sagemaker_session)

    def fit(self, records, mini_batch_size=None, **kwargs):
        if mini_batch_size is not None and (mini_batch_size < 1
                                            or mini_batch_size > 10000):
            raise ValueError("mini_batch_size must be in [1, 10000]")
        super(NTM, self).fit(records, mini_batch_size, **kwargs)
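A minimal usage sketch (not part of the original listing) of the train-and-deploy flow the docstring describes; the IAM role name, instance types, and synthetic data below are hypothetical, and a default sagemaker_session is assumed:

import numpy as np

# Hypothetical role and synthetic bag-of-words matrix (documents x vocabulary).
ntm = NTM(role='SageMakerRole',
          train_instance_count=1,
          train_instance_type='ml.c4.xlarge',
          num_topics=10)
train = np.random.rand(1000, 5000).astype('float32')

# record_set uploads the array to S3 as protobuf Records and returns a RecordSet.
records = ntm.record_set(train)

ntm.fit(records, mini_batch_size=256)   # fit validates the value lies in [1, 10000]
predictor = ntm.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')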
Example #14
class LDA(AmazonAlgorithmEstimatorBase):

    repo_name = 'lda'
    repo_version = 1

    num_topics = hp('num_topics', gt(0), 'An integer greater than zero', int)
    alpha0 = hp('alpha0', gt(0), 'A positive float', float)
    max_restarts = hp('max_restarts', gt(0), 'An integer greater than zero',
                      int)
    max_iterations = hp('max_iterations', gt(0),
                        'An integer greater than zero', int)
    tol = hp('tol', gt(0), 'A positive float', float)

    def __init__(self,
                 role,
                 train_instance_type,
                 num_topics,
                 alpha0=None,
                 max_restarts=None,
                 max_iterations=None,
                 tol=None,
                 **kwargs):
        """Latent Dirichlet Allocation (LDA) is :class:`Estimator` used for unsupervised learning.

        Amazon SageMaker Latent Dirichlet Allocation is an unsupervised learning algorithm that attempts to describe
        a set of observations as a mixture of distinct categories. LDA is most commonly used to discover
        a user-specified number of topics shared by documents within a text corpus.
        Here each observation is a document, the features are the presence (or occurrence count) of each word, and
        the categories are the topics.

        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. It requires Amazon
        :class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3.
        There is a utility :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set` that
        can be used to upload data to S3 and create a :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be
        passed to the `fit` call.

        To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please
        consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
        Endpoint by invoking :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as deploying an Endpoint,
        deploy returns a :class:`~sagemaker.amazon.lda.LDAPredictor` object that can be used
        for inference calls using the trained model hosted in the SageMaker Endpoint.

        LDA Estimators can be configured by setting hyperparameters. The available hyperparameters for
        LDA are documented below.

        For further information on the AWS LDA algorithm,
        please consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/lda.html

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
                APIs that create Amazon SageMaker endpoints use this role to access
                training data and model artifacts. After the endpoint is created,
                the inference code might use the IAM role, if accessing AWS resources.
            train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
            num_topics (int): The number of topics for LDA to find within the data.
            alpha0 (float): Optional. Initial guess for the concentration parameter.
            max_restarts (int): Optional. The number of restarts to perform during the Alternating Least Squares (ALS)
                spectral decomposition phase of the algorithm.
            max_iterations (int): Optional. The maximum number of iterations to perform during the
                ALS phase of the algorithm.
            tol (float): Optional. Target error tolerance for the ALS phase of the algorithm.
            **kwargs: base class keyword argument values.
        """

        # this algorithm only supports single instance training
        super(LDA, self).__init__(role, 1, train_instance_type, **kwargs)
        self.num_topics = num_topics
        self.alpha0 = alpha0
        self.max_restarts = max_restarts
        self.max_iterations = max_iterations
        self.tol = tol

    def create_model(self):
        """Return a :class:`~sagemaker.amazon.LDAModel` referencing the latest
        s3 model data produced by this Estimator."""

        return LDAModel(self.model_data,
                        self.role,
                        sagemaker_session=self.sagemaker_session)

    def fit(self, records, mini_batch_size, **kwargs):
        # mini_batch_size is required, prevent explicit calls with None
        if mini_batch_size is None:
            raise ValueError("mini_batch_size must be set")
        super(LDA, self).fit(records, mini_batch_size, **kwargs)
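A usage sketch under the same hypothetical setup as above; note that the constructor pins training to a single instance and that `fit` insists on an explicit mini_batch_size:

import numpy as np

# Hypothetical synthetic corpus: rows are documents, columns are word counts.
lda = LDA(role='SageMakerRole',
          train_instance_type='ml.c4.xlarge',
          num_topics=5,
          alpha0=1.0)
corpus = np.random.randint(0, 5, size=(1000, 2000)).astype('float32')
records = lda.record_set(corpus)

lda.fit(records, mini_batch_size=100)   # passing None raises ValueError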
Example #15
class PCA(AmazonAlgorithmEstimatorBase):
    """An unsupervised machine learning algorithm to reduce feature dimensionality.

    As a result, the number of features within a dataset is reduced, but the dataset still
    retains as much information as possible.
    """

    repo_name = "pca"
    repo_version = 1

    DEFAULT_MINI_BATCH_SIZE = 500

    num_components = hp("num_components", gt(0),
                        "Value must be an integer greater than zero", int)
    algorithm_mode = hp(
        "algorithm_mode",
        isin("regular", "randomized"),
        'Value must be one of "regular" or "randomized"',
        str,
    )
    subtract_mean = hp(name="subtract_mean",
                       validation_message="Value must be a boolean",
                       data_type=bool)
    extra_components = hp(
        name="extra_components",
        validation_message="Value must be an integer greater than or equal to 0, or -1.",
        data_type=int,
    )

    def __init__(self,
                 role,
                 instance_count=None,
                 instance_type=None,
                 num_components=None,
                 algorithm_mode=None,
                 subtract_mean=None,
                 extra_components=None,
                 **kwargs):
        """A Principal Components Analysis (PCA)
        :class:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase`.

        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit_ndarray`
        or
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`.
        The former allows a PCA model to be fit on a 2-dimensional numpy array.
        The latter requires Amazon :class:`~sagemaker.amazon.record_pb2.Record`
        protobuf serialized data to be stored in S3.

        To learn more about the Amazon protobuf Record class and how to
        prepare bulk data in this format, please consult AWS technical
        documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        After this Estimator is fit, model data is stored in S3. The model
        may be deployed to an Amazon SageMaker Endpoint by invoking
        :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as
        deploying an Endpoint, deploy returns a
        :class:`~sagemaker.amazon.pca.PCAPredictor` object that can be used to
        project input vectors to the learned lower-dimensional representation,
        using the trained PCA model hosted in the SageMaker Endpoint.

        PCA Estimators can be configured by setting hyperparameters. The
        available hyperparameters for PCA are documented below. For further
        information on the AWS PCA algorithm, please consult AWS technical
        documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/pca.html

        This Estimator uses Amazon SageMaker PCA to perform training and host
        deployed models. To learn more about Amazon SageMaker PCA, please read:
        https://docs.aws.amazon.com/sagemaker/latest/dg/how-pca-works.html

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon
                SageMaker training jobs and APIs that create Amazon SageMaker
                endpoints use this role to access training data and model
                artifacts. After the endpoint is created, the inference code
                might use the IAM role, if accessing AWS resources.
            instance_count (int): Number of Amazon EC2 instances to use
                for training.
            instance_type (str): Type of EC2 instance to use for training,
                for example, 'ml.c4.xlarge'.
            num_components (int): The number of principal components. Must be
                greater than zero.
            algorithm_mode (str): Mode for computing the principal components.
                One of 'regular' or 'randomized'.
            subtract_mean (bool): Whether the data should be unbiased both
                during training and at inference.
            extra_components (int): As the value grows larger, the solution
                becomes more accurate but the runtime and memory consumption
                increase linearly. If this value is unset or set to -1, then a
                default value equal to the maximum of 10 and num_components will
                be used. Valid for randomized mode only.
            **kwargs: base class keyword argument values.

        .. tip::

            You can find additional parameters for initializing this class at
            :class:`~sagemaker.estimator.amazon_estimator.AmazonAlgorithmEstimatorBase` and
            :class:`~sagemaker.estimator.EstimatorBase`.
        """
        super(PCA, self).__init__(role, instance_count, instance_type,
                                  **kwargs)
        self.num_components = num_components
        self.algorithm_mode = algorithm_mode
        self.subtract_mean = subtract_mean
        self.extra_components = extra_components

    def create_model(self, vpc_config_override=VPC_CONFIG_DEFAULT, **kwargs):
        """Return a :class:`~sagemaker.amazon.pca.PCAModel` referencing the
        latest s3 model data produced by this Estimator.

        Args:
            vpc_config_override (dict[str, list[str]]): Optional override for VpcConfig set on
                the model. Default: use subnets and security groups from this Estimator.
                * 'Subnets' (list[str]): List of subnet ids.
                * 'SecurityGroupIds' (list[str]): List of security group ids.
            **kwargs: Additional kwargs passed to the PCAModel constructor.
        """
        return PCAModel(self.model_data,
                        self.role,
                        sagemaker_session=self.sagemaker_session,
                        vpc_config=self.get_vpc_config(vpc_config_override),
                        **kwargs)

    def _prepare_for_training(self,
                              records,
                              mini_batch_size=None,
                              job_name=None):
        """Set hyperparameters needed for training.

        Args:
            records (:class:`~RecordSet`): The records to train this ``Estimator`` on.
            mini_batch_size (int or None): The size of each mini-batch to use when
                training. If ``None``, a default value will be used.
            job_name (str): Name of the training job to be created. If not
                specified, one is generated, using the base name given to the
                constructor if applicable.
        """
        num_records = None
        if isinstance(records, list):
            for record in records:
                if record.channel == "train":
                    num_records = record.num_records
                    break
            if num_records is None:
                raise ValueError("Must provide train channel.")
        else:
            num_records = records.num_records

        # mini_batch_size is a required parameter
        default_mini_batch_size = min(
            self.DEFAULT_MINI_BATCH_SIZE,
            max(1, int(num_records / self.instance_count)))
        use_mini_batch_size = mini_batch_size or default_mini_batch_size

        super(PCA,
              self)._prepare_for_training(records=records,
                                          mini_batch_size=use_mini_batch_size,
                                          job_name=job_name)
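A short sketch (hypothetical role and data) of how `_prepare_for_training` derives the default mini-batch size when none is given:

# Hypothetical setup: 1,200 training records spread over 2 instances.
pca = PCA(role='SageMakerRole',
          instance_count=2,
          instance_type='ml.c4.xlarge',
          num_components=3)
records = pca.record_set(train)   # train: a 2D float32 numpy array

# With no explicit mini_batch_size, the default is
# min(DEFAULT_MINI_BATCH_SIZE, max(1, num_records // instance_count))
# = min(500, max(1, 1200 // 2)) = 500; an explicit value overrides it.
pca.fit(records)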
Example #16
class AmazonAlgorithmEstimatorBase(EstimatorBase):
    """Base class for Amazon first-party Estimator implementations. This class isn't intended
    to be instantiated directly."""

    MAX_DEFAULT_BATCH_SIZE = 500

    feature_dim = hp('feature_dim', (validation.isint, validation.gt(0)))
    mini_batch_size = hp('mini_batch_size',
                         (validation.isint, validation.gt(0)))

    def __init__(self,
                 role,
                 train_instance_count,
                 train_instance_type,
                 data_location=None,
                 **kwargs):
        """Initialize an AmazonAlgorithmEstimatorBase.

        Args:
            data_location (str or None): The s3 prefix to upload RecordSet objects to, expressed as an
                S3 url. For example "s3://example-bucket/some-key-prefix/". Objects will be
                saved in a unique sub-directory of the specified location. If None, a default
                data location will be used."""
        super(AmazonAlgorithmEstimatorBase,
              self).__init__(role, train_instance_count, train_instance_type,
                             **kwargs)

        default_location = "s3://{}/sagemaker-record-sets/".format(
            self.sagemaker_session.default_bucket())
        data_location = data_location or default_location
        self.data_location = data_location

    def train_image(self):
        return registry(
            self.sagemaker_session.boto_region_name) + "/" + type(self).repo

    def hyperparameters(self):
        return hp.serialize_all(self)

    @property
    def data_location(self):
        return self._data_location

    @data_location.setter
    def data_location(self, data_location):
        if not data_location.startswith('s3://'):
            raise ValueError(
                'Expecting an S3 URL beginning with "s3://". Got "{}"'.format(
                    data_location))
        if data_location[-1] != '/':
            data_location = data_location + '/'
        self._data_location = data_location

    def fit(self, records, mini_batch_size=None, **kwargs):
        """Fit this Estimator on serialized Record objects, stored in S3.

        ``records`` should be an instance of :class:`~RecordSet`. This defines a collection of
        s3 data files to train this ``Estimator`` on.

        Training data is expected to be encoded as dense or sparse vectors in the "values" feature
        on each Record. If the data is labeled, the label is expected to be encoded as a list of
        scalars in the "values" feature of the Record label.

        More information on the Amazon Record format is available at:
        https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        See :meth:`~AmazonAlgorithmEstimatorBase.record_set` to construct a ``RecordSet`` object
        from :class:`~numpy.ndarray` arrays.

        Args:
            records (:class:`~RecordSet`): The records to train this ``Estimator`` on
            mini_batch_size (int or None): The size of each mini-batch to use when training. If None, a
                default value will be used.
        """
        default_mini_batch_size = min(
            self.MAX_DEFAULT_BATCH_SIZE,
            max(1, int(records.num_records / self.train_instance_count)))
        self.mini_batch_size = mini_batch_size or default_mini_batch_size
        self.feature_dim = records.feature_dim
        data = {
            records.channel:
            s3_input(records.s3_data,
                     distribution='ShardedByS3Key',
                     s3_data_type=records.s3_data_type)
        }
        super(AmazonAlgorithmEstimatorBase, self).fit(data, **kwargs)

    def record_set(self, train, labels=None, channel="train"):
        """Build a :class:`~RecordSet` from a numpy :class:`~ndarray` matrix and label vector.

        For the 2D ``ndarray`` ``train``, each row is converted to a :class:`~Record` object.
        The vector is stored in the "values" entry of the ``features`` property of each Record.
        If ``labels`` is not None, each corresponding label is assigned to the "values" entry
        of the ``labels`` property of each Record.

        The collection of ``Record`` objects are protobuf serialized and uploaded to new
        S3 locations. A manifest file is generated containing the list of objects created and
        also stored in S3.

        The number of S3 objects created is controlled by the ``train_instance_count`` property
        on this Estimator. One S3 object is created per training instance.

        Args:
            train (numpy.ndarray): A 2D numpy array of training data.
            labels (numpy.ndarray): A 1D numpy array of labels. Its length must be equal to the
               number of rows in ``train``.
            channel (str): The SageMaker TrainingJob channel this RecordSet should be assigned to.
        Returns:
            RecordSet: A RecordSet referencing the encoded, uploaded training and label data.
        """
        s3 = self.sagemaker_session.boto_session.resource('s3')
        parsed_s3_url = urlparse(self.data_location)
        bucket, key_prefix = parsed_s3_url.netloc, parsed_s3_url.path
        key_prefix = key_prefix + '{}-{}/'.format(
            type(self).__name__, sagemaker_timestamp())
        key_prefix = key_prefix.lstrip('/')
        logger.debug('Uploading to bucket {} and key_prefix {}'.format(
            bucket, key_prefix))
        manifest_s3_file = upload_numpy_to_s3_shards(self.train_instance_count,
                                                     s3, bucket, key_prefix,
                                                     train, labels)
        logger.debug("Created manifest file {}".format(manifest_s3_file))
        return RecordSet(manifest_s3_file,
                         num_records=train.shape[0],
                         feature_dim=train.shape[1],
                         channel=channel)
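A sketch of the `data_location` validation shown above; `est` stands in for any concrete estimator built on this base class:

est.data_location = 's3://example-bucket/prefix'
print(est.data_location)        # 's3://example-bucket/prefix/' (trailing slash appended)

est.data_location = '/tmp/records'   # raises ValueError: not an 's3://' URL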
Example #17
class NTM(AmazonAlgorithmEstimatorBase):
    """An unsupervised learning algorithm used to organize a corpus of documents into topics.

    The resulting topics contain word groupings based on their statistical distribution.
    Documents that contain frequent occurrences of words such as "bike", "car", "train",
    "mileage", and "speed" are likely to share a topic on "transportation" for example.
    """

    repo_name = "ntm"
    repo_version = 1

    num_topics = hp("num_topics", (ge(2), le(1000)), "An integer in [2, 1000]", int)
    encoder_layers = hp(
        name="encoder_layers",
        validation_message="A comma separated list of positive integers",
        data_type=list,
    )
    epochs = hp("epochs", (ge(1), le(100)), "An integer in [1, 100]", int)
    encoder_layers_activation = hp(
        "encoder_layers_activation",
        isin("sigmoid", "tanh", "relu"),
        'One of "sigmoid", "tanh" or "relu"',
        str,
    )
    optimizer = hp(
        "optimizer",
        isin("adagrad", "adam", "rmsprop", "sgd", "adadelta"),
        'One of "adagrad", "adam", "rmsprop", "sgd" and "adadelta"',
        str,
    )
    tolerance = hp("tolerance", (ge(1e-6), le(0.1)), "A float in [1e-6, 0.1]", float)
    num_patience_epochs = hp("num_patience_epochs", (ge(1), le(10)), "An integer in [1, 10]", int)
    batch_norm = hp(name="batch_norm", validation_message="Value must be a boolean", data_type=bool)
    rescale_gradient = hp("rescale_gradient", (ge(1e-3), le(1.0)), "A float in [1e-3, 1.0]", float)
    clip_gradient = hp("clip_gradient", ge(1e-3), "A float greater equal to 1e-3", float)
    weight_decay = hp("weight_decay", (ge(0.0), le(1.0)), "A float in [0.0, 1.0]", float)
    learning_rate = hp("learning_rate", (ge(1e-6), le(1.0)), "A float in [1e-6, 1.0]", float)

    def __init__(
        self,
        role,
        instance_count=None,
        instance_type=None,
        num_topics=None,
        encoder_layers=None,
        epochs=None,
        encoder_layers_activation=None,
        optimizer=None,
        tolerance=None,
        num_patience_epochs=None,
        batch_norm=None,
        rescale_gradient=None,
        clip_gradient=None,
        weight_decay=None,
        learning_rate=None,
        **kwargs
    ):
        """Neural Topic Model (NTM) is :class:`Estimator` used for unsupervised
        learning.

        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`.
        It requires Amazon :class:`~sagemaker.amazon.record_pb2.Record` protobuf
        serialized data to be stored in S3. There is a utility
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set`
        that can be used to upload data to S3 and create a
        :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be passed to
        the `fit` call.

        To learn more about the Amazon protobuf Record class and how to
        prepare bulk data in this format, please consult AWS technical
        documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        After this Estimator is fit, model data is stored in S3. The model
        may be deployed to an Amazon SageMaker Endpoint by invoking
        :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as
        deploying an Endpoint, deploy returns a
        :class:`~sagemaker.amazon.ntm.NTMPredictor` object that can be used for
        inference calls using the trained model hosted in the SageMaker
        Endpoint.

        NTM Estimators can be configured by setting hyperparameters. The
        available hyperparameters for NTM are documented below.

        For further information on the AWS NTM algorithm, please consult AWS
        technical documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/ntm.html

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon
                SageMaker training jobs and APIs that create Amazon SageMaker
                endpoints use this role to access training data and model
                artifacts. After the endpoint is created, the inference code
                might use the IAM role, if accessing AWS resources.
            instance_count (int): Number of Amazon EC2 instances to use
                for training.
            instance_type (str): Type of EC2 instance to use for training,
                for example, 'ml.c4.xlarge'.
            num_topics (int): Required. The number of topics for NTM to find
                within the data.
            encoder_layers (list): Optional. Represents number of layers in the
                encoder and the output size of each layer.
            epochs (int): Optional. Maximum number of passes over the training
                data.
            encoder_layers_activation (str): Optional. Activation function to
                use in the encoder layers.
            optimizer (str): Optional. Optimizer to use for training.
            tolerance (float): Optional. Maximum relative change in the loss
                function within the last num_patience_epochs number of epochs
                below which early stopping is triggered.
            num_patience_epochs (int): Optional. Number of successive epochs
                over which early stopping criterion is evaluated.
            batch_norm (bool): Optional. Whether to use batch normalization
                during training.
            rescale_gradient (float): Optional. Rescale factor for gradient.
            clip_gradient (float): Optional. Maximum magnitude for each gradient
                component.
            weight_decay (float): Optional. Weight decay coefficient. Adds L2
                regularization.
            learning_rate (float): Optional. Learning rate for the optimizer.
            **kwargs: base class keyword argument values.

        .. tip::

            You can find additional parameters for initializing this class at
            :class:`~sagemaker.estimator.amazon_estimator.AmazonAlgorithmEstimatorBase` and
            :class:`~sagemaker.estimator.EstimatorBase`.
        """

        super(NTM, self).__init__(role, instance_count, instance_type, **kwargs)
        self.num_topics = num_topics
        self.encoder_layers = encoder_layers
        self.epochs = epochs
        self.encoder_layers_activation = encoder_layers_activation
        self.optimizer = optimizer
        self.tolerance = tolerance
        self.num_patience_epochs = num_patience_epochs
        self.batch_norm = batch_norm
        self.rescale_gradient = rescale_gradient
        self.clip_gradient = clip_gradient
        self.weight_decay = weight_decay
        self.learning_rate = learning_rate

    def create_model(self, vpc_config_override=VPC_CONFIG_DEFAULT, **kwargs):
        """Return a :class:`~sagemaker.amazon.NTMModel` referencing the latest
        s3 model data produced by this Estimator.

        Args:
            vpc_config_override (dict[str, list[str]]): Optional override for VpcConfig set on
                the model. Default: use subnets and security groups from this Estimator.
                * 'Subnets' (list[str]): List of subnet ids.
                * 'SecurityGroupIds' (list[str]): List of security group ids.
            **kwargs: Additional kwargs passed to the NTMModel constructor.
        """
        return NTMModel(
            self.model_data,
            self.role,
            sagemaker_session=self.sagemaker_session,
            vpc_config=self.get_vpc_config(vpc_config_override),
            **kwargs
        )

    def _prepare_for_training(  # pylint: disable=signature-differs
        self, records, mini_batch_size, job_name=None
    ):
        """
        Args:
            records:
            mini_batch_size:
            job_name:
        """
        if mini_batch_size is not None and (mini_batch_size < 1 or mini_batch_size > 10000):
            raise ValueError("mini_batch_size must be in [1, 10000]")
        super(NTM, self)._prepare_for_training(
            records, mini_batch_size=mini_batch_size, job_name=job_name
        )
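A hedged sketch of overriding the estimator's network configuration when creating the model; the subnet and security group ids below are placeholders:

# Build a deployable model pinned to a specific VPC instead of inheriting
# the Estimator's VpcConfig.
model = ntm.create_model(vpc_config_override={
    'Subnets': ['subnet-0abc123'],           # hypothetical subnet id
    'SecurityGroupIds': ['sg-0abc123'],      # hypothetical security group id
})
predictor = model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')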
Example #18
class LinearLearner(AmazonAlgorithmEstimatorBase):

    repo = 'linear-learner:1'

    DEFAULT_MINI_BATCH_SIZE = 1000

    binary_classifier_model_selection_criteria = hp(
        'binary_classifier_model_selection_criteria',
        isin('accuracy', 'f1', 'precision_at_target_recall',
             'recall_at_target_precision', 'cross_entropy_loss'),
        data_type=str)
    target_recall = hp('target_recall', (gt(0), lt(1)), "A float in (0,1)",
                       float)
    target_precision = hp('target_precision', (gt(0), lt(1)),
                          "A float in (0,1)", float)
    positive_example_weight_mult = hp('positive_example_weight_mult', gt(0),
                                      "A float greater than 0", float)
    epochs = hp('epochs', gt(0), "An integer greater-than 0", int)
    predictor_type = hp('predictor_type', isin('binary_classifier',
                                               'regressor'),
                        'One of "binary_classifier" or "regressor"', str)
    use_bias = hp('use_bias', (), "Either True or False", bool)
    num_models = hp('num_models', gt(0), "An integer greater-than 0", int)
    num_calibration_samples = hp('num_calibration_samples', gt(0),
                                 "An integer greater-than 0", int)
    init_method = hp('init_method', isin('uniform', 'normal'),
                     'One of "uniform" or "normal"', str)
    init_scale = hp('init_scale', (gt(-1), lt(1)), 'A float in (-1, 1)', float)
    init_sigma = hp('init_sigma', (gt(0), lt(1)), 'A float in (0, 1)', float)
    init_bias = hp('init_bias', (), 'A number', float)
    optimizer = hp('optimizer', isin('sgd', 'adam', 'auto'),
                   'One of "sgd", "adam" or "auto"', str)
    loss = hp('loss', isin('logistic', 'squared_loss', 'absolute_loss',
                           'auto'),
              'One of "logistic", "squared_loss", "absolute_loss" or "auto"', str)
    wd = hp('wd', (gt(0), lt(1)), 'A float in (0,1)', float)
    l1 = hp('l1', (gt(0), lt(1)), 'A float in (0,1)', float)
    momentum = hp('momentum', (gt(0), lt(1)), 'A float in (0,1)', float)
    learning_rate = hp('learning_rate', (gt(0), lt(1)), 'A float in (0,1)',
                       float)
    beta_1 = hp('beta_1', (gt(0), lt(1)), 'A float in (0,1)', float)
    beta_2 = hp('beta_2', (gt(0), lt(1)), 'A float in (0,1)', float)
    bias_lr_mult = hp('bias_lr_mult', gt(0), 'A float greater-than 0', float)
    bias_wd_mult = hp('bias_wd_mult', gt(0), 'A float greater-than 0', float)
    use_lr_scheduler = hp('use_lr_scheduler', (), 'A boolean', bool)
    lr_scheduler_step = hp('lr_scheduler_step', gt(0),
                           'An integer greater-than 0', int)
    lr_scheduler_factor = hp('lr_scheduler_factor', (gt(0), lt(1)),
                             'A float in (0,1)', float)
    lr_scheduler_minimum_lr = hp('lr_scheduler_minimum_lr', gt(0),
                                 'A float greater-than 0', float)
    normalize_data = hp('normalize_data', (), 'A boolean', bool)
    normalize_label = hp('normalize_label', (), 'A boolean', bool)
    unbias_data = hp('unbias_data', (), 'A boolean', bool)
    unbias_label = hp('unbias_label', (), 'A boolean', bool)
    num_point_for_scaler = hp('num_point_for_scaler', gt(0),
                              'An integer greater-than 0', int)

    def __init__(self,
                 role,
                 train_instance_count,
                 train_instance_type,
                 predictor_type='binary_classifier',
                 binary_classifier_model_selection_criteria=None,
                 target_recall=None,
                 target_precision=None,
                 positive_example_weight_mult=None,
                 epochs=None,
                 use_bias=None,
                 num_models=None,
                 num_calibration_samples=None,
                 init_method=None,
                 init_scale=None,
                 init_sigma=None,
                 init_bias=None,
                 optimizer=None,
                 loss=None,
                 wd=None,
                 l1=None,
                 momentum=None,
                 learning_rate=None,
                 beta_1=None,
                 beta_2=None,
                 bias_lr_mult=None,
                 bias_wd_mult=None,
                 use_lr_scheduler=None,
                 lr_scheduler_step=None,
                 lr_scheduler_factor=None,
                 lr_scheduler_minimum_lr=None,
                 normalize_data=None,
                 normalize_label=None,
                 unbias_data=None,
                 unbias_label=None,
                 num_point_for_scaler=None,
                 **kwargs):
        """An :class:`Estimator` for binary classification and regression.

        Amazon SageMaker Linear Learner provides a solution for both classification and regression problems, allowing
        for exploring different training objectives simultaneously and choosing the best solution from a validation
        set. It allows the user to explore a large number of models and choose the best, optimizing either continuous
        objectives, such as mean square error, cross entropy loss, and absolute error, or discrete objectives suited
        for classification, such as F1 measure, precision@recall, and accuracy. The implementation provides a
        significant speedup over naive hyperparameter optimization techniques and an added convenience when compared
        with solutions that address only continuous objectives.

        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit_ndarray`
        or :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. The former allows a
        LinearLearner model to be fit on a 2-dimensional numpy array. The latter requires Amazon
        :class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3.

        To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please
        consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
        Endpoint by invoking :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as deploying an Endpoint,
        ``deploy`` returns a :class:`~sagemaker.amazon.linear_learner.LinearLearnerPredictor` object that can be used
        to make class or regression predictions, using the trained model.

        LinearLearner Estimators can be configured by setting hyperparameters. The available hyperparameters for
        LinearLearner are documented below. For further information on the AWS LinearLearner algorithm, please consult
        AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/linear-learner.html

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
                APIs that create Amazon SageMaker endpoints use this role to access
                training data and model artifacts. After the endpoint is created,
                the inference code might use the IAM role, if accessing AWS resources.
            train_instance_count (int): Number of Amazon EC2 instances to use for training.
            train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
            predictor_type (str): The type of predictor to learn. Either "binary_classifier" or "regressor".
            binary_classifier_model_selection_criteria (str): One of 'accuracy', 'f1',
                'precision_at_target_recall', 'recall_at_target_precision', 'cross_entropy_loss'.
            target_recall (float): Target recall. Only applicable if binary_classifier_model_selection_criteria is
                precision_at_target_recall.
            target_precision (float): Target precision. Only applicable if binary_classifier_model_selection_criteria
                is recall_at_target_precision.
            positive_example_weight_mult (float): The importance weight of positive examples is multiplied by this
                constant. Useful for skewed datasets. Only applies for classification tasks.
            epochs (int): The maximum number of passes to make over the training data.
            use_bias (bool): Whether to include a bias field
            num_models (int): Number of models to train in parallel. If not set, the number of parallel models to
                train will be decided by the algorithm itself. One model will be trained according to the given
                training parameters (regularization, optimizer, loss) and the rest by close-by parameters.
            num_calibration_samples (int): Number of observations to use from validation dataset for doing model
                calibration (finding the best threshold).
            init_method (str): Function to use to set the initial model weights. One of "uniform" or "normal"
            init_scale (float): For "uniform" init, the range of values.
            init_sigma (float): For "normal" init, the standard-deviation.
            init_bias (float):  Initial weight for bias term
            optimizer (str): One of 'sgd', 'adam' or 'auto'
            loss (str): One of  'logistic', 'squared_loss', 'absolute_loss' or 'auto'
            wd (float): L2 regularization parameter i.e. the weight decay parameter. Use 0 for no L2 regularization.
            l1 (float): L1 regularization parameter. Use 0 for no L1 regularization.
            momentum (float): Momentum parameter of sgd optimizer.
            learning_rate (float): The SGD learning rate
            beta_1 (float): Exponential decay rate for first moment estimates. Only applies for adam optimizer.
            beta_2 (float): Exponential decay rate for second moment estimates. Only applies for adam optimizer.
            bias_lr_mult (float): Allows different learning rate for the bias term. The actual learning rate for
                the bias is learning rate times bias_lr_mult.
            bias_wd_mult (float): Allows different regularization for the bias term. The actual L2 regularization
                weight for the bias is wd times bias_wd_mult. By default there is no regularization on the bias term.
            use_lr_scheduler (bool): If true, we use a scheduler for the learning rate.
            lr_scheduler_step (int): The number of steps between decreases of the learning rate. Only applies to
                learning rate scheduler.
            lr_scheduler_factor (float): Every lr_scheduler_step the learning rate will decrease by this quantity.
                Only applies for learning rate scheduler.
            lr_scheduler_minimum_lr (float): The learning rate will never decrease to a value lower than this.
                Only applies for learning rate scheduler.
            normalize_data (bool): Normalizes the features before training to have standard deviation of 1.0.
            normalize_label (bool): Normalizes the regression label to have a standard deviation of 1.0.
                If set for classification, it will be ignored.
            unbias_data (bool): If true, features are modified to have mean 0.0.
            unbias_label (bool): If true, labels are modified to have mean 0.0.
            num_point_for_scaler (int): The number of data points to use for calculating the normalizing and
                unbiasing terms.
            **kwargs: base class keyword argument values.
        """
        super(LinearLearner, self).__init__(role, train_instance_count,
                                            train_instance_type, **kwargs)
        self.predictor_type = predictor_type
        self.binary_classifier_model_selection_criteria = binary_classifier_model_selection_criteria
        self.target_recall = target_recall
        self.target_precision = target_precision
        self.positive_example_weight_mult = positive_example_weight_mult
        self.epochs = epochs
        self.use_bias = use_bias
        self.num_models = num_models
        self.num_calibration_samples = num_calibration_samples
        self.init_method = init_method
        self.init_scale = init_scale
        self.init_sigma = init_sigma
        self.init_bias = init_bias
        self.optimizer = optimizer
        self.loss = loss
        self.wd = wd
        self.l1 = l1
        self.momentum = momentum
        self.learning_rate = learning_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.bias_lr_mult = bias_lr_mult
        self.bias_wd_mult = bias_wd_mult
        self.use_lr_scheduler = use_lr_scheduler
        self.lr_scheduler_step = lr_scheduler_step
        self.lr_scheduler_factor = lr_scheduler_factor
        self.lr_scheduler_minimum_lr = lr_scheduler_minimum_lr
        self.normalize_data = normalize_data
        self.normalize_label = normalize_label
        self.unbias_data = unbias_data
        self.unbias_label = unbias_label
        self.num_point_for_scaler = num_point_for_scaler

    def create_model(self):
        """Return a :class:`~sagemaker.amazon.linear_learner.LinearLearnerModel` referencing the latest
        s3 model data produced by this Estimator."""

        return LinearLearnerModel(self.model_data,
                                  self.role,
                                  sagemaker_session=self.sagemaker_session)

    def fit(self, records, mini_batch_size=None, **kwargs):
        # mini_batch_size can't be greater than number of records or training job fails
        default_mini_batch_size = min(
            self.DEFAULT_MINI_BATCH_SIZE,
            max(1, int(records.num_records / self.train_instance_count)))
        use_mini_batch_size = mini_batch_size or default_mini_batch_size
        super(LinearLearner, self).fit(records, use_mini_batch_size, **kwargs)
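A minimal sketch of a binary-classification run, again with a hypothetical role and synthetic data:

import numpy as np

ll = LinearLearner(role='SageMakerRole',
                   train_instance_count=1,
                   train_instance_type='ml.c4.xlarge',
                   predictor_type='binary_classifier',
                   binary_classifier_model_selection_criteria='f1')
features = np.random.rand(1000, 20).astype('float32')
labels = np.random.randint(0, 2, size=1000).astype('float32')

# With no explicit mini_batch_size, fit uses
# min(DEFAULT_MINI_BATCH_SIZE, max(1, num_records // train_instance_count))
# = min(1000, max(1, 1000 // 1)) = 1000.
ll.fit(ll.record_set(features, labels=labels))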
Example #19
class RandomCutForest(AmazonAlgorithmEstimatorBase):
    """An unsupervised algorithm for detecting anomalous data points within a data set.

    These are observations which diverge from otherwise well-structured or patterned data.
    Anomalies can manifest as unexpected spikes in time series data, breaks in periodicity,
    or unclassifiable data points."""

    repo_name = "randomcutforest"
    repo_version = 1
    MINI_BATCH_SIZE = 1000

    eval_metrics = hp(
        name="eval_metrics",
        validation_message='A comma separated list of "accuracy" or "precision_recall_fscore"',
        data_type=list,
    )

    num_trees = hp("num_trees", (ge(50), le(1000)), "An integer in [50, 1000]",
                   int)
    num_samples_per_tree = hp("num_samples_per_tree", (ge(1), le(2048)),
                              "An integer in [1, 2048]", int)
    feature_dim = hp("feature_dim", (ge(1), le(10000)),
                     "An integer in [1, 10000]", int)

    def __init__(self,
                 role,
                 instance_count,
                 instance_type,
                 num_samples_per_tree=None,
                 num_trees=None,
                 eval_metrics=None,
                 **kwargs):
        """An `Estimator` class implementing a Random Cut Forest.

        Typically used for anomaly detection, this Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`.
        It requires Amazon :class:`~sagemaker.amazon.record_pb2.Record` protobuf
        serialized data to be stored in S3. There is a utility
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set`
        that can be used to upload data to S3 and create a
        :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be passed to
        the `fit` call.

        To learn more about the Amazon protobuf Record class and how to
        prepare bulk data in this format, please consult AWS technical
        documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

        After this Estimator is fit, model data is stored in S3. The model
        may be deployed to an Amazon SageMaker Endpoint by invoking
        :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as
        deploying an Endpoint, deploy returns a
        :class:`~sagemaker.amazon.randomcutforest.RandomCutForestPredictor` object that can
        be used for inference calls using the trained model hosted in the
        SageMaker Endpoint.

        RandomCutForest Estimators can be configured by setting
        hyperparameters. The available hyperparameters for RandomCutForest are
        documented below.

        For further information on the AWS Random Cut Forest algorithm,
        please consult AWS technical documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/randomcutforest.html

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon
                SageMaker training jobs and APIs that create Amazon SageMaker
                endpoints use this role to access training data and model
                artifacts. After the endpoint is created, the inference code
                might use the IAM role, if accessing AWS resources.
            instance_count (int): Number of Amazon EC2 instances to use
                for training.
            instance_type (str): Type of EC2 instance to use for training,
                for example, 'ml.c4.xlarge'.
            num_samples_per_tree (int): Optional. The number of samples used to
                build each tree in the forest. The total number of samples drawn
                from the train dataset is num_trees * num_samples_per_tree.
            num_trees (int): Optional. The number of trees used in the forest.
            eval_metrics (list): Optional. JSON list of metrics types to be used
                for reporting the score for the model. Allowed values are
                "accuracy", "precision_recall_fscore": positive and negative
                precision, recall, and f1 scores. If test data is provided, the
                score shall be reported in terms of all requested metrics.
            **kwargs: base class keyword argument values.

        .. tip::

            You can find additional parameters for initializing this class at
            :class:`~sagemaker.estimator.amazon_estimator.AmazonAlgorithmEstimatorBase` and
            :class:`~sagemaker.estimator.EstimatorBase`.
        """

        super(RandomCutForest, self).__init__(role, instance_count,
                                              instance_type, **kwargs)
        self.num_samples_per_tree = num_samples_per_tree
        self.num_trees = num_trees
        self.eval_metrics = eval_metrics

    def create_model(self, vpc_config_override=VPC_CONFIG_DEFAULT, **kwargs):
        """Return a :class:`~sagemaker.amazon.RandomCutForestModel` referencing
        the latest s3 model data produced by this Estimator.

        Args:
            vpc_config_override (dict[str, list[str]]): Optional override for VpcConfig set on
                the model. Default: use subnets and security groups from this Estimator.
                * 'Subnets' (list[str]): List of subnet ids.
                * 'SecurityGroupIds' (list[str]): List of security group ids.
            **kwargs: Additional kwargs passed to the RandomCutForestModel constructor.
        """
        return RandomCutForestModel(
            self.model_data,
            self.role,
            sagemaker_session=self.sagemaker_session,
            vpc_config=self.get_vpc_config(vpc_config_override),
            **kwargs)

    def _prepare_for_training(self,
                              records,
                              mini_batch_size=None,
                              job_name=None):
        """
        Args:
            records:
            mini_batch_size:
            job_name:
        """
        if mini_batch_size is None:
            mini_batch_size = self.MINI_BATCH_SIZE
        elif mini_batch_size != self.MINI_BATCH_SIZE:
            raise ValueError(
                "Random Cut Forest uses a fixed mini_batch_size of {}".format(
                    self.MINI_BATCH_SIZE))

        super(RandomCutForest,
              self)._prepare_for_training(records,
                                          mini_batch_size=mini_batch_size,
                                          job_name=job_name)
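A sketch illustrating the fixed batch size enforced above (hypothetical role and data):

rcf = RandomCutForest(role='SageMakerRole',
                      instance_count=1,
                      instance_type='ml.c4.xlarge',
                      num_trees=50,
                      num_samples_per_tree=512)
rcf.fit(rcf.record_set(train))   # defaults to the fixed MINI_BATCH_SIZE of 1000
# rcf.fit(rcf.record_set(train), mini_batch_size=500)  # would raise ValueError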
Example #20
class KNN(AmazonAlgorithmEstimatorBase):
    repo_name = 'knn'
    repo_version = 1

    k = hp('k', (ge(1)), 'An integer greater than 0', int)
    sample_size = hp('sample_size', (ge(1)), 'An integer greater than 0', int)
    predictor_type = hp('predictor_type', isin('classifier', 'regressor'),
                        'One of "classifier" or "regressor"', str)
    dimension_reduction_target = hp(
        'dimension_reduction_target', (ge(1)),
        'An integer greater than 0 and less than feature_dim', int)
    dimension_reduction_type = hp('dimension_reduction_type',
                                  isin('sign', 'fjlt'),
                                  'One of "sign" or "fjlt"', str)
    index_metric = hp('index_metric', isin('COSINE', 'INNER_PRODUCT', 'L2'),
                      'One of "COSINE", "INNER_PRODUCT", "L2"', str)
    index_type = hp('index_type',
                    isin('faiss.Flat', 'faiss.IVFFlat', 'faiss.IVFPQ'),
                    'One of "faiss.Flat", "faiss.IVFFlat", "faiss.IVFPQ"', str)
    faiss_index_ivf_nlists = hp('faiss_index_ivf_nlists', (),
                                '"auto" or an integer greater than 0', str)
    faiss_index_pq_m = hp('faiss_index_pq_m', (ge(1)),
                          'An integer greater than 0', int)

    def __init__(self,
                 role,
                 train_instance_count,
                 train_instance_type,
                 k,
                 sample_size,
                 predictor_type,
                 dimension_reduction_type=None,
                 dimension_reduction_target=None,
                 index_type=None,
                 index_metric=None,
                 faiss_index_ivf_nlists=None,
                 faiss_index_pq_m=None,
                 **kwargs):
        """k-nearest neighbors (KNN) is :class:`Estimator` used for classification and regression.
        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. It requires Amazon
        :class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3.
        There is an utility :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set` that
        can be used to upload data to S3 and creates :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be passed
        to the `fit` call.
        To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please
        consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html
        After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
        Endpoint by invoking :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as deploying an Endpoint,
        deploy returns a :class:`~sagemaker.amazon.knn.KNNPredictor` object that can be used
        for inference calls using the trained model hosted in the SageMaker Endpoint.
        KNN Estimators can be configured by setting hyperparameters. The available hyperparameters for
        KNN are documented below.
        For further information on the AWS KNN algorithm,
        please consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/knn.html
        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
                APIs that create Amazon SageMaker endpoints use this role to access
                training data and model artifacts. After the endpoint is created,
                the inference code might use the IAM role, if accessing AWS resources.
            train_instance_count (int): Number of Amazon EC2 instances to use for training.
            train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
            k (int): Required. Number of nearest neighbors.
            sample_size (int): Required. Number of data points to be sampled from the training data set.
            predictor_type (str): Required. Type of inference to use on the data's labels,
                allowed values are 'classifier' and 'regressor'.
            dimension_reduction_type (str): Optional. Type of dimension reduction technique to use.
                Valid values: "sign", "fjlt"
            dimension_reduction_target (int): Optional. Target dimension to reduce to. Required when
                dimension_reduction_type is specified.
            index_type (str): Optional. Type of index to use. Valid values are
                "faiss.Flat", "faiss.IVFFlat", "faiss.IVFPQ".
            index_metric (str): Optional. Distance metric to measure between points when finding nearest neighbors.
                Valid values are "COSINE", "INNER_PRODUCT", "L2".
            faiss_index_ivf_nlists (str): Optional. Number of centroids to construct in the index if
                index_type is "faiss.IVFFlat" or "faiss.IVFPQ".
            faiss_index_pq_m (int): Optional. Number of vector sub-components to construct in the index,
                if index_type is "faiss.IVFPQ".
            **kwargs: base class keyword argument values.
        """

        super(KNN, self).__init__(role, train_instance_count,
                                  train_instance_type, **kwargs)
        self.k = k
        self.sample_size = sample_size
        self.predictor_type = predictor_type
        self.dimension_reduction_type = dimension_reduction_type
        self.dimension_reduction_target = dimension_reduction_target
        self.index_type = index_type
        self.index_metric = index_metric
        self.faiss_index_ivf_nlists = faiss_index_ivf_nlists
        self.faiss_index_pq_m = faiss_index_pq_m
        if dimension_reduction_type and not dimension_reduction_target:
            raise ValueError(
                '"dimension_reduction_target" is required when "dimension_reduction_type" is set.'
            )

    def create_model(self, vpc_config_override=VPC_CONFIG_DEFAULT):
        """Return a :class:`~sagemaker.amazon.KNNModel` referencing the latest
        s3 model data produced by this Estimator.

        Args:
            vpc_config_override (dict[str, list[str]]): Optional override for VpcConfig set on the model.
                Default: use subnets and security groups from this Estimator.
                * 'Subnets' (list[str]): List of subnet ids.
                * 'SecurityGroupIds' (list[str]): List of security group ids.
        """
        return KNNModel(self.model_data,
                        self.role,
                        sagemaker_session=self.sagemaker_session,
                        vpc_config=self.get_vpc_config(vpc_config_override))

    def _prepare_for_training(self,
                              records,
                              mini_batch_size=None,
                              job_name=None):
        """Delegate to the base class to set up data and hyperparameters for training."""
        super(KNN, self)._prepare_for_training(records,
                                               mini_batch_size=mini_batch_size,
                                               job_name=job_name)


class ImageClassification(AmazonS3AlgorithmEstimatorBase):
    """A supervised learning algorithm for image classification.

    Image classification takes an image as input and classifies it into one of
    the output categories specified by ``num_classes``.
    """

    repo = 'image-classification:latest'

    num_classes = hp('num_classes', (gt(1)),
                     'num_classes should be an integer greater than 1', int)
    num_training_samples = hp(
        'num_training_samples', (gt(1)),
        'num_training_samples should be an integer greater than 1', int)
    use_pretrained_model = hp(
        'use_pretrained_model', (isin(0, 1), ),
        'use_pretrained_model should be in the set [0, 1]', int)
    checkpoint_frequency = hp(
        'checkpoint_frequency', (ge(1), ),
        'checkpoint_frequency should be an integer greater than or equal to 1',
        int)
    num_layers = hp(
        'num_layers', (isin(18, 34, 50, 101, 152, 200, 20, 32, 44, 56, 110), ),
        'num_layers should be in the set [18, 34, 50, 101, 152, 200, 20, 32, 44, 56, 110]',
        int)
    resize = hp('resize', (gt(1)),
                'resize should be an integer greater than 1', int)
    epochs = hp('epochs', (ge(1)),
                'epochs should be an integer greater than or equal to 1', int)
    learning_rate = hp(
        'learning_rate', (gt(0)),
        'learning_rate should be a floating point greater than 0', float)
    lr_scheduler_factor = hp(
        'lr_scheduler_factor', (gt(0)),
        'lr_scheduler_factor should be a floating point greater than 0', float)
    lr_scheduler_step = hp('lr_scheduler_step', (),
                           'lr_scheduler_step should be a string input.', str)
    optimizer = hp(
        'optimizer', (isin('sgd', 'adam', 'rmsprop', 'nag')),
        'optimizer should be one of sgd, adam, rmsprop, or nag.', str)
    momentum = hp('momentum', (ge(0), le(1)),
                  'momentum should be in the range [0, 1]', float)
    weight_decay = hp('weight_decay', (ge(0), le(1)),
                      'weight_decay should be in the range [0, 1]', float)
    beta_1 = hp('beta_1', (ge(0), le(1)),
                'beta_1 should be in the range [0, 1]', float)
    beta_2 = hp('beta_2', (ge(0), le(1)),
                'beta_2 should be in the range [0, 1]', float)
    eps = hp('eps', (gt(0), le(1)),
             'eps should be in the range (0, 1]', float)
    gamma = hp('gamma', (ge(0), le(1)),
               'gamma should be in the range [0, 1]', float)
    mini_batch_size = hp(
        'mini_batch_size', (gt(0)),
        'mini_batch_size should be an integer greater than 0', int)
    image_shape = hp('image_shape', (),
                     'image_shape is expected to be a string', str)
    augmentation_type = hp(
        'augmentation_type',
        (isin('crop', 'crop_color', 'crop_color_transform')),
        'augmentation_type should be one of crop, crop_color, or '
        'crop_color_transform', str)
    top_k = hp('top_k', (ge(1)),
               'top_k should be greater than or equal to 1', int)
    kv_store = hp('kv_store', (isin('dist_sync', 'dist_async')),
                  'kv_store should be dist_sync or dist_async', str)

    def __init__(self,
                 role,
                 train_instance_count,
                 train_instance_type,
                 num_classes,
                 num_training_samples,
                 resize=None,
                 lr_scheduler_step=None,
                 use_pretrained_model=0,
                 checkpoint_frequency=1,
                 num_layers=18,
                 epochs=30,
                 learning_rate=0.1,
                 lr_scheduler_factor=0.1,
                 optimizer='sgd',
                 momentum=0.,
                 weight_decay=0.0001,
                 beta_1=0.9,
                 beta_2=0.999,
                 eps=1e-8,
                 gamma=0.9,
                 mini_batch_size=32,
                 image_shape='3,224,224',
                 augmentation_type=None,
                 top_k=None,
                 kv_store=None,
                 **kwargs):
        """
        An :class:`~sagemaker.amazon.amazon_estimator.AmazonS3AlgorithmEstimatorBase` for the image classification algorithm.

        This Estimator may be fit via calls to
        :meth:`~sagemaker.amazon.amazon_estimator.AmazonS3AlgorithmEstimatorBase.fit`

        After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
        Endpoint by invoking :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as deploying an Endpoint,
        ``deploy`` returns a :class:`~sagemaker.amazon.image_classification.ImageClassificationPredictor` object that
        can be used for label assignment, using the trained model hosted in the SageMaker Endpoint.

        ImageClassification Estimators can be configured by setting hyperparameters. The available hyperparameters for
        ImageClassification are documented below. For further information on the AWS ImageClassification algorithm,
        please consult AWS technical documentation:
        https://docs.aws.amazon.com/sagemaker/latest/dg/IC-Hyperparameter.html

        Args:
            role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
                APIs that create Amazon SageMaker endpoints use this role to access
                training data and model artifacts. After the endpoint is created,
                the inference code might use the IAM role, if it needs to access an AWS resource.
            train_instance_count (int): Number of Amazon EC2 instances to use for training.
            train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
            num_classes (int): Number of output classes. This parameter defines the dimensions of the network output
                         and is typically set to the number of classes in the dataset.
            num_training_samples (int): Number of training examples in the input dataset. If there is a
                                mismatch between this value and the number of samples in the training
                                set, then the behavior of the lr_scheduler_step parameter is undefined
                                and distributed training accuracy might be affected.
            use_pretrained_model (int): Flag to indicate whether to use pre-trained model for training.
                                If set to `1`, then the pretrained model with the corresponding number
                                of layers is loaded and used for training. Only the top fully-connected (FC) layer is
                                reinitialized with random weights. Otherwise, the network is trained from scratch.
                                Default value: 0
            checkpoint_frequency (int): Period to store model parameters (in number of epochs). Default value: 1
            num_layers (int): Number of layers for the network. For data with large image size (for example, 224x224 -
                              like ImageNet), we suggest selecting the number of layers from the set [18, 34, 50, 101,
                              152, 200]. For data with small image size (for example, 28x28 - like CIFAR), we suggest
                              selecting the number of layers from the set [20, 32, 44, 56, 110]. The number of layers
                              in each set is based on the ResNet paper. For transfer learning, the number of layers
                              defines the architecture of base network and hence can only be selected from the set
                              [18, 34, 50, 101, 152, 200]. Default value: 152
            resize (int): Resize the image before using it for training. The images are resized so that the shortest
                          side has the length given by this parameter. If the parameter is not set, then the training
                          data is used as-is without resizing.
                          Note: This option is available only for inputs specified as application/x-image content-type
                          in training and validation channels.
            epochs (int): Number of training epochs. Default value: 30
            learning_rate (float): Initial learning rate. Float. Range in [0, 1]. Default value: 0.1
            lr_scheduler_factor (float): The ratio to reduce learning rate used in conjunction with the
                                `lr_scheduler_step` parameter, defined as `lr_new=lr_old * lr_scheduler_factor`.
                                Valid values: Float. Range in [0, 1]. Default value: 0.1
            lr_scheduler_step (str): The epochs at which to reduce the learning rate. As explained in the
                                ``lr_scheduler_factor`` parameter, the learning rate is reduced by
                                ``lr_scheduler_factor`` at these epochs. For example, if the value is set
                                to "10, 20", then the learning rate is reduced by ``lr_scheduler_factor`` after 10th
                                epoch and again by ``lr_scheduler_factor`` after 20th epoch. The epochs are delimited
                                by ",".
            optimizer (str): The optimizer type. For more details of the parameters for the optimizers, please refer
                             to MXNet's API. Valid values: One of sgd, adam, rmsprop, or nag. Default value: 'sgd'.
            momentum (float): The momentum for sgd and nag, ignored for other optimizers. Valid values: Float. Range in
                              [0, 1]. Default value: 0
            weight_decay (float): The coefficient weight decay for sgd and nag, ignored for other optimizers.
                                  Range in [0, 1]. Default value: 0.0001
            beta_1 (float): The beta1 for adam, in other words, exponential decay rate for the first moment estimates.
                            Range in [0, 1]. Default value: 0.9
            beta_2 (float): The beta2 for adam, in other words, exponential decay rate for the second moment estimates.
                            Range in [0, 1]. Default value: 0.999
            eps (float): The epsilon for adam and rmsprop. It is usually set to a small value to avoid division by 0.
                         Range in [0, 1]. Default value: 1e-8
            gamma (float): The gamma for rmsprop. A decay factor of moving average of the squared gradient.
                           Range in [0, 1]. Default value: 0.9
            mini_batch_size (int): The batch size for training. In a single-machine multi-GPU setting, each GPU handles
                                   mini_batch_size/num_gpu training samples. For the multi-machine training in
                                   dist_sync mode, the actual batch size is mini_batch_size*number of machines.
                                   See MXNet docs for more details. Default value: 32
            image_shape (str): The input image dimensions, which is the same size as the input layer of the network.
                                The format is defined as 'num_channels, height, width'. The image dimension can take on
                                any value as the network can handle varied dimensions of the input. However, there may
                                be memory constraints if a larger image dimension is used. Typical image dimensions for
                                image classification are '3, 224, 224'. This is similar to the ImageNet dataset.
                                Default value: '3, 224, 224'
            augmentation_type (str): Data augmentation type. The input images can be augmented in multiple ways as
                                      specified below.
                                'crop' - Randomly crop the image and flip the image horizontally.
                                'crop_color' - In addition to 'crop', three random values in the ranges [-36, 36],
                                            [-50, 50], and [-50, 50] are added to the corresponding
                                            Hue-Saturation-Lightness channels respectively.
                                'crop_color_transform' - In addition to 'crop_color', random transformations, including
                                            rotation, shear, and aspect ratio variations, are applied to the image.
                                            The maximum angle of rotation is 10 degrees, the maximum shear ratio is 0.1,
                                            and the maximum aspect changing ratio is 0.25.
            top_k (int): Report the top-k accuracy during training. This parameter has to be greater than 1,
                            since the top-1 training accuracy is the same as the regular training accuracy that has
                            already been reported.
            kv_store (str): Weight update synchronization mode during distributed training. The weight updates can be
                            made either synchronously or asynchronously across machines. Synchronous updates
                            typically provide better accuracy than asynchronous updates but can be slower.
                            See distributed training in MXNet for more details. This parameter is not applicable
                            to single machine training.
                            'dist_sync' - The gradients are synchronized after every batch with all the workers.
                                     With dist_sync, batch size means the batch size used on each machine. So if
                                     there are n machines and we use batch size b, then dist_sync behaves like a
                                     local run with batch size n*b.
                            'dist_async' - Performs asynchronous updates. The weights are updated whenever gradients
                                     are received from any machine and the weight updates are atomic. However, the
                                     order is not guaranteed.
            **kwargs: base class keyword argument values.
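
        Example:
            A minimal sketch of constructing this estimator; the role and
            instance settings are placeholders for your own configuration::

                ic = ImageClassification(role='SageMakerRole',
                                         train_instance_count=1,
                                         train_instance_type='ml.p2.xlarge',
                                         num_classes=2,
                                         num_training_samples=1000,
                                         epochs=10)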
        """
        super(ImageClassification,
              self).__init__(role,
                             train_instance_count,
                             train_instance_type,
                             algorithm='image_classification',
                             **kwargs)
        self.num_classes = num_classes
        self.num_training_samples = num_training_samples
        self.resize = resize
        self.lr_scheduler_step = lr_scheduler_step
        self.use_pretrained_model = use_pretrained_model
        self.checkpoint_frequency = checkpoint_frequency
        self.num_layers = num_layers
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.lr_scheduler_factor = lr_scheduler_factor
        self.optimizer = optimizer
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.eps = eps
        self.gamma = gamma
        self.mini_batch_size = mini_batch_size
        self.image_shape = image_shape
        self.augmentation_type = augmentation_type
        self.top_k = top_k
        self.kv_store = kv_store

    def create_model(self):
        """Return a :class:`~sagemaker.amazon.image_classification.ImageClassificationModel` referencing the latest
        s3 model data produced by this Estimator."""
        return ImageClassificationModel(self.model_data, self.role,
                                        self.sagemaker_session)

    def hyperparameters(self):
        """Return the SageMaker hyperparameters for training this ImageClassification Estimator."""
        # Copy the base-class hyperparameters without shadowing the module-level
        # ``hp`` helper used for the class attributes above.
        return dict(super(ImageClassification, self).hyperparameters())