def _get_data(feature, annotations):
    """
    Build a deterministic synthetic object-detection dataset.

    Generates 100 random RGB images (each side between 100 and 999 pixels),
    stored as a random mix of PNG, JPEG and raw images, and pairs every image
    with a list of 0 to 9 random rectangle annotations whose labels are drawn
    from the module-level ``_CLASSES``. A fixed random seed is used, so the
    same sequence of random draws is made on every call.

    Parameters
    ----------
    feature : str
        Name of the output column that holds the generated images.

    annotations : str
        Name of the output column that holds the per-image annotation lists.

    Returns
    -------
    out : tc.SFrame
        An SFrame with two columns, named by `feature` and `annotations`.
        Each annotation is a dict with the keys "coordinates" (center "x" and
        "y" plus "width" and "height"), "label" and "type" (always
        "rectangle").
    """
    from PIL import Image as _PIL_Image

    rs = np.random.RandomState(1234)

    def from_pil_image(pil_img, image_format="png"):
        """Convert a PIL image into a ``tc.Image`` of the requested format."""
        if image_format == "raw":
            # Hand the uncompressed pixel buffer straight to tc.Image.
            image = np.array(pil_img)
            FORMAT_RAW = 2
            return tc.Image(
                _image_data=image.tobytes(),
                _width=image.shape[1],
                _height=image.shape[0],
                _channels=image.shape[2],
                _format_enum=FORMAT_RAW,
                _image_data_size=image.size,
            )

        # Encoded formats take a round trip through a temporary file that
        # tc.Image then loads by path.
        with tempfile.NamedTemporaryFile(mode="w+b",
                                         suffix="." + image_format) as f:
            pil_img.save(f, format=image_format)
            # Make sure every encoded byte has reached the file before it is
            # re-opened by name.
            f.flush()
            return tc.Image(f.name)

    num_examples = 100
    max_num_boxes_per_image = 10
    classes = _CLASSES
    images = []
    anns = []
    FORMATS = ["png", "jpeg", "raw"]
    for _ in range(num_examples):
        # Randomly determine image size (should handle large and small)
        img_shape = tuple(rs.randint(100, 1000, size=2)) + (3, )

        # randint returns a platform integer array; PIL needs 8-bit channels
        # to interpret a (height, width, 3) array as RGB. Casting after the
        # draw keeps the random stream identical while giving PIL real pixel
        # data instead of the padding bytes of wide integers.
        img = rs.randint(255, size=img_shape).astype(np.uint8)

        # The RGB mode is inferred from the uint8 (height, width, 3) array.
        pil_img = _PIL_Image.fromarray(img)

        # Randomly select image format
        image_format = FORMATS[rs.randint(len(FORMATS))]
        images.append(from_pil_image(pil_img, image_format=image_format))

        ann = []
        for _ in range(rs.randint(max_num_boxes_per_image)):
            # Sorting two random coordinates per axis guarantees
            # left <= right and top <= bottom.
            left, right = np.sort(rs.randint(0, img_shape[1], size=2))
            top, bottom = np.sort(rs.randint(0, img_shape[0], size=2))

            # Boxes are described by their center point ...
            x = (left + right) / 2
            y = (top + bottom) / 2

            # ... and by an extent that is never allowed to collapse to zero.
            width = max(right - left, 1)
            height = max(bottom - top, 1)

            label = {
                "coordinates": {
                    "x": x,
                    "y": y,
                    "width": width,
                    "height": height,
                },
                "label": classes[rs.randint(len(classes))],
                "type": "rectangle",
            }
            ann.append(label)
        anns.append(ann)

    data = tc.SFrame({
        feature: tc.SArray(images),
        annotations: tc.SArray(anns),
    })
    return data
Пример #2
0
    def predict_topk(self,
                     dataset,
                     output_type='probability',
                     k=3,
                     verbose=True,
                     batch_size=64):
        """
        Return top-k predictions for the ``dataset``.
        Predictions are returned as an SFrame with three columns: `id`,
        `class`, and `probability` or `rank` depending on the ``output_type``
        parameter.

        Parameters
        ----------
        dataset : SFrame | SArray | dict
            The audio data to be classified.
            If dataset is an SFrame, it must have a column with the same name as
            the feature used for model training, but does not require a target
            column. Additional columns are ignored.

        output_type : {'probability', 'rank'}, optional
            Choose the return type of the prediction:
            - `probability`: Probability associated with each label in the prediction.
            - `rank`       : Rank associated with each label in the prediction.
                             Ranks start at 0 for the most probable class.

        k : int, optional
            Number of classes to return for each input example.

        verbose : bool, optional
            If True, prints progress updates and model details.

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve performance.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions.

        Raises
        ------
        AssertionError
            If ``output_type`` is neither 'probability' nor 'rank'.

        See Also
        --------
        predict, classify, evaluate

        Examples
        --------
        >>> pred = m.predict_topk(validation_data, k=3)
        >>> pred
        +------+-------+-------------------+
        |  id  | class |    probability    |
        +------+-------+-------------------+
        |  0   |   4   |   0.995623886585  |
        |  0   |   9   |  0.0038311756216  |
        |  0   |   7   | 0.000301006948575 |
        |  1   |   1   |   0.928708016872  |
        |  1   |   3   |  0.0440889261663  |
        |  1   |   2   |  0.0176190119237  |
        |  2   |   3   |   0.996967732906  |
        |  2   |   2   |  0.00151345680933 |
        |  2   |   7   | 0.000637513934635 |
        |  3   |   1   |   0.998070061207  |
        | ...  |  ...  |        ...        |
        +------+-------+-------------------+
        """
        # Forward the caller's batch size; it used to be pinned to 64, which
        # silently ignored the `batch_size` argument.
        prob_vector = self.predict(dataset,
                                   output_type='probability_vector',
                                   verbose=verbose,
                                   batch_size=batch_size)
        id_to_label = self._id_to_class_label

        # argsort is ascending, so the last k indices are the k most probable
        # classes; reversing them lists the most probable class first.
        if output_type == 'probability':
            results = prob_vector.apply(lambda p: [{
                'class': id_to_label[i],
                'probability': p[i]
            } for i in reversed(_np.argsort(p)[-k:])])
        else:
            assert (output_type == 'rank')
            results = prob_vector.apply(lambda p: [{
                'class': id_to_label[i],
                'rank': rank
            } for rank, i in enumerate(reversed(_np.argsort(p)[-k:]))])

        # Turn the per-example lists of dicts into one row per
        # (example, class) pair with the dict keys unpacked into columns.
        results = _tc.SFrame({'X': results})
        results = results.add_row_number()
        results = results.stack('X', new_column_name='X')
        results = results.unpack('X', column_name_prefix='')
        return results
Пример #3
0
def create(dataset,
           label=None,
           feature=None,
           model='resnet-50',
           verbose=True,
           batch_size=64):
    """
    Create a :class:`ImageSimilarityModel` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    label : string
        Name of the SFrame column with row labels to be used as uuid's to
        identify the data. If 'label' is set to None, row numbers are used to
        identify reference dataset rows when the model is queried.

    feature : string
        Name of the column containing the input images. 'None' (the default)
        indicates that the SFrame has only one column of Image type and that
        column will be used for similarity.

    model: string, optional
        Uses a pretrained model to bootstrap an image similarity model

           - "resnet-50" : Uses a pretrained resnet model.

           - "squeezenet_v1.1" : Uses a pretrained squeezenet model.

           - "VisionFeaturePrint_Screen": Uses an OS internal feature extractor.
                                          Only available on iOS 12.0+,
                                          macOS 10.14+ and tvOS 12.0+.

        Models are downloaded from the internet if not available locally. Once
        downloaded, the models are cached for future use.

    verbose : bool, optional
        If True, print progress updates and model details.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve performance.

    Returns
    -------
    out : ImageSimilarityModel
        A trained :class:`ImageSimilarityModel` model.

    Raises
    ------
    ToolkitError
        If `dataset` is empty, or if `label` or `feature` is given but is not
        a column of `dataset`.

    ValueError
        If `batch_size` is smaller than 1.

    See Also
    --------
    ImageSimilarityModel

    Examples
    --------
    .. sourcecode:: python

        # Train an image similarity model
        >>> model = turicreate.image_similarity.create(data)

        # Query the model for similar images
        >>> similar_images = model.query(data)
        +-------------+-----------------+-------------------+------+
        | query_label | reference_label |      distance     | rank |
        +-------------+-----------------+-------------------+------+
        |      0      |        0        |        0.0        |  1   |
        |      0      |       519       |   12.5319706301   |  2   |
        |      0      |       1619      |   12.5563764596   |  3   |
        |      0      |       186       |   12.6132604915   |  4   |
        |      0      |       1809      |   12.9180964745   |  5   |
        |      1      |        1        | 2.02304872852e-06 |  1   |
        |      1      |       1579      |   11.4288186151   |  2   |
        |      1      |       1237      |   12.3764325949   |  3   |
        |      1      |        80       |   12.7264363676   |  4   |
        |      1      |        58       |   12.7675058558   |  5   |
        +-------------+-----------------+-------------------+------+
        [500 rows x 4 columns]
    """
    start_time = _time.time()

    # The OS-provided extractor is only offered on macOS 10.14 or newer; the
    # bundled pre-trained models are always accepted.
    supported_models = list(_pre_trained_models.MODELS.keys())
    if _mac_ver() >= (10, 14):
        supported_models.append('VisionFeaturePrint_Screen')
    _tkutl._check_categorical_option_type('model', model, supported_models)

    # Reject unusable input before any expensive work starts.
    if not len(dataset):
        raise _ToolkitError('Unable to train on empty dataset')
    if label is not None:
        if label not in dataset.column_names():
            raise _ToolkitError("Row label column '%s' does not exist" % label)
    if feature is not None:
        if feature not in dataset.column_names():
            raise _ToolkitError("Image feature column '%s' does not exist" %
                                feature)
    if batch_size < 1:
        raise ValueError("'batch_size' must be greater than or equal to 1")

    # Without an explicit feature, fall back to the dataset's image column.
    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)

    # Run every image through the extractor and collect the results in a
    # single-column reference frame.
    feature_extractor = _image_feature_extractor._create_feature_extractor(
        model)
    deep_features = feature_extractor.extract_features(dataset,
                                                       feature,
                                                       verbose=verbose,
                                                       batch_size=batch_size)
    reference_data = _tc.SFrame({'__image_features__': deep_features})

    # Carry the row labels along so query results can refer to them.
    if label is not None:
        reference_data[label] = dataset[label]

    # The similarity model is a nearest-neighbors model over the features.
    similarity_model = _tc.nearest_neighbors.create(
        reference_data,
        label=label,
        features=['__image_features__'],
        verbose=verbose)

    # Record the input image shape that goes with the chosen model.
    if model not in _pre_trained_models.MODELS:
        # model == VisionFeaturePrint_Screen
        input_image_shape = (3, 299, 299)
    else:
        input_image_shape = _pre_trained_models.MODELS[model].input_image_shape

    # Bundle everything the model object needs.
    state = {
        'similarity_model': similarity_model,
        'model': model,
        'feature_extractor': feature_extractor,
        'input_image_shape': input_image_shape,
        'label': label,
        'feature': feature,
        'num_features': 1,
        'num_examples': similarity_model.num_examples,
        'training_time': _time.time() - start_time,
    }
    return ImageSimilarityModel(state)
Пример #4
0
def create(
    dataset,
    features=None,
    distance=None,
    radius=1.0,
    min_core_neighbors=10,
    verbose=True,
):
    """
    Create a DBSCAN clustering model. The DBSCAN method partitions the input
    dataset into three types of points, based on the estimated probability
    density at each point.

    - **Core** points have a large number of points within a given neighborhood.
      Specifically, at least `min_core_neighbors` other points must be within
      distance `radius` of a point for it to be considered a core point.

    - **Boundary** points are within distance `radius` of a core point, but
      don't have sufficient neighbors of their own to be considered core.

    - **Noise** points comprise the remainder of the data. These points have too
      few neighbors to be considered core points, and are further than distance
      `radius` from all core points.

    Clusters are formed by connecting core points that are neighbors of each
    other, then assigning boundary points to their nearest core neighbor's
    cluster.

    Parameters
    ----------
    dataset : SFrame
        Training data, with each row corresponding to an observation. Must
        include all features specified in the `features` parameter, but may have
        additional columns as well.

    features : list[str], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns of the input `dataset` should
        be used to train the model. All features must be numeric, i.e. integer
        or float types.

    distance : str or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (str)

          2. standard distance name (str)

          3. scaling factor (int or float)

        For more information about Turi Create distance functions, please
        see the :py:mod:`~turicreate.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified, a composite distance is constructed
        automatically based on feature types.

    radius : int or float, optional
        Size of each point's neighborhood, with respect to the specified
        distance function. Must be non-negative.

    min_core_neighbors : int, optional
        Number of neighbors that must be within distance `radius` of a point in
        order for that point to be considered a "core point" of a cluster. Must
        be a non-negative integer.

    verbose : bool, optional
        If True, print progress updates and model details during model creation.

    Returns
    -------
    out : DBSCANModel
        A model containing a cluster label for each row in the input `dataset`.
        Also contains the indices of the core points, cluster boundary points,
        and noise points.

    Raises
    ------
    ValueError
        If `min_core_neighbors` is not a non-negative integer, or if `radius`
        is not a non-negative integer or float.

    See Also
    --------
    DBSCANModel, turicreate.toolkits.distances

    Notes
    -----
    - Our implementation of DBSCAN first computes the similarity graph on the
      input dataset, which can be a computationally intensive process. In the
      current implementation, some distances are substantially faster than
      others; in particular "euclidean", "squared_euclidean", "cosine", and
      "transformed_dot_product" are quite fast, while composite distances can be
      slow.

    - Any distance function in the Turi Create library may be used with DBSCAN but
      the results may be poor for distances that violate the standard metric
      properties, i.e. symmetry, non-negativity, triangle inequality, and
      identity of indiscernibles. In particular, the DBSCAN algorithm is based
      on the concept of connecting high-density points that are *close* to each
      other into a single cluster, but the notion of *close* may be very
      counterintuitive if the chosen distance function is not a valid metric.
      The distances "euclidean", "manhattan", "jaccard", and "levenshtein" will
      likely yield the best results.

    References
    ----------
    - Ester, M., et al. (1996) `A Density-Based Algorithm for Discovering
      Clusters in Large Spatial Databases with Noise
      <https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf>`_. In Proceedings of the
      Second International Conference on Knowledge Discovery and Data Mining.
      pp. 226-231.

    - `Wikipedia - DBSCAN <https://en.wikipedia.org/wiki/DBSCAN>`_

    - `Visualizing DBSCAN Clustering
      <http://www.naftaliharris.com/blog/visualizing-dbscan-clustering/>`_

    Examples
    --------
    >>> sf = turicreate.SFrame({
    ...     'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162,
    ...            8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020],
    ...     'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305,
    ...            5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]})
    ...
    >>> model = turicreate.dbscan.create(sf, radius=4.25, min_core_neighbors=3)
    >>> model.cluster_id.print_rows(15)
    +--------+------------+----------+
    | row_id | cluster_id |   type   |
    +--------+------------+----------+
    |   8    |     0      |   core   |
    |   7    |     2      |   core   |
    |   0    |     1      |   core   |
    |   2    |     2      |   core   |
    |   3    |     1      |   core   |
    |   11   |     2      |   core   |
    |   4    |     2      |   core   |
    |   1    |     0      | boundary |
    |   6    |     0      | boundary |
    |   5    |     0      | boundary |
    |   9    |     0      | boundary |
    |   12   |     2      | boundary |
    |   10   |     1      | boundary |
    |   13   |     1      | boundary |
    +--------+------------+----------+
    [14 rows x 3 columns]
    """
    ## Start the training time clock and set up the progress logger
    logger = _logging.getLogger(__name__)
    start_time = _time.time()

    ## Validate the input dataset
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Validate neighborhood parameters
    if not isinstance(min_core_neighbors, int) or min_core_neighbors < 0:
        raise ValueError("Input 'min_core_neighbors' must be a non-negative " +
                         "integer.")

    if not isinstance(radius, (int, float)) or radius < 0:
        raise ValueError("Input 'radius' must be a non-negative integer " +
                         "or float.")

    ## Compute all-point nearest neighbors within `radius` and count
    #  neighborhood sizes
    knn_model = _tc.nearest_neighbors.create(
        dataset,
        features=features,
        distance=distance,
        method="brute_force",
        verbose=verbose,
    )

    # k=None leaves the number of neighbors uncapped, so `radius` alone decides
    # which edges exist; self edges are excluded so a point does not count as
    # its own neighbor.
    knn = knn_model.similarity_graph(
        k=None,
        radius=radius,
        include_self_edges=False,
        output_type="SFrame",
        verbose=verbose,
    )

    # One row per query point that has at least one edge, holding the number of
    # its edges in the "Count" column.
    neighbor_counts = knn.groupby("query_label", _agg.COUNT)

    ### NOTE: points with NO neighbors are already dropped here!

    ## Identify core points and boundary candidate points. Not all of the
    #  boundary candidates will be boundary points - some are in small isolated
    #  clusters.
    if verbose:
        logger.info("Identifying noise points and core points.")

    # Fewer than `min_core_neighbors` neighbors makes a point a boundary
    # candidate; every other point in `neighbor_counts` is a core point.
    # NOTE(review): `1 - boundary_mask` assumes the comparison yields a 0/1
    # SArray, so the subtraction gives its complement - confirm.
    boundary_mask = neighbor_counts["Count"] < min_core_neighbors
    core_mask = 1 - boundary_mask

    # this includes too small clusters
    boundary_idx = neighbor_counts[boundary_mask]["query_label"]
    core_idx = neighbor_counts[core_mask]["query_label"]

    ## Build a similarity graph on the core points
    ## NOTE: careful with singleton core points - the second filter removes them
    #  from the edge set so they have to be added separately as vertices.
    if verbose:
        logger.info("Constructing the core point similarity graph.")

    # Edges that start at a core point, then only those that also end at one.
    core_vertices = knn.filter_by(core_idx, "query_label")
    core_edges = core_vertices.filter_by(core_idx, "reference_label")

    core_graph = _tc.SGraph()
    core_graph = core_graph.add_vertices(core_vertices[["query_label"]],
                                         vid_field="query_label")
    core_graph = core_graph.add_edges(core_edges,
                                      src_field="query_label",
                                      dst_field="reference_label")

    ## Compute core point connected components and relabel to be consecutive
    #  integers
    cc = _tc.connected_components.create(core_graph, verbose=verbose)

    # The row number of each component in `component_size` becomes its cluster
    # label; joining on "component_id" attaches that label to every core vertex.
    cc_labels = cc.component_size.add_row_number("__label")
    core_assignments = cc.component_id.join(cc_labels,
                                            on="component_id",
                                            how="left")[["__id", "__label"]]
    core_assignments["type"] = "core"

    ## Join potential boundary points to core cluster labels (points that aren't
    #  really on a boundary are implicitly dropped)
    if verbose:
        logger.info("Processing boundary points.")

    boundary_edges = knn.filter_by(boundary_idx, "query_label")

    # separate real boundary points from points in small isolated clusters
    boundary_core_edges = boundary_edges.filter_by(core_idx, "reference_label")

    # join a boundary point to its single closest core point.
    boundary_assignments = boundary_core_edges.groupby(
        "query_label",
        {"reference_label": _agg.ARGMIN("rank", "reference_label")})

    # Look up the cluster label of that closest core point.
    boundary_assignments = boundary_assignments.join(
        core_assignments, on={"reference_label": "__id"})

    # Reshape to the same (__id, __label, type) layout as `core_assignments`.
    boundary_assignments = boundary_assignments.rename({"query_label": "__id"},
                                                       inplace=True)
    boundary_assignments = boundary_assignments.remove_column(
        "reference_label", inplace=True)
    boundary_assignments["type"] = "boundary"

    ## Identify boundary candidates that turned out to be in small clusters but
    #  not on real cluster boundaries
    small_cluster_idx = set(boundary_idx).difference(
        boundary_assignments["__id"])

    ## Identify individual noise points by the fact that they have no neighbors.
    #  NOTE(review): this assumes the query labels are the row numbers
    #  0..num_rows-1 (no `label` is passed to nearest_neighbors.create above) -
    #  confirm.
    noise_idx = set(range(dataset.num_rows())).difference(
        neighbor_counts["query_label"])

    noise_idx = noise_idx.union(small_cluster_idx)

    # Noise points get a missing cluster id. The cast presumably gives the
    # all-None column an integer type so it matches the cluster_id column of
    # the other frames when they are appended - confirm.
    noise_assignments = _tc.SFrame(
        {"row_id": _tc.SArray(list(noise_idx), int)})
    noise_assignments["cluster_id"] = None
    noise_assignments["cluster_id"] = noise_assignments["cluster_id"].astype(
        int)
    noise_assignments["type"] = "noise"

    ## Append core, boundary, and noise results to each other.
    master_assignments = _tc.SFrame()
    num_clusters = 0

    # Each non-empty group is renamed to the public (row_id, cluster_id) column
    # names before being appended; empty groups are skipped.
    if core_assignments.num_rows() > 0:
        core_assignments = core_assignments.rename(
            {
                "__id": "row_id",
                "__label": "cluster_id"
            }, inplace=True)
        master_assignments = master_assignments.append(core_assignments)
        # Clusters are defined by core points only, so this is the full count.
        num_clusters = len(core_assignments["cluster_id"].unique())

    if boundary_assignments.num_rows() > 0:
        boundary_assignments = boundary_assignments.rename(
            {
                "__id": "row_id",
                "__label": "cluster_id"
            }, inplace=True)
        master_assignments = master_assignments.append(boundary_assignments)

    if noise_assignments.num_rows() > 0:
        master_assignments = master_assignments.append(noise_assignments)

    ## Post-processing and formatting
    state = {
        "verbose": verbose,
        "radius": radius,
        "min_core_neighbors": min_core_neighbors,
        "distance": knn_model.distance,
        "num_distance_components": knn_model.num_distance_components,
        "num_examples": dataset.num_rows(),
        "features": knn_model.features,
        "num_features": knn_model.num_features,
        "unpacked_features": knn_model.unpacked_features,
        "num_unpacked_features": knn_model.num_unpacked_features,
        "cluster_id": master_assignments,
        "num_clusters": num_clusters,
        "training_time": _time.time() - start_time,
    }

    return DBSCANModel(state)
Пример #5
0
def create(dataset,
           target,
           feature,
           max_iterations=10,
           custom_layer_sizes=[100, 100],
           verbose=True,
           validation_set='auto',
           batch_size=64):
    '''
    Creates a :class:`SoundClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    target : string or int
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.

    feature : string, optional
        Name of the column containing the feature column. This column must
        contain audio data or deep audio features.
        Audio data is represented as dicts with key 'data' and 'sample_rate',
        see `turicreate.load_audio(...)`.
        Deep audio features are represented as a list of numpy arrays, each of
        size 12288, see `turicreate.sound_classifier.get_deep_features(...)`.

    max_iterations : int, optional
        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model. Consider
        increasing this (the default value is 10) if the training accuracy is
        low.

    custom_layer_sizes : list of ints
        Specifies the architecture of the custom neural network. This neural
        network is made up of a series of dense layers. This parameter allows
        you to specify how many layers and the number of units in each layer.
        The custom neural network will always have one more layer than the
        length of this list. The last layer is always a soft max with units
        equal to the number of classes.

    verbose : bool, optional
        If True, prints progress updates and model details.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance. The
        format of this SFrame must be the same as the training dataset. By
        default, a validation set is automatically sampled. If `validation_set`
        is set to None, no validataion is used. You can also pass a validation
        set you have constructed yourself.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve performance.
    '''
    import time
    from .._mxnet import _mxnet_utils
    import mxnet as mx

    from ._audio_feature_extractor import _get_feature_extractor

    start_time = time.time()

    # check parameters
    if len(dataset) == 0:
        raise _ToolkitError('Unable to train on empty dataset')
    if feature not in dataset.column_names():
        raise _ToolkitError("Audio feature column '%s' does not exist" %
                            feature)
    if not _is_deep_feature_sarray(
            dataset[feature]) and not _is_audio_data_sarray(dataset[feature]):
        raise _ToolkitError("'%s' column is not audio data." % feature)
    if target not in dataset.column_names():
        raise _ToolkitError("Target column '%s' does not exist" % target)
    if not _tc.util._is_non_string_iterable(custom_layer_sizes) or len(
            custom_layer_sizes) == 0:
        raise _ToolkitError("'custom_layer_sizes' must be a non-empty list.")
    for i in custom_layer_sizes:
        if not isinstance(i, int):
            raise _ToolkitError(
                "'custom_layer_sizes' must contain only integers.")
    if not (isinstance(validation_set, _tc.SFrame) or validation_set == 'auto'
            or validation_set is None):
        raise TypeError("Unrecognized value for 'validation_set'")
    if isinstance(validation_set, _tc.SFrame):
        if feature not in validation_set.column_names(
        ) or target not in validation_set.column_names():
            raise ValueError(
                "The 'validation_set' SFrame must be in the same format as the 'dataset'"
            )
    if batch_size < 1:
        raise ValueError('\'batch_size\' must be greater than or equal to 1')

    classes = list(dataset[target].unique().sort())
    num_labels = len(classes)
    if num_labels <= 1:
        raise ValueError('The number of classes must be greater than one.')
    feature_extractor_name = 'VGGish'
    feature_extractor = _get_feature_extractor(feature_extractor_name)
    class_label_to_id = {l: i for i, l in enumerate(classes)}

    # create the validation set
    if not isinstance(validation_set, _tc.SFrame) and validation_set == 'auto':
        if len(dataset) >= 100:
            print(
                "Creating a validation set from 5 percent of training data. This may take a while.\n"
                "\tYou can set ``validation_set=None`` to disable validation tracking.\n"
            )
            dataset, validation_set = dataset.random_split(0.95, exact=True)
        else:
            validation_set = None

    encoded_target = dataset[target].apply(lambda x: class_label_to_id[x])

    if _is_deep_feature_sarray(dataset[feature]):
        train_deep_features = dataset[feature]
    else:
        # do the preprocess and VGGish feature extraction
        train_deep_features = get_deep_features(dataset[feature],
                                                verbose=verbose)

    train_data = _tc.SFrame({
        'deep features': train_deep_features,
        'labels': encoded_target
    })
    train_data = train_data.stack('deep features',
                                  new_column_name='deep features')
    train_data, missing_ids = train_data.dropna_split(
        columns=['deep features'])

    if len(missing_ids) > 0:
        _logging.warning(
            "Dropping %d examples which are less than 975ms in length." %
            len(missing_ids))

    if validation_set is not None:
        if verbose:
            print("Preparing validataion set")
        validation_encoded_target = validation_set[target].apply(
            lambda x: class_label_to_id[x])

        if _is_deep_feature_sarray(validation_set[feature]):
            validation_deep_features = validation_set[feature]
        else:
            validation_deep_features = get_deep_features(
                validation_set[feature], verbose=verbose)

        validation_data = _tc.SFrame({
            'deep features': validation_deep_features,
            'labels': validation_encoded_target
        })
        validation_data = validation_data.stack(
            'deep features', new_column_name='deep features')
        validation_data = validation_data.dropna(columns=['deep features'])

        validation_batch_size = min(len(validation_data), batch_size)
        validation_data = mx.io.NDArrayIter(
            validation_data['deep features'].to_numpy(),
            label=validation_data['labels'].to_numpy(),
            batch_size=validation_batch_size)
    else:
        validation_data = []

    if verbose:
        print("\nTraining a custom neural network -")

    training_batch_size = min(len(train_data), batch_size)
    train_data = mx.io.NDArrayIter(train_data['deep features'].to_numpy(),
                                   label=train_data['labels'].to_numpy(),
                                   batch_size=training_batch_size,
                                   shuffle=True)

    custom_NN = SoundClassifier._build_custom_neural_network(
        feature_extractor.output_length, num_labels, custom_layer_sizes)
    ctx = _mxnet_utils.get_mxnet_context()
    custom_NN.initialize(mx.init.Xavier(), ctx=ctx)

    trainer = mx.gluon.Trainer(custom_NN.collect_params(), 'nag', {
        'learning_rate': 0.01,
        'momentum': 0.9
    })

    if verbose:
        # Setup progress table
        row_ids = ['iteration', 'train_accuracy', 'time']
        row_display_names = ['Iteration', 'Training Accuracy', 'Elapsed Time']
        if validation_data:
            row_ids.insert(2, 'validation_accuracy')
            row_display_names.insert(2, 'Validation Accuracy (%)')
        table_printer = _tc.util._ProgressTablePrinter(row_ids,
                                                       row_display_names)

    train_metric = mx.metric.Accuracy()
    if validation_data:
        validation_metric = mx.metric.Accuracy()
    softmax_cross_entropy_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    for i in range(max_iterations):
        # TODO: early stopping

        for batch in train_data:
            data = mx.gluon.utils.split_and_load(batch.data[0],
                                                 ctx_list=ctx,
                                                 batch_axis=0,
                                                 even_split=False)
            label = mx.gluon.utils.split_and_load(batch.label[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0,
                                                  even_split=False)

            # Inside training scope
            with mx.autograd.record():
                for x, y in zip(data, label):
                    z = custom_NN(x)
                    # Computes softmax cross entropy loss.
                    loss = softmax_cross_entropy_loss(z, y)
                    # Backpropagate the error for one iteration.
                    loss.backward()
            # Make one step of parameter update. Trainer needs to know the
            # batch size of data to normalize the gradient by 1/batch_size.
            trainer.step(batch.data[0].shape[0])
        train_data.reset()

        # Calculate training metric
        for batch in train_data:
            data = mx.gluon.utils.split_and_load(batch.data[0],
                                                 ctx_list=ctx,
                                                 batch_axis=0,
                                                 even_split=False)
            label = mx.gluon.utils.split_and_load(batch.label[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0,
                                                  even_split=False)
            outputs = [custom_NN(x) for x in data]
            train_metric.update(label, outputs)
        train_data.reset()

        # Calculate validataion metric
        for batch in validation_data:
            data = mx.gluon.utils.split_and_load(batch.data[0],
                                                 ctx_list=ctx,
                                                 batch_axis=0,
                                                 even_split=False)
            label = mx.gluon.utils.split_and_load(batch.label[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0,
                                                  even_split=False)
            outputs = [custom_NN(x) for x in data]
            validation_metric.update(label, outputs)

        # Get metrics, print progress table
        _, train_accuracy = train_metric.get()
        train_metric.reset()
        printed_row_values = {
            'iteration': i + 1,
            'train_accuracy': train_accuracy
        }
        if validation_data:
            _, validataion_accuracy = validation_metric.get()
            printed_row_values['validation_accuracy'] = validataion_accuracy
            validation_metric.reset()
            validation_data.reset()
        if verbose:
            printed_row_values['time'] = time.time() - start_time
            table_printer.print_row(**printed_row_values)

    state = {
        '_class_label_to_id': class_label_to_id,
        '_custom_classifier': custom_NN,
        '_feature_extractor': feature_extractor,
        '_id_to_class_label': {v: k
                               for k, v in class_label_to_id.items()},
        'classes': classes,
        'custom_layer_sizes': custom_layer_sizes,
        'feature': feature,
        'feature_extractor_name': feature_extractor.name,
        'num_classes': num_labels,
        'num_examples': len(dataset),
        'target': target,
        'training_accuracy': train_accuracy,
        'training_time': time.time() - start_time,
        'validation_accuracy':
        validataion_accuracy if validation_data else None,
    }
    return SoundClassifier(state)
 def test_categorical_2(self):
     """Run the shared check on two categorical columns with bracketed names."""
     # Column names deliberately contain '[' and ']'.
     columns = {
         'cat[1]': ['1', '1', '2', '2', '2'] * 100,
         'cat[2]': ['1', '3', '3', '1', '1'] * 100,
     }
     frame = tc.SFrame(columns)
     self._run_test(frame, 4)
 def setUpClass(self):
     """Seed NumPy's RNG and build a random two-column SFrame fixture."""
     # Fixed seed so the random fixture is reproducible across runs.
     np.random.seed(37)
     self.n = 30
     samples = np.random.rand(self.n, 2)
     self.sf = tc.SFrame(samples)
Пример #8
0
        def get_confusion_matrix(extended_test, labels):
            """
            Build a confusion-matrix SFrame with one row per
            (target label, predicted label) pair.

            Parameters
            ----------
            extended_test : SFrame
                Must provide the columns 'label', 'predicted_label' and
                'probs'. 'probs' is indexed positionally, so ``probs[i]`` is
                read as the value belonging to ``labels[i]``.

            labels : list
                All class labels; every ordered pair of them gets a row.

            Returns
            -------
            out : SFrame
                Columns 'target_label', 'predicted_label', 'count' (number of
                rows of `extended_test` with that pair), 'prob' (sum of the
                per-class values for that pair) and 'norm_prob' ('prob'
                divided by the total 'prob' of the same target label, 0 when
                that total is 0).
            """
            # Seed the matrix with every (target, predicted) combination so
            # that pairs never observed in the data still get a row.
            sf_confusion_matrix = {
                'label': [],
                'predicted_label': [],
                'prob_default': []
            }
            for target_l in labels:
                for predicted_l in labels:
                    sf_confusion_matrix['label'].append(target_l)
                    sf_confusion_matrix['predicted_label'].append(predicted_l)
                    sf_confusion_matrix['prob_default'].append(0)

            # Attach the observed counts; unobserved pairs become 0.
            sf_confusion_matrix = _tc.SFrame(sf_confusion_matrix)
            sf_confusion_matrix = sf_confusion_matrix.join(
                extended_test.groupby(['label', 'predicted_label'],
                                      {'count': _tc.aggregate.COUNT}),
                how='left',
                on=['label', 'predicted_label'])
            sf_confusion_matrix = sf_confusion_matrix.fillna('count', 0)

            # Reshape to long form: one row per (example, candidate label)
            # carrying the value stored for that candidate label.
            label_column = _tc.SFrame({'label': extended_test['label']})
            predictions = extended_test['probs']
            for i in range(len(labels)):
                new_test_data = label_column.add_columns([
                    predictions.apply(lambda probs: probs[i]),
                    predictions.apply(lambda probs: labels[i])
                ], ['prob', 'predicted_label'])
                if i == 0:
                    test_longer_form = new_test_data
                else:
                    test_longer_form = test_longer_form.append(new_test_data)

            # Compare by value: `is 0` only worked through CPython's small-int
            # caching and triggers a SyntaxWarning on Python 3.8+.
            if len(extended_test) == 0:
                # Nothing to aggregate; reuse the all-zero default column.
                sf_confusion_matrix = sf_confusion_matrix.rename({
                    'prob_default':
                    'prob',
                    'label':
                    'target_label'
                })
            else:
                sf_confusion_matrix = sf_confusion_matrix.join(
                    test_longer_form.groupby(
                        ['label', 'predicted_label'],
                        {'prob': _tc.aggregate.SUM('prob')}),
                    how='left',
                    on=['label', 'predicted_label'])
                sf_confusion_matrix = sf_confusion_matrix.rename({
                    'label':
                    'target_label'
                }).fillna('prob', 0)

            def wo_divide_by_zero(a, b):
                # Yield a missing value instead of raising on a zero total.
                if b == 0:
                    return None
                else:
                    return a * 1.0 / b

            # NOTE(review): this assignment assumes the joined rows come back
            # in the same order as sf_confusion_matrix -- confirm.
            sf_confusion_matrix['norm_prob'] = sf_confusion_matrix.join(
                sf_confusion_matrix.groupby(
                    'target_label', {'sum_prob': _tc.aggregate.SUM('prob')}),
                how='left').apply(
                    lambda x: wo_divide_by_zero(x['prob'], x['sum_prob']))
            return sf_confusion_matrix.fillna('norm_prob', 0)
Пример #9
0
    def test_categorical(self):
        """Check the root node and every prediction path of a tree built
        from two categorical features."""

        def _step(child_id, feature, node_id, sign):
            # One expected entry of a prediction path. Every split in the
            # expected tree is an indicator test against category '1'.
            return {
                'child_id': child_id,
                'feature': feature,
                'index': '1',
                'node_id': node_id,
                'node_type': 'indicator',
                'sign': sign,
                'value': 1,
                'is_missing': False,
            }

        # Arrange
        sf = tc.SFrame({
            'cat1': ['1', '1', '2', '2', '2'] * 100,
            'cat2': ['1', '3', '3', '1', '1'] * 100,
            'target': ['1', '2', '1', '2', '1'] * 100,
        })

        # Act
        tree = _make_tree(sf)
        root = tree.root

        # Check the root node.
        # assertEqual replaces the assertEquals alias, which was deprecated
        # in Python 3.2 and removed in Python 3.12.
        self.assertEqual(len(tree.nodes), 7)
        self.assertEqual(
            root.to_dict(), {
                'is_leaf': False,
                'left_id': 2,
                'node_id': 0,
                'missing_id': 1,
                'node_type': u'indicator',
                'parent_id': None,
                'right_id': 1,
                'split_feature_column': 'cat1',
                'split_feature_index': '1',
                'value': 1
            })

        # Check prediction paths, in node-id order.
        expected_paths = [
            (0, []),
            (1, [_step(1, 'cat1', 0, '!=')]),
            (2, [_step(2, 'cat1', 0, '=')]),
            (3, [_step(1, 'cat1', 0, '!='), _step(3, 'cat2', 1, '!=')]),
            (4, [_step(1, 'cat1', 0, '!='), _step(4, 'cat2', 1, '=')]),
            (5, [_step(2, 'cat1', 0, '='), _step(5, 'cat2', 2, '!=')]),
            (6, [_step(2, 'cat1', 0, '='), _step(6, 'cat2', 2, '=')]),
        ]
        for node_id, expected in expected_paths:
            self.assertEqual(tree.get_prediction_path(node_id), expected)
class MyfirstappConfig(AppConfig):
    """App configuration for ``myFirstApp``.

    The ML model and the movie SFrame are loaded while the class body
    executes and are kept as class attributes.
    """

    name = 'myFirstApp'

    # ML model, read from disk when this class is defined.
    test_keys_model = tc.load_model(
        '/home/supriy/myENV/venv/DjangoProject/src/myFirstApp/models/keys_model'
    )

    # Movie data, read from disk when this class is defined.
    movie_sframe = tc.SFrame(
        '/home/supriy/myENV/venv/DjangoProject/src/myFirstApp/models/final_django_sframe'
    )
Пример #11
0
def create(
        dataset,
        target,
        feature=None,
        model='resnet-50',
        l2_penalty=0.01,
        l1_penalty=0.0,
        solver='auto',
        feature_rescaling=True,
        convergence_threshold=_DEFAULT_SOLVER_OPTIONS['convergence_threshold'],
        step_size=_DEFAULT_SOLVER_OPTIONS['step_size'],
        lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'],
        max_iterations=_DEFAULT_SOLVER_OPTIONS['max_iterations'],
        class_weights=None,
        validation_set='auto',
        verbose=True,
        seed=None,
        batch_size=64):
    """
    Create a :class:`ImageClassifier` model.

    Image features are extracted with a pretrained feature extractor and a
    logistic classifier is then trained on those features.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    target : string, or int
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. String target variables are
        automatically mapped to integers in the order in which they are provided.
        For example, a target variable with 'cat' and 'dog' as possible
        values is mapped to 0 and 1 respectively with 0 being the base class
        and 1 being the reference class. Use `model.classes` to retrieve
        the order in which the classes are mapped.

    feature : string, optional
        Name of the column containing the input images. 'None' (the default)
        indicates the only image column in `dataset` should be used as the
        feature.

    l2_penalty : float, optional
        Weight on l2 regularization of the model. The larger this weight, the
        more the model coefficients shrink toward 0. This introduces bias into
        the model but decreases variance, potentially leading to better
        predictions. The default value is 0.01; setting this parameter to 0
        corresponds to unregularized logistic regression. See the ridge
        regression reference for more detail.

    l1_penalty : float, optional
        Weight on l1 regularization of the model. Like the l2 penalty, the
        higher the l1 penalty, the more the estimated coefficients shrink toward
        0. The l1 penalty, however, completely zeros out sufficiently small
        coefficients, automatically indicating features that are not useful
        for the model. The default weight of 0 prevents any features from
        being discarded. See the LASSO regression reference for more detail.

    solver : string, optional
        Name of the solver to be used to solve the regression. See the
        references for more detail on each solver. Available solvers are:

        - *auto (default)*: automatically chooses the best solver for the data
          and model parameters.
        - *newton*: Newton-Raphson
        - *lbfgs*: limited memory BFGS
        - *fista*: accelerated gradient descent

        For this model, the Newton-Raphson method is equivalent to the
        iteratively re-weighted least squares algorithm. If the l1_penalty is
        greater than 0, use the 'fista' solver.

        The model is trained using a carefully engineered collection of methods
        that are automatically picked based on the input data. The ``newton``
        method  works best for datasets with plenty of examples and few features
        (long datasets). Limited memory BFGS (``lbfgs``) is a robust solver for
        wide datasets (i.e datasets with many coefficients).  ``fista`` is the
        default solver for l1-regularized linear regression. The solvers are all
        automatically tuned and the default options should function well. See
        the solver options guide for setting additional parameters for each of
        the solvers.

        See the user guide for additional details on how the solver is chosen.
        (see `here
        <https://apple.github.io/turicreate/docs/userguide/supervised-learning/linear-regression.html>`_)

    feature_rescaling : boolean, optional
        Feature rescaling is an important pre-processing step that ensures that
        all features are on the same scale. An l2-norm rescaling is performed
        to make sure that all features are of the same norm. Categorical
        features are also rescaled by rescaling the dummy variables that are
        used to represent them. The coefficients are returned in original scale
        of the problem. This process is particularly useful when features
        vary widely in their ranges.

    convergence_threshold : float, optional
        Convergence is tested using variation in the training objective. The
        variation in the training objective is calculated using the difference
        between the objective values between two steps. Consider reducing this
        below the default value (0.01) for a more accurately trained model.
        Beware of overfitting (i.e a model that works well only on the training
        data) if this parameter is set to a very low value.

    lbfgs_memory_level : float, optional
        The L-BFGS algorithm keeps track of gradient information from the
        previous ``lbfgs_memory_level`` iterations. The storage requirement for
        each of these gradients is the ``num_coefficients`` in the problem.
        Increasing the ``lbfgs_memory_level`` can help improve the quality of
        the model trained. Setting this to more than ``max_iterations`` has the
        same effect as setting it to ``max_iterations``.

    model : string optional
        Uses a pretrained model to bootstrap an image classifier:

           - "resnet-50" : Uses a pretrained resnet model.
                           Exported Core ML model will be ~90M.

           - "squeezenet_v1.1" : Uses a pretrained squeezenet model.
                                 Exported Core ML model will be ~4.7M.

           - "VisionFeaturePrint_Scene": Uses an OS internal feature extractor.
                                          Only available on iOS 12.0+,
                                          macOS 10.14+ and tvOS 12.0+.
                                          Exported Core ML model will be ~41K.

        Models are downloaded from the internet if not available locally. Once
        downloaded, the models are cached for future use.

    step_size : float, optional
        The starting step size to use for the ``fista`` solver. The default is
        set to 1.0, this is an aggressive setting. If the first iteration takes
        a considerable amount of time, reducing this parameter may speed up
        model training.

    class_weights : {dict, `auto`}, optional
        Weights the examples in the training data according to the given class
        weights. If set to `None`, all classes are supposed to have weight one. The
        `auto` mode set the class weight to be inversely proportional to number of
        examples in the training data with the given class.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        The format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    max_iterations : int, optional
        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model. Consider
        increasing this (the default value is 10) if the training accuracy is
        low and the *Grad-Norm* in the display is large.

    verbose : bool, optional
        If True, prints progress updates and model details.

    seed : int, optional
        Seed for random number generation. Set this value to ensure that the
        same model is created every time.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve performance.

    Returns
    -------
    out : ImageClassifier
        A trained :class:`ImageClassifier` model.

    Raises
    ------
    ToolkitError
        If `dataset` is empty, or if the `feature` or `target` column does
        not exist in `dataset`.

    ValueError
        If `batch_size` is smaller than 1.

    TypeError
        If `validation_set` is not an SFrame, 'auto' or None.

    Examples
    --------
    .. sourcecode:: python

        >>> model = turicreate.image_classifier.create(data, target='is_expensive')

        # Make predictions (in various forms)
        >>> predictions = model.predict(data)      # predictions
        >>> predictions = model.classify(data)     # predictions with confidence
        >>> predictions = model.predict_topk(data) # Top-5 predictions (multiclass)

        # Evaluate the model with ground truth data
        >>> results = model.evaluate(data)

    See Also
    --------
    ImageClassifier
    """
    start_time = _time.time()

    # Check model parameter
    allowed_models = list(_pre_trained_models.MODELS.keys())
    if _mac_ver() >= (10, 14):
        allowed_models.append('VisionFeaturePrint_Scene')

        # Also, to make sure existing code doesn't break, replace incorrect name
        # with the correct name version
        if model == "VisionFeaturePrint_Screen":
            print(
                "WARNING: Correct spelling of model name is VisionFeaturePrint_Scene; VisionFeaturePrint_Screen will be removed in subsequent versions."
            )
            model = "VisionFeaturePrint_Scene"

    _tkutl._check_categorical_option_type('model', model, allowed_models)

    # Check dataset parameter
    if len(dataset) == 0:
        raise _ToolkitError('Unable to train on empty dataset')
    if (feature is not None) and (feature not in dataset.column_names()):
        raise _ToolkitError("Image feature column '%s' does not exist" %
                            feature)
    if target not in dataset.column_names():
        raise _ToolkitError("Target column '%s' does not exist" % target)

    if batch_size < 1:
        raise ValueError("'batch_size' must be greater than or equal to 1")

    # The isinstance check comes first so an SFrame is never compared
    # against the string 'auto'.
    if not (isinstance(validation_set, _tc.SFrame) or validation_set == 'auto'
            or validation_set is None):
        raise TypeError("Unrecognized value for 'validation_set'.")

    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)

    feature_extractor = _image_feature_extractor._create_feature_extractor(
        model)

    # Extract features. The target column is carried along here, so it does
    # not need to be assigned again before training.
    extracted_features = _tc.SFrame({
        target:
        dataset[target],
        '__image_features__':
        feature_extractor.extract_features(dataset,
                                           feature,
                                           verbose=verbose,
                                           batch_size=batch_size),
    })
    if isinstance(validation_set, _tc.SFrame):
        extracted_features_validation = _tc.SFrame({
            target:
            validation_set[target],
            '__image_features__':
            feature_extractor.extract_features(validation_set,
                                               feature,
                                               verbose=verbose,
                                               batch_size=batch_size),
        })
    else:
        # 'auto' or None is passed through to the logistic classifier as-is.
        extracted_features_validation = validation_set

    # Train a classifier using the extracted features
    lr_model = _tc.logistic_classifier.create(
        extracted_features,
        features=['__image_features__'],
        target=target,
        max_iterations=max_iterations,
        validation_set=extracted_features_validation,
        seed=seed,
        verbose=verbose,
        l2_penalty=l2_penalty,
        l1_penalty=l1_penalty,
        solver=solver,
        feature_rescaling=feature_rescaling,
        convergence_threshold=convergence_threshold,
        step_size=step_size,
        lbfgs_memory_level=lbfgs_memory_level,
        class_weights=class_weights)

    # set input image shape
    if model in _pre_trained_models.MODELS:
        input_image_shape = _pre_trained_models.MODELS[model].input_image_shape
    else:  # model == VisionFeaturePrint_Scene
        input_image_shape = (3, 299, 299)

    # Save the model
    state = {
        'classifier': lr_model,
        'model': model,
        'max_iterations': max_iterations,
        'feature_extractor': feature_extractor,
        'input_image_shape': input_image_shape,
        'target': target,
        'feature': feature,
        'num_features': 1,
        'num_classes': lr_model.num_classes,
        'classes': lr_model.classes,
        'num_examples': lr_model.num_examples,
        'training_time': _time.time() - start_time,
        'training_loss': lr_model.training_loss,
    }
    return ImageClassifier(state)
Пример #12
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May  6 15:02:14 2020

@author: sandeepkompella
"""

import turicreate as tc

# Load the song data and report its columns and row count.
song_data = tc.SFrame('song_data')
print(song_data.column_names())
print(len(song_data))

# Number of distinct users: de-duplicate the user ids, then count them.
user_ids = song_data['user_id']
print(len(user_ids.unique()))

train_data, test_data = song_data.random_split(0.8, seed=0)

# Song recommender based on the simple popularity model.
# (To print its recommendations for users 0 and 1, call
# popularity_model.recommend(users=[...]) the same way as below.)
popularity_model = tc.popularity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song',
)

# Song recommender based on personalization (item similarity).
personalized_model = tc.item_similarity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song',
)
for user_position in (0, 1):
    print(personalized_model.recommend(users=[user_ids[user_position]]))

# Songs the personalized model considers similar to the given titles.
for song_title in (
        'With Or Without You - U2',
        'Chan Chan (Live) - Buena Vista Social Club',
):
    print(personalized_model.get_similar_items([song_title]))
# Path of the folder holding the food images.
url = "/Users/ahmedbekhit/Documents/Data/Development/TuriCreate/repo/turicreate-notebook/notebooks/data/food_images"

# --- Label the dataset ---
# Load every image found under the dataset folder.
data = turi.image_analysis.load_images(url)

# Add a "foodType" column: "Eggs" when the image path contains "eggs",
# otherwise "Soup".
data["foodType"] = data["path"].apply(
    lambda image_path: "Eggs" if "eggs" in image_path else "Soup"
)

# Persist the labeled images as an SFrame so they can be reused for training.
data.save("egg_or_soup.sframe")

# Visualize the labeled images.
data.explore()

# --- Train and evaluate ---
# Reload the labeled images from disk.
dataBuffer = turi.SFrame("egg_or_soup.sframe")

# Randomly split the SFrame:
#   90% of the rows are used to train the image classifier,
#   10% of the rows are used to test it.
trainingBuffers, testingBuffers = dataBuffer.random_split(0.9)

# Train the image classifier using the SqueezeNet architecture and
# pre-trained model.
model = turi.image_classifier.create(
    trainingBuffers,
    target="foodType",
    model="squeezenet_v1.1",
)

# Evaluate on the held-out rows to determine the model accuracy.
evaluations = model.evaluate(testingBuffers)
Пример #14
0
    def test_label_propagation(self):
        """Exercise label propagation: default run, weighted/undirected run,
        iteration cap, and a vertex whose predicted label must be None."""
        # Nothing is checked when the toolkit function is not available.
        if "label_propagation" not in get_unity().list_toolkit_functions():
            return

        graph = self.graph.copy()
        vertex_count = len(graph.vertices)
        num_classes = 2
        prob_columns = ['P%d' % c for c in range(num_classes)]

        def seed_label(vid):
            # Low ids are seeded with class 0, high ids with class 1, and
            # everything in between is left unlabeled.
            if vid < 100:
                return 0
            if vid > vertex_count - 100:
                return 1
            return None

        graph.vertices['label'] = graph.vertices['__id'].apply(seed_label, int)
        model = tc.label_propagation.create(graph, label_field='label')

        model.summary()
        self.__test_model_save_load_helper__(model)

        for row in model.graph.vertices:
            predicted = row['predicted_label']
            if predicted is None:
                # No prediction: every class must have the same probability.
                for column in prob_columns:
                    self.assertAlmostEqual(row[column], 1.0 / num_classes)
                continue

            # The predicted class must have the largest probability, and the
            # probabilities must add up to one.
            winner_column = 'P%d' % predicted
            total = 0.0
            for column in prob_columns:
                total += row[column]
                self.assertGreaterEqual(row[winner_column], row[column])
            self.assertAlmostEqual(total, 1.0)

        # Add more options: weighted edges, change self weight, and
        # undirected edges
        def edge_weight(vid):
            return float(vid) * 10 / vertex_count

        graph.edges['weight'] = graph.edges['__src_id'].apply(
            edge_weight, float)
        model = tc.label_propagation.create(
            graph,
            label_field='label',
            threshold=1e-2,
            weight_field='weight',
            self_weight=0.5,
            undirected=True,
        )

        # Test early termination using max_iteration
        max_iter = 3
        capped_options = {
            'label_field': 'label',
            'threshold': 1e-10,
            'max_iterations': max_iter,
        }
        model = tc.label_propagation.create(graph, **capped_options)
        self.assertEqual(model.num_iterations, max_iter)

        # Test that the predict class should be None if all class
        # probabilities are equal
        graph = graph.add_vertices(tc.SFrame({'__id': [-1]}))
        model = tc.label_propagation.create(graph, **capped_options)
        vertices = model.graph.vertices
        added_vertex = vertices[vertices['__id'] == -1]
        self.assertEqual(added_vertex['predicted_label'][0], None)
Пример #15
0
    # selected_data = removeFront(selected_data)

    # user_interface(SELECTED_DATA)

    # return selected_data


# =============================== Main section ===============================
# Run the UI
var_list = []

# Create the selected-data table, seeded with a single placeholder row.
_PLACEHOLDER_ROW = {
    'code': '000000',
    'name': '数据不存在',
    'bankuai': '二次元',
    'close': 0.0,
    'percent_chg': 0.0,
    'change': 0.0,
    'volume': 0.0,
    'turn_volume': 0.0,
    'amplitude': 0.0,
    'volume_rate': 0.0,
    'turnover_rate': 0.0,
    'news_url': 'http://www.bilibili.com',
    'income_increase': 0.0,
    'profit_increase': 0.0,
}
# Each column is a one-element list holding the placeholder value.
SELECTED_DATA = tc.SFrame(
    {column: [value] for column, value in _PLACEHOLDER_ROW.items()})

# UI
user_interface(SELECTED_DATA)









def create(dataset, target, model_name, features=None,
           validation_set='auto', distributed='auto',
           verbose=True, seed=None, **kwargs):
    """
    Create a :class:`~turicreate.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called
    directly; call the specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_name : string
        Name of the model. It is used as the key to look the model
        constructor up in ``turicreate.extensions``.

    features : list[string], optional
        Names of the feature columns. If None (the default), every column of
        ``dataset`` except ``target`` is used. Any non-string iterable of
        strings is accepted and copied into a list.

    validation_set : SFrame, optional

        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing (only when the
        dataset has at least 100 rows). If validation_set is set to None,
        then no additional metrics are computed. The default value is 'auto'.

    distributed: env
        The distributed environment. Not read by this function.

    verbose : boolean
        Whether to print out messages during training.

    seed : int, optional
        Seed for random number generation. Set this value to ensure that the
        same model is created every time.

    kwargs : dict
        Additional model-specific options. Option names are lower-cased
        before being passed to the model.

    Returns
    -------
    out : SupervisedLearningModel
        The trained model wrapped together with ``model_name``.

    Raises
    ------
    TypeError
        If ``features`` is a string or is not iterable, if any feature name
        is not a ``str``, or if ``validation_set`` is neither 'auto', None
        nor an SFrame.
    """

    _raise_error_if_not_sframe(dataset, "training dataset")

    # Determine columns to keep
    if features is None:
        features = [feat for feat in dataset.column_names() if feat != target]
    # A bare string is iterable on Python 3, so it has to be rejected
    # explicitly; otherwise it would only fail later at `features + [target]`.
    if isinstance(features, str) or not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    # Copy to a real list so that tuples (and other iterables) can be
    # concatenated with `[target]` below.
    features = list(features)
    for feat in features:
        if not isinstance(feat, str):
            # Name the offending entry. The previous message referenced a
            # comprehension variable that is out of scope on Python 3 and
            # therefore raised NameError instead of this TypeError.
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (feat,))

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95, seed=seed)
            else:
                # Too little data to hold any out for validation.
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')
    if validation_set is None:
        # The training call below always receives an SFrame; an empty one
        # stands in for "no validation data".
        validation_set = _turicreate.SFrame()
    else:
        if not isinstance(validation_set, _turicreate.SFrame):
            raise TypeError("validation_set must be either 'auto' or an SFrame "
                            "matching the training data.")

        # Attempt to append the two datasets together to check schema
        validation_set.head().append(dataset.head())

        # Reduce validation set to requested columns
        validation_set = _toolkits_select_columns(
            validation_set, features + [target])

    # Reduce training set to requested columns
    dataset = _toolkits_select_columns(dataset, features + [target])

    # Sanitize model-specific options
    options = {k.lower(): kwargs[k] for k in kwargs}

    # Create a model instance and train it
    model = _turicreate.extensions.__dict__[model_name]()
    with QuietProgress(verbose):
        model.train(dataset, target, validation_set, options)

    return SupervisedLearningModel(model, model_name)
Пример #17
0
def parseSFrame(fileName):
    """Load the SFrame stored in the directory ``./SelectedData/<fileName>/``.

    NOTE(review): in the visible code the loaded SFrame is only bound to a
    local name and nothing is returned; the function body looks truncated
    here -- confirm against the full source.
    """
    filePath = './SelectedData/' + fileName + '/'
    SELECTED_DATA = tc.SFrame(data=filePath)
 def test_create_with_missing_value(self):
     """Creating a detector from data with a missing image raises ToolkitError."""
     # Build one extra row whose feature is a missing (None) image and whose
     # target is copied from the first training row.
     first_target = self.train[self.target][0]
     missing_row = tc.SFrame({
         self.feature: tc.SArray([None], dtype=tc.Image),
         self.target: [first_target],
     })
     train_with_missing = self.train.append(missing_row)

     with self.assertRaises(_ToolkitError):
         tc.one_shot_object_detector.create(train_with_missing,
                                            target=self.target)
Пример #19
0

# ### From the above summary, we select the Cosine similarity on scaled number of seconds approach as our final model, because this combination gives the best results (the desirable outcome has low RMSE and precision-recall close to 1).

# ## 9. Final Output

# In[32]:

# data_norm


# In[31]:

# The final model was chosen using the train data and evaluated on the test
# set, so it is now refit on the whole dataset.

final_model = tc.item_similarity_recommender.create(
    tc.SFrame(data_norm),
    user_id=user_id,
    item_id=item_id,
    target='Scaled_SecNumF',
    similarity_type='cosine',
)

# Recommend the top n_rec items for the selected users and show the first rows.
recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)


# #### 9.1. CSV output file

# In[32]:

# Convert the recommendations to a pandas DataFrame and report its shape.
df_rec = recom.to_dataframe()
print(df_rec.shape)
# Turi is a platform for Machine Learning
# turicreate is Apple's version of Turi
# https://apple.github.io/turicreate/docs/api/index.html#

import turicreate as turi

# Path of the CSV data to import
url = "hypothyroiddataset/hypothyroid.data.csv"

# Load the CSV into an SFrame
data = turi.SFrame(url)

# Open the interactive view of the data
data.explore()
# Show plots of the data
data.show()

# Split the data into 80% training, 20% evaluation
trainingBuffers, testingBuffers = data.random_split(0.80)

# Create the model. The generic classifier call is kept for reference:
#model = turi.classifier.create(trainingBuffers,
#                              target='diagnosis')
model = turi.random_forest_classifier.create(trainingBuffers,
                                             target='diagnosis')

# Evaluate the model on the held-out split
evaluations = model.evaluate(testingBuffers)
# Function-call form of print: the Python 2 print statement is a SyntaxError
# on Python 3, while this form works on both for a single argument.
print(evaluations["accuracy"])

#save & export
Пример #21
0
    def setUp(self):
        """Build the small recommender fixtures and store them on ``self``.

        Creates observation data, user/item side data, "new" variants of each,
        an exclusion set, and a few id arrays.
        """
        # Make internal indices so that we can check predictions/ranking.
        # IDs are in the order they are seen in the observation data below.
        self.user_index = {"a": 0, "b": 1, "c": 2}
        self.item_index = {"x": 0, "y": 1, "v": 2, "w": 3, "z": 4}

        # Observations: one (user, item, rating) triple per row.
        observations = tc.SFrame()
        observations["user_id"] = ["a", "b", "b", "c", "c", "c"]
        observations["item_id"] = ["x", "x", "y", "v", "w", "z"]
        observations["rating"] = [0, 1, 2, 3, 4, 5]
        self.data = observations

        # User side data (only users "a" and "b" have rows).
        user_side = tc.SFrame()
        user_side["user_id"] = ["a", "b"]
        user_side["user_feature_value"] = [0.5, 0.9]
        user_side["user_dict_value"] = [{1: 0.5}, {4: 0.9}]
        user_side["user_vect_value"] = [[0, 1, 2], [2, 3, 4]]
        user_side["user_str_dict_value"] = [{"tt": 0.5}, {"ttt": 0.9}]
        self.user_data = user_side

        # Item side data (item "z" has no row).
        item_side = tc.SFrame()
        item_side["item_id"] = ["x", "v", "w", "y"]
        item_side["item_feature_value"] = [-0.3, 0.7, 0.3, 0.05]
        item_side["item_dict_value"] = [
            {1: 0.5},
            {4: 0.9},
            {4: 0.9},
            {5: 1, 6: 2},
        ]
        item_side["item_vect_value"] = [
            [0, 1, 2],
            [2, 3, 4],
            [2, 3, 4],
            [2, 3, 5],
        ]
        item_side["item_str_dict_value"] = [
            {"tt": 0.5},
            {"tt": 0.9},
            {"t": 0.9},
            {"ttt": 0.9},
        ]
        self.item_data = item_side

        # Additional observations.
        extra_observations = tc.SFrame()
        extra_observations["user_id"] = ["a", "b"]
        extra_observations["item_id"] = ["v", "z"]
        extra_observations["rating"] = [7, 8]
        self.new_data = extra_observations

        # Additional user side data.
        extra_user_side = tc.SFrame()
        extra_user_side["user_id"] = ["a", "c"]
        extra_user_side["user_feature_value"] = [0.0, 2.9]
        extra_user_side["user_dict_value"] = [{1: 0.5}, {4: 0.9}]
        extra_user_side["user_vect_value"] = [[0, 1, 2], [2, 3, 4]]
        extra_user_side["user_str_dict_value"] = [{"tt": 0.5}, {"ttt": 0.9}]
        self.new_user_data = extra_user_side

        # Additional item side data.
        extra_item_side = tc.SFrame()
        extra_item_side["item_id"] = ["y", "z"]
        extra_item_side["item_feature_value"] = [0.5, 0.6]
        extra_item_side["item_dict_value"] = [{1: 0.5}, {4: 0.9}]
        extra_item_side["item_vect_value"] = [[0, 1, 2], [2, 3, 4]]
        extra_item_side["item_str_dict_value"] = [{"tt": 0.5}, {"ttt": 0.9}]
        self.new_item_data = extra_item_side

        # A single (user, item) pair to exclude.
        excluded_pairs = tc.SFrame()
        excluded_pairs["user_id"] = ["a"]
        excluded_pairs["item_id"] = ["x"]
        self.exclude = excluded_pairs

        # Id collections.
        self.users_all = tc.SArray(["a", "b", "c"])
        self.items_all = tc.SArray(["v", "w", "x", "y", "z"])
        self.items_some = tc.SArray(["v", "w"])
Пример #22
0
# Train a Turi Create object detector on the xy_signs SFrame, export it to
# Core ML, and write an additional copy of the model with fp16 weights.
import turicreate as tc
import coremltools

print("set_num_gpus")
# Configure the number of GPUs Turi Create may use
# (0 here; presumably this forces CPU-only training -- confirm in tc.config docs).
tc.config.set_num_gpus(0)

print("Load SFrame")
# Load the SFrame from disk
data = tc.SFrame('/storage/xy_signs.sframe')

print("split")
# Make a random train-test split (fraction 0.8 for the first part)
train_data, test_data = data.random_split(0.8)

print("create and train")
# Create and train the model
model = tc.object_detector.create(train_data)
# NOTE: the evaluation result is neither stored nor printed here.
model.evaluate(test_data)
model.export_coreml('/storage/xy_signs.mlmodel')

# Reduce model size: reload the exported spec, convert its neural-network
# weights to fp16 and save the result under a separate file name.
model_spec = coremltools.utils.load_spec('/storage/xy_signs.mlmodel')
model_fp16_spec = coremltools.utils.convert_neural_network_spec_weights_to_fp16(
    model_spec)
coremltools.utils.save_spec(model_fp16_spec, '/storage/xy_signs_16bit.mlmodel')
Пример #23
0
    def evaluate(self, dataset, metric='auto', verbose=True, batch_size=64):
        """
        Score the model on ``dataset`` by comparing its predictions with the
        actual target values.

        Parameters
        ----------
        dataset : SFrame
            Dataset to use for evaluation. It must include a column with the
            same name as the feature used for model training, as well as the
            target column. Additional columns are ignored.

        metric : str, optional
            Name of the evaluation metric.  Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'log_loss'         : Log loss
            - 'confusion_matrix' : An SFrame with counts of possible
                                   prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an
                                   ROC curve

        verbose : bool, optional
            If True, prints progress updates and model details.

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve performance.

        Returns
        -------
        out : dict
            Dictionary of evaluation results where the key is the name of the
            evaluation metric (e.g. `accuracy`) and the value is the evaluation
            score.

        See Also
        ----------
        classify, predict

        Examples
        ----------
        .. sourcecode:: python

          >>> results = model.evaluate(data)
          >>> print(results['accuracy'])
        """
        from turicreate.toolkits import evaluation

        # Argument validation
        if not isinstance(dataset, _tc.SFrame):
            raise TypeError("'dataset' parameter must be an SFrame")

        avail_metrics = [
            'accuracy', 'auc', 'precision', 'recall', 'f1_score', 'log_loss',
            'confusion_matrix', 'roc_curve'
        ]
        _tk_utils._check_categorical_option_type('metric', metric,
                                                 avail_metrics + ['auto'])
        metrics = avail_metrics if metric == 'auto' else [metric]

        # Use the feature column directly when it already holds deep
        # features; otherwise extract them first.
        feature_column = dataset[self.feature]
        if not _is_deep_feature_sarray(feature_column):
            feature_column = get_deep_features(feature_column,
                                               verbose=verbose)
        data = _tc.SFrame({'deep features': feature_column}).add_row_number()

        # Rows whose deep-feature list is empty cannot be scored.
        missing_ids = data.filter_by([[]], 'deep features')['id']
        if len(missing_ids) == 0:
            labels = dataset[self.target]
        else:
            data = data.filter_by([[]], 'deep features', exclude=True)
            _logging.warning(
                "Dropping %d examples which are less than 975ms in length." %
                len(missing_ids))
            # Keep only the labels of the rows that still have deep features.
            indexed_labels = dataset[[self.target]].add_row_number()
            labels = data.join(indexed_labels, how='left')[self.target]
        assert len(labels) == len(data)

        # Only run the prediction flavours the requested metrics need.
        probability_metrics = ('roc_curve', 'log_loss', 'auc')
        class_metrics = ('accuracy', 'precision', 'recall', 'f1_score',
                         'confusion_matrix')
        if any(name in metrics for name in probability_metrics):
            probs = self.predict(data['deep features'],
                                 output_type='probability_vector',
                                 verbose=verbose,
                                 batch_size=batch_size)
        if any(name in metrics for name in class_metrics):
            classes = self.predict(data['deep features'],
                                   output_type='class',
                                   verbose=verbose,
                                   batch_size=batch_size)

        # Metric name -> deferred computation. The thunks are only called for
        # requested metrics, so `probs` / `classes` are only read when they
        # were computed above.
        scorers = (
            ('accuracy',
             lambda: evaluation.accuracy(labels, classes)),
            ('auc',
             lambda: evaluation.auc(labels, probs,
                                    index_map=self._class_label_to_id)),
            ('precision',
             lambda: evaluation.precision(labels, classes)),
            ('recall',
             lambda: evaluation.recall(labels, classes)),
            ('f1_score',
             lambda: evaluation.f1_score(labels, classes)),
            ('log_loss',
             lambda: evaluation.log_loss(labels, probs,
                                         index_map=self._class_label_to_id)),
            ('confusion_matrix',
             lambda: evaluation.confusion_matrix(labels, classes)),
            ('roc_curve',
             lambda: evaluation.roc_curve(labels, probs,
                                          index_map=self._class_label_to_id)),
        )

        ret = {}
        for name, compute in scorers:
            if name in metrics:
                ret[name] = compute()
        return ret
# Train, evaluate, save and export a Turi Create activity classifier on the
# preprocessed data stored in 'hapt_data.sframe'.
import turicreate as tc

# Load sessions from preprocessed data
data = tc.SFrame('hapt_data.sframe')

# Train/test split by recording sessions: rows are split by their 'exp_id'
# session id, with fraction 0.8 passed for the first (training) part.
train, test = tc.activity_classifier.util.random_split_by_session(
    data, session_id='exp_id', fraction=0.8)

# Create an activity classifier that predicts the 'activity' column,
# using a prediction window of 50
model = tc.activity_classifier.create(train,
                                      session_id='exp_id',
                                      target='activity',
                                      prediction_window=50)

# Evaluate the model and save the results into a dictionary
metrics = model.evaluate(test)
print(metrics['accuracy'])

# Save the model for later use in Turi Create
model.save('TuriActivityClassify.model')

# Export for use in Core ML
model.export_coreml('CoreMLActivityClassify.mlmodel')
Пример #25
0
    def predict(self,
                dataset,
                output_type='class',
                verbose=True,
                batch_size=64):
        """
        Return predictions for ``dataset``. Predictions can be generated
        as class labels or probabilities.

        Parameters
        ----------
        dataset : SFrame | SArray | dict
            The audio data to be classified.
            If dataset is an SFrame, it must have a column with the same name as
            the feature used for model training, but does not require a target
            column. Additional columns are ignored.
            If dataset is a dict, it must have exactly the keys
            ``'sample_rate'`` and ``'data'``; it is treated as a single example.

        output_type : {'probability', 'class', 'probability_vector'}, optional
            Form of the predictions which are one of:

            - 'class': Class prediction. For multi-class classification, this
              returns the class with maximum probability.
            - 'probability': Prediction probability associated with the True
              class (not applicable for multi-class classification)
            - 'probability_vector': Prediction probability associated with each
              class as a vector. Label ordering is dictated by the ``classes``
              member variable.

        verbose : bool, optional
            If True, prints progress updates and model details.

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve performance.

        Returns
        -------
        out : SArray
            An SArray with the predictions, one entry per input example in
            input order. Examples for which no deep features are available
            get a ``None`` entry.

        Raises
        ------
        TypeError
            If ``dataset`` is not an SFrame, SArray or dict.
        ValueError
            If a dict ``dataset`` does not look like audio data, if the data
            is neither audio data nor audio deep features, if ``output_type``
            is not one of the supported values, or if ``batch_size`` is less
            than 1.
        ToolkitError
            If ``output_type`` is 'probability' and the model is not a binary
            classifier.

        See Also
        ----------
        evaluate, classify

        Examples
        ----------
        >>> probability_predictions = model.predict(data, output_type='probability')
        >>> prediction_vector = model.predict(data, output_type='probability_vector')
        >>> class_predictions = model.predict(data, output_type='class')

        """
        from .._mxnet import _mxnet_utils
        import mxnet as mx

        if not isinstance(dataset, (_tc.SFrame, _tc.SArray, dict)):
            raise TypeError(
                '\'dataset\' parameter must be either an SFrame, SArray or dictionary'
            )

        if isinstance(dataset, dict):
            if (set(dataset.keys()) != {'sample_rate', 'data'}):
                raise ValueError(
                    '\'dataset\' parameter is a dictionary but does not appear to be audio data.'
                )
            # A single audio clip: wrap it so it is handled like an SArray.
            dataset = _tc.SArray([dataset])
        elif isinstance(dataset, _tc.SFrame):
            dataset = dataset[self.feature]

        if not _is_deep_feature_sarray(dataset) and not _is_audio_data_sarray(
                dataset):
            raise ValueError(
                '\'dataset\' must be either audio data or audio deep features.'
            )

        if output_type not in ('probability', 'probability_vector', 'class'):
            # The message must name the parameter that is actually invalid
            # (it previously repeated the 'dataset' type error text).
            raise ValueError(
                '\'output_type\' parameter must be one of \'probability\','
                ' \'probability_vector\' or \'class\''
            )
        if output_type == 'probability' and self.num_classes != 2:
            raise _ToolkitError(
                'Output type \'probability\' is only supported for binary'
                ' classification. For multi-class classification, use'
                ' predict_topk() instead.')
        if (batch_size < 1):
            raise ValueError("'batch_size' must be greater than or equal to 1")

        if _is_deep_feature_sarray(dataset):
            deep_features = dataset
        else:
            deep_features = get_deep_features(dataset, verbose=verbose)

        # One row per (example id, deep-feature frame). Examples without any
        # frame end up with a missing value and are split off here.
        deep_features = _tc.SFrame({'deep features': deep_features})
        deep_features = deep_features.add_row_number()
        deep_features = deep_features.stack('deep features',
                                            new_column_name='deep features')
        deep_features, missing_ids = deep_features.dropna_split(
            columns=['deep features'])

        if len(missing_ids) > 0:
            _logging.warning(
                "Unable to make predictions for %d examples because they are less than 975ms in length."
                % len(missing_ids))

        if batch_size > len(deep_features):
            batch_size = len(deep_features)

        # Per-frame class probabilities, in the same order as `deep_features`.
        y = []
        for batch in mx.io.NDArrayIter(
                deep_features['deep features'].to_numpy(),
                batch_size=batch_size):
            ctx = _mxnet_utils.get_mxnet_context()
            if (len(batch.data[0]) < len(ctx)):
                ctx = ctx[:len(batch.data[0])]

            batch_data = batch.data[0]
            if batch.pad != 0:
                # Drop the padding rows so batches do not loop back.
                batch_data = batch_data[:-batch.pad]

            batch_data = mx.gluon.utils.split_and_load(batch_data,
                                                       ctx_list=ctx,
                                                       batch_axis=0,
                                                       even_split=False)

            for x in batch_data:
                forward_output = self._custom_classifier.forward(x)
                y += mx.nd.softmax(forward_output).asnumpy().tolist()
        assert (len(y) == len(deep_features))

        # Combine predictions from multiple frames
        sf = _tc.SFrame({'predictions': y, 'id': deep_features['id']})
        probabilities_sum = sf.groupby(
            'id', {'prob_sum': _tc.aggregate.SUM('predictions')})

        if output_type == 'class':
            predicted_ids = probabilities_sum['prob_sum'].apply(
                lambda x: _np.argmax(x))
            mappings = self._id_to_class_label
            probabilities_sum['results'] = predicted_ids.apply(
                lambda x: mappings[x])
        else:
            assert output_type in ('probability', 'probability_vector')
            # Average the summed per-class probabilities over the number of
            # frames of each example.
            # NOTE(review): 'probability' and 'probability_vector' both yield
            # the full per-class list here -- confirm that is intended for
            # 'probability'.
            frame_per_example_count = sf.groupby('id', _tc.aggregate.COUNT())
            probabilities_sum = probabilities_sum.join(frame_per_example_count)
            probabilities_sum['results'] = probabilities_sum.apply(
                lambda row: [i / row['Count'] for i in row['prob_sum']])

        if len(missing_ids) > 0:
            # Examples without deep features get a None result of the same
            # dtype as the computed results.
            result_dtype = probabilities_sum['results'].dtype
            missing_predictions = _tc.SFrame({
                'id':
                missing_ids['id'],
                'results':
                _tc.SArray([None] * len(missing_ids), dtype=result_dtype)
            })
            probabilities_sum = probabilities_sum[[
                'id', 'results'
            ]].append(missing_predictions)

        # Restore the input order.
        probabilities_sum = probabilities_sum.sort('id')
        return probabilities_sum['results']
Пример #26
0
    def extract_features(self,
                         dataset,
                         feature,
                         batch_size=512,
                         verbose=False):
        """
        Run the images in ``dataset[feature]`` through the MXNet model and
        collect the outputs of its feature layer.

        Parameters
        ----------
        dataset: SFrame
            SFrame of images

        feature: str
            Name of the image column in ``dataset``.

        batch_size: int, optional
            Number of images per forward pass; capped at ``len(dataset)``.

        verbose: bool, optional
            If True, print progress messages.

        Returns
        -------
        out : SArray
            One ``array.array`` entry per input row (empty if ``dataset`` is
            empty).
        """
        from ..mx import SFrameImageIter as _SFrameImageIter
        import turicreate as _tc
        import array

        total = len(dataset)
        if total == 0:
            return _tc.SArray([], array.array)

        # Resize images if needed
        resized = _tc.SFrame()
        if verbose:
            print("Resizing images...")
        resized[feature] = _tc.image_analysis.resize(
            dataset[feature], *reversed(self.image_shape))

        batch_size = min(total, batch_size)

        # Iterator feeding image batches to the model
        image_iter = _SFrameImageIter(sframe=resized,
                                      data_field=[feature],
                                      batch_size=batch_size)

        # Setup the MXNet model
        mx_module = MXFeatureExtractor._get_mx_module(self.ptModel.mxmodel,
                                                      self.data_layer,
                                                      self.feature_layer,
                                                      self.context,
                                                      self.image_shape,
                                                      batch_size)

        builder = _tc.SArrayBuilder(dtype=array.array)
        done = 0
        width = len(str(total))
        if verbose:
            print("Performing feature extraction on resized images...")
        while image_iter.has_next:
            batch_shape = image_iter.data_shape[1:]
            if batch_shape != self.image_shape:
                raise RuntimeError(
                    "Expected image of size %s. Got %s instead." %
                    (self.image_shape, batch_shape))
            mx_module.forward(next(image_iter))
            features_out = mx_module.get_outputs()[0].asnumpy()
            pad = image_iter.getpad()
            if pad != 0:
                # When the data length is not a multiple of the batch size the
                # last batch wraps around; drop the wrapped-around rows.
                features_out = features_out[:-pad]
            builder.append_multiple(features_out)

            done = min(total, done + batch_size)
            if verbose:
                print('Completed {num_processed:{width}d}/{total:{width}d}'.
                      format(num_processed=done,
                             total=total,
                             width=width))

        return builder.close()
Пример #27
0
def create(dataset,
           target,
           features=None,
           validation_set="auto",
           verbose=True):
    """
    Automatically create a suitable regression model based on the provided
    training data.

    To use specific options of a desired model, use the ``create`` function
    of the corresponding model.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type (int/float).

    features : list[string], optional
        Names of the columns containing features. 'None' (the default) indicates
        that all columns except the target variable should be used as features.

        The features are columns in the input SFrame that can be of the
        following types:

        - *Numeric*: values of numeric type integer or float.

        - *Categorical*: values of type string.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a separate feature in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values
          Each key of a dictionary is treated as a separate feature and the
          value in the dictionary corresponds to the value of the feature.
          Dictionaries are ideal for representing sparse data.

        Columns of type *list* are not supported. Convert such feature
        columns to type array if all entries in the list are of numeric
        types. If the lists contain data of mixed types, separate
        them out into different columns.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.  For
        each row of the progress table, the chosen metrics are computed for
        both the provided training dataset and the validation_set. The format
        of this SFrame must be the same as the training set.  By default this
        argument is set to 'auto' and a validation set is automatically sampled
        and used for progress printing. If validation_set is set to None, then
        no additional metrics are computed. The default value is 'auto'.

    verbose : boolean, optional
        If True, print progress information during training.

    Returns
    -------
      out : A trained regression model.

    See Also
    --------
    turicreate.linear_regression.LinearRegression,
    turicreate.boosted_trees_regression.BoostedTreesRegression

    Examples
    --------
    .. sourcecode:: python

      # Setup the data
      >>> import turicreate as tc
      >>> data =  tc.SFrame('https://static.turi.com/datasets/regression/houses.csv')

      # Selects the best model based on your data.
      >>> model = tc.regression.create(data, target='price',
      ...                                  features=['bath', 'bedroom', 'size'])

      # Make predictions and evaluate results.
      >>> predictions = model.predict(data)
      >>> results = model.evaluate(data)

    """

    train_sf, valid_sf = _validate_data(dataset, target, features,
                                        validation_set)
    # The extension call always receives an SFrame; an empty one stands in
    # for "no validation data".
    valid_sf = _turicreate.SFrame() if valid_sf is None else valid_sf

    proxy = _turicreate.extensions.create_automatic_regression_model(
        train_sf, target, valid_sf, {})
    return _sl.wrap_model_proxy(proxy)
Пример #28
0
# In[1]:

# Notebook export: prepares beer review data for a Turi Create recommender.
import pandas as pd
import numpy as np
import turicreate as tc

# In[2]:

# Read the review data from CSV into a pandas DataFrame
beer2 = pd.read_csv('/beer2.csv')

# In[7]:

# Keep only the user / item / rating columns, convert the DataFrame to an
# SFrame for turicreate, and drop rows with missing values
beer2_1 = beer2[['userId', 'beer_beerid', 'review_overall']]
beer2_1 = tc.SFrame(beer2_1)
beer2_1 = beer2_1.dropna()

# In[8]:

# Create an SFrame of additional per-beer info (style and ABV) for the model,
# de-duplicated on the selected columns
beer_info = beer2[['beer_beerid', 'beer_style', 'beer_abv']].drop_duplicates()
beer_info = tc.SFrame(beer_info)

# In[9]:

# Create training and validation sets, split per user
# ('userId' is the user column, 'beer_beerid' the item column)
training_data, validation_data = tc.recommender.util.random_split_by_user(
    beer2_1, 'userId', 'beer_beerid')

# In[10]:
Пример #29
0
    def query(self,
              dataset,
              label=None,
              k=5,
              radius=None,
              verbose=True,
              batch_size=64):
        """
        Retrieve, for each query image, its nearest neighbors among the
        model's stored reference data. The query dataset does not need to be
        the same as the reference data stored in the model.

        Parameters
        ----------
        dataset : SFrame | SArray | turicreate.Image
            Query data.
            If dataset is an SFrame, it must contain columns with the same
            names and types as the features used to train the model.
            Additional columns are ignored.
            An SArray or a single Image is wrapped into a one-column SFrame
            named after the model's feature.

        label : str, optional
            Name of the query SFrame column with row labels. If 'label' is not
            specified, row numbers are used to identify query dataset rows in
            the output SFrame.

        k : int, optional
            Number of nearest neighbors to return from the reference set for
            each query observation. The default is 5 neighbors, but setting it
            to ``None`` will return all neighbors within ``radius`` of the
            query point.

        radius : float, optional
            Only neighbors whose distance to a query point is smaller than this
            value are returned. The default is ``None``, in which case the
            ``k`` nearest neighbors are returned for each query point,
            regardless of distance.

        verbose: bool, optional
            If True, print progress updates and model details.

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve performance.

        Returns
        -------
        out : SFrame
            An SFrame with the k-nearest neighbors of each query observation.
            The result contains four columns: the first is the label of the
            query observation, the second is the label of the nearby reference
            observation, the third is the distance between the query and
            reference observations, and the fourth is the rank of the reference
            observation among the query's k-nearest neighbors.

        See Also
        --------
        similarity_graph

        Notes
        -----
        - If both ``k`` and ``radius`` are set to ``None``, each query point
          returns all of the reference set. If the reference dataset has
          :math:`n` rows and the query dataset has :math:`m` rows, the output
          is an SFrame with :math:`nm` rows.

        Examples
        --------
        >>> model.query(queries, 'label', k=2)
        +-------------+-----------------+----------------+------+
        | query_label | reference_label |    distance    | rank |
        +-------------+-----------------+----------------+------+
        |      0      |        2        | 0.305941170816 |  1   |
        |      0      |        1        | 0.771556867638 |  2   |
        |      1      |        1        | 0.390128184063 |  1   |
        |      1      |        0        | 0.464004310325 |  2   |
        |      2      |        0        | 0.170293863659 |  1   |
        |      2      |        1        | 0.464004310325 |  2   |
        +-------------+-----------------+----------------+------+
        """
        accepted_types = (_tc.SFrame, _tc.SArray, _tc.Image)
        if not isinstance(dataset, accepted_types):
            raise TypeError(
                'dataset must be either an SFrame, SArray or turicreate.Image')
        if batch_size < 1:
            raise ValueError("'batch_size' must be greater than or equal to 1")

        # Normalize SArray / single Image input to a one-column SFrame.
        if not isinstance(dataset, _tc.SFrame):
            if isinstance(dataset, _tc.SArray):
                feature_values = dataset
            else:
                feature_values = [dataset]
            dataset = _tc.SFrame({self.feature: feature_values})

        query_features = self._extract_features(dataset,
                                                verbose=verbose,
                                                batch_size=batch_size)
        # Carry the row labels over so the similarity model can report them.
        if label is not None:
            query_features[label] = dataset[label]

        return self.similarity_model.query(query_features, label, k, radius,
                                           verbose)
Пример #30
0
# Train, evaluate, save and export a Turi Create object detector on the data
# stored in 'plate.sframe'.
import turicreate as tc
# Configure the number of GPUs Turi Create may use
# (-1 here; presumably "use all available GPUs" -- confirm in tc.config docs).
tc.config.set_num_gpus(-1)

# Load the data
data = tc.SFrame('plate.sframe')

# Make a random train-test split (fraction 0.8 for the first part)
train_data, test_data = data.random_split(0.8)

# Create a model
model = tc.object_detector.create(train_data)

# Save predictions to an SArray
predictions = model.predict(test_data)

# Evaluate the model and save the results into a dictionary
metrics = model.evaluate(test_data)

# Save the model for later use in Turi Create
model.save('model_plate_turi.model')

# Export for use in Core ML
model.export_coreml('model_plate_turi.mlmodel')