def test_sarray(self):
    model = self.model
    # `data` is assumed to be a module-level test fixture (an SFrame built during setup).
    sa = data[self.feature]
    predictions = model.predict(sa)
    _raise_error_if_not_sarray(predictions)
    predictions = model.predict_topk(sa, k=2)
    _raise_error_if_not_sframe(predictions)
    predictions = model.classify(sa)
    _raise_error_if_not_sframe(predictions)

def test_sarray(self):
    model = self.model
    data = self.sf[self.feature]
    predictions = model.predict(data)
    _raise_error_if_not_sarray(predictions)
    predictions = model.predict_topk(data)
    _raise_error_if_not_sframe(predictions)
    predictions = model.classify(data)
    _raise_error_if_not_sframe(predictions)

def test_predict(self): model = self.model for output_type in ["class", "probability_vector"]: preds = model.predict(data.head(), output_type=output_type) _raise_error_if_not_sarray(preds) self.assertEqual(len(preds), len(data.head())) if output_type == "class": self.assertTrue(all(preds[:5] == "white")) self.assertTrue(all(preds[5:] == "black"))
def test_predict(self):
    model = self.model
    for output_type in ['class', 'probability_vector']:
        preds = model.predict(data.head(), output_type=output_type)
        _raise_error_if_not_sarray(preds)
        self.assertEqual(len(preds), len(data.head()))
        if output_type == 'class':
            self.assertTrue(all(preds[:5] == 'white'))
            self.assertTrue(all(preds[5:] == 'black'))

def _supervised_evaluation_error_checking(targets, predictions):
    """
    Perform basic error checking for the evaluation metrics. Check types and
    sizes of the inputs.
    """
    _raise_error_if_not_sarray(targets, "targets")
    _raise_error_if_not_sarray(predictions, "predictions")
    if len(targets) != len(predictions):
        raise _ToolkitError(
            "Input SArrays 'targets' and 'predictions' must be of the same length.")

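# A minimal usage sketch of the guard above inside a metric. The `accuracy`
# helper below is an illustrative assumption, not the library's actual
# evaluation entry point.
import turicreate as tc

def accuracy(targets, predictions):
    _supervised_evaluation_error_checking(targets, predictions)
    # Fraction of positions where the prediction matches the target.
    return (targets == predictions).sum() / float(len(targets))

targets = tc.SArray(["black", "white", "white"])
predictions = tc.SArray(["black", "white", "black"])
print(accuracy(targets, predictions))  # 0.666...
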
def test_predict(self):
    # default ('class') output_type
    predictions = self.model.predict(self.data['audio'])
    _raise_error_if_not_sarray(predictions)
    self.assertEqual(len(predictions), len(self.data))
    for a, b in zip(predictions, self.data['labels']):
        self.assertEqual(a, b)

    # 'probability' output_type
    if self.is_binary_classification:
        predictions = self.model.predict(self.data['audio'],
                                         output_type='probability')
        _raise_error_if_not_sarray(predictions)
        self.assertEqual(len(predictions), len(self.data))
        for probabilities, correct_label in zip(predictions, self.data['labels']):
            # correct value has highest probability?
            correct_index = self.model.classes.index(correct_label)
            self.assertEqual(np.argmax(probabilities), correct_index)
            # all probabilities sum close to 1?
            self.assertTrue(abs(np.sum(probabilities) - 1) < 0.00001)
    else:
        # 'probability' output type only supported for binary classification
        with self.assertRaises(ToolkitError):
            self.model.predict(self.data['audio'], output_type='probability')

    # 'probability_vector' output_type
    predictions = self.model.predict(self.data['audio'],
                                     output_type='probability_vector')
    _raise_error_if_not_sarray(predictions)
    self.assertEqual(len(predictions), len(self.data))
    for prob_vector, correct_label in zip(predictions, self.data['labels']):
        # correct value has highest probability?
        correct_index = self.model.classes.index(correct_label)
        self.assertEqual(np.argmax(prob_vector), correct_index)
        # all probabilities sum close to 1?
        self.assertTrue(abs(np.sum(prob_vector) - 1) < 0.00001)

    # predict with single (dict) example
    single_prediction = self.model.predict(self.data['audio'][0])
    _raise_error_if_not_sarray(single_prediction)
    self.assertEqual(len(single_prediction), 1)
    self.assertEqual(single_prediction[0], self.data['labels'][0])

    # predict with SFrame
    data = self.data.copy()
    del data['labels']
    predictions = self.model.predict(data)
    _raise_error_if_not_sarray(predictions)
    self.assertEqual(len(predictions), len(data))
    for a, b in zip(predictions, self.data['labels']):
        self.assertEqual(a, b)

def stack_annotations(annotations_sarray):
    """
    Converts object detection annotations (ground truth or predictions) to
    stacked format (an `SFrame` where each row is one object instance).

    Parameters
    ----------
    annotations_sarray : SArray
        An `SArray` with unstacked predictions, exactly formatted as the
        annotations column when training an object detector or when making
        predictions.

    Returns
    -------
    annotations_sframe : An `SFrame` with stacked annotations.

    See also
    --------
    unstack_annotations

    Examples
    --------
    Predictions are returned by the object detector in unstacked format:

    >>> predictions = detector.predict(images)

    By converting it to stacked format, it is easier to get an overview of
    object instances:

    >>> turicreate.object_detector.util.stack_annotations(predictions)
    Data:
    +--------+------------+-------+-------+-------+-------+--------+
    | row_id | confidence | label |   x   |   y   | width | height |
    +--------+------------+-------+-------+-------+-------+--------+
    |   0    |    0.98    |  dog  | 123.0 | 128.0 |  80.0 | 182.0  |
    |   0    |    0.67    |  cat  | 150.0 | 183.0 | 129.0 | 101.0  |
    |   1    |    0.8     |  dog  |  50.0 | 432.0 |  65.0 |  98.0  |
    +--------+------------+-------+-------+-------+-------+--------+
    [3 rows x 7 columns]
    """
    _raise_error_if_not_sarray(annotations_sarray,
                               variable_name="annotations_sarray")
    sf = _tc.SFrame({"annotations": annotations_sarray}).add_row_number("row_id")
    sf = sf.stack("annotations", new_column_name="annotations", drop_na=True)
    if len(sf) == 0:
        cols = ["row_id", "confidence", "label", "height", "width", "x", "y"]
        return _tc.SFrame({k: [] for k in cols})
    sf = sf.unpack("annotations", column_name_prefix="")
    sf = sf.unpack("coordinates", column_name_prefix="")
    del sf["type"]
    return sf

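# A self-contained usage sketch for stack_annotations. The dictionaries below
# follow the documented object-detector annotation format (one list of
# {'label', 'type', 'confidence', 'coordinates'} dicts per image); the values
# themselves are made up for illustration.
import turicreate as tc

predictions = tc.SArray([
    [{'label': 'dog', 'type': 'rectangle', 'confidence': 0.98,
      'coordinates': {'x': 123.0, 'y': 128.0, 'width': 80.0, 'height': 182.0}}],
    [{'label': 'dog', 'type': 'rectangle', 'confidence': 0.8,
      'coordinates': {'x': 50.0, 'y': 432.0, 'width': 65.0, 'height': 98.0}}],
])
stacked = tc.object_detector.util.stack_annotations(predictions)
# One row per object instance; row_id links each instance back to its image.
print(stacked)
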
def tf_idf(dataset):
    """
    Compute the TF-IDF scores for each word in each document. The collection
    of documents must be in bag-of-words format.

    .. math::
        \mbox{TF-IDF}(w, d) = tf(w, d) * log(N / f(w))

    where :math:`tf(w, d)` is the number of times word :math:`w` appeared
    in document :math:`d`, :math:`f(w)` is the number of documents word
    :math:`w` appeared in, :math:`N` is the number of documents, and we use
    the natural logarithm.

    This function is implemented using the `TFIDF` transformer from the
    feature engineering module.

    Parameters
    ----------
    dataset : SArray[str | dict | list]
        Input text data.

    Returns
    -------
    out : SArray[dict]
        The same document corpus where each score has been replaced by the
        TF-IDF transformation.

    See Also
    --------
    count_words, count_ngrams, tokenize

    References
    ----------
    - `Wikipedia - TF-IDF <https://en.wikipedia.org/wiki/TFIDF>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        >>> docs = turicreate.SArray('https://static.turi.com/datasets/nips-text')
        >>> docs_tfidf = turicreate.text_analytics.tf_idf(docs)
    """
    _raise_error_if_not_sarray(dataset, "dataset")
    if len(dataset) == 0:
        return _turicreate.SArray()

    dataset = _turicreate.SFrame({'docs': dataset})
    scores = _feature_engineering.TFIDF('docs').fit_transform(dataset)
    return scores['docs']

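# A pure-Python sketch of the TF-IDF formula above, for intuition only; the
# library computes the same quantity through the TFIDF transformer.
import math

def tf_idf_by_hand(bags_of_words):
    n = len(bags_of_words)        # N: number of documents
    doc_freq = {}                 # f(w): number of documents containing w
    for bag in bags_of_words:
        for word in bag:
            doc_freq[word] = doc_freq.get(word, 0) + 1
    return [{w: tf * math.log(n / doc_freq[w]) for w, tf in bag.items()}
            for bag in bags_of_words]

docs = [{'dog': 2, 'cat': 1}, {'dog': 1, 'fish': 3}]
print(tf_idf_by_hand(docs))
# 'dog' appears in every document, so log(2/2) = 0 and its score vanishes.
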
def test_predict(self):
    sf = self.test.head()
    pred = self.model.predict(sf.head())

    # Check the structure of the output
    _raise_error_if_not_sarray(pred)
    self.assertEqual(len(pred), len(sf))

    # Make sure SFrame was not altered
    self.assertEqual(
        [col for col in sf.column_names() if col.startswith("_")], [])

    # Predict should work on no input (and produce no predictions)
    pred0 = self.model.predict(sf[self.feature][:0])
    self.assertEqual(len(pred0), 0)

def test_predict(self):
    sf = self.sf.head()

    # Make sure this does not need the annotations column to work
    del sf[self.annotations]

    pred = self.model.predict(sf.head())

    # Check the structure of the output
    _raise_error_if_not_sarray(pred)
    self.assertEqual(len(pred), len(sf))

    # Make sure SFrame was not altered
    self.assertEqual(
        [col for col in sf.column_names() if col.startswith('_')], [])

    # Predict should work on no input (and produce no predictions)
    pred0 = self.model.predict(sf[:0])
    self.assertEqual(len(pred0), 0)

def tokenize(sa, to_lower=False,
             delimiters=["\r", "\v", "\n", "\f", "\t", " "]):
    """
    tokenize(sa, to_lower=False, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "])

    Tokenize the input SArray of text strings and return the list of tokens.

    Parameters
    ----------
    sa : SArray[str]
        Input data of strings representing English text. This tokenizer is
        not intended to process XML, HTML, or other structured text formats.

    to_lower : bool, optional
        If True, all strings are converted to lower case before tokenization.

    delimiters : list[str], None, optional
        Input strings are tokenized using delimiter characters in this list.
        Each entry in this list must contain a single character. If set to
        `None`, then a Penn treebank-style tokenization is used, which
        contains smart handling of punctuation.

    Returns
    -------
    out : SArray[list]
        Each text string in the input is mapped to a list of tokens.

    See Also
    --------
    count_words, count_ngrams, tf_idf

    References
    ----------
    - `Penn treebank tokenization <https://web.archive.org/web/19970614072242/http://www.cis.upenn.edu:80/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        >>> docs = turicreate.SArray(['This is the first sentence.',
        ...                           'This one, it\'s the second sentence.'])

        # Default tokenization by space characters
        >>> turicreate.text_analytics.tokenize(docs)
        dtype: list
        Rows: 2
        [['This', 'is', 'the', 'first', 'sentence.'],
         ['This', 'one,', "it's", 'the', 'second', 'sentence.']]

        # Penn treebank-style tokenization
        >>> turicreate.text_analytics.tokenize(docs, delimiters=None)
        dtype: list
        Rows: 2
        [['This', 'is', 'the', 'first', 'sentence', '.'],
         ['This', 'one', ',', 'it', "'s", 'the', 'second', 'sentence', '.']]
    """
    _raise_error_if_not_sarray(sa, "sa")

    ## Tokenize the documents
    sf = _turicreate.SFrame({'docs': sa})
    fe = _feature_engineering.Tokenizer(features='docs',
                                        to_lower=to_lower,
                                        delimiters=delimiters,
                                        output_column_prefix=None)
    tokens = fe.fit_transform(sf)
    return tokens['docs']

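# A pure-Python sketch of the default delimiter-splitting behavior, for
# intuition only; the library does this inside the Tokenizer transformer.
import re

def tokenize_by_hand(text, to_lower=False,
                     delimiters=("\r", "\v", "\n", "\f", "\t", " ")):
    if to_lower:
        text = text.lower()
    # Split on any single delimiter character and drop empty tokens.
    pattern = "[" + re.escape("".join(delimiters)) + "]"
    return [tok for tok in re.split(pattern, text) if tok]

print(tokenize_by_hand("This is the first sentence."))
# ['This', 'is', 'the', 'first', 'sentence.']
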
def trim_rare_words(sa, threshold=2, to_lower=True,
                    delimiters=["\r", "\v", "\n", "\f", "\t", " "],
                    stopwords=None):
    '''
    Remove words that occur below a certain number of times in an SArray.
    This is a common method of cleaning text before it is used, and can
    increase the quality and explainability of the models learned on the
    transformed data.

    RareWordTrimmer can be applied to string-, dictionary-, and list-typed
    data in an SArray.

    * **string** : The string is first tokenized. By default, all letters are
      first converted to lower case, then tokenized by space characters. Each
      token is taken to be a word, and the words occurring below a threshold
      number of times across the entire column are removed, then the
      remaining tokens are concatenated back into a string.

    * **list** : Each element of the list must be a string, where each
      element is assumed to be a token. The tokens are then filtered by their
      number of occurrences against the threshold value.

    * **dict** : The method first obtains the list of keys in the dictionary.
      This list is then processed as a standard list, except the value of
      each key must be of integer type and is considered to be the count of
      that key.

    Parameters
    ----------
    sa : SArray[str | dict | list]
        The input text data.

    threshold : int, optional
        The count below which words are removed from the input.

    stopwords : list[str], optional
        A manually specified list of stopwords, which are removed regardless
        of count.

    to_lower : bool, optional
        Indicates whether to map the input strings to lower case before
        counting.

    delimiters : list[string], optional
        A list of delimiter characters for tokenization. By default, the list
        is defined to be the list of space characters. The user can define
        any custom list of single-character delimiters. Alternatively,
        setting `delimiters=None` will use a Penn treebank type tokenization,
        which is better at handling punctuation. (See reference below for
        details.)

    Returns
    -------
    out : SArray
        An SArray with words below the threshold removed.

    See Also
    --------
    count_ngrams, tf_idf, tokenize

    References
    ----------
    - `Penn treebank tokenization <https://web.archive.org/web/19970614072242/http://www.cis.upenn.edu:80/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        # Create input data
        >>> sa = turicreate.SArray(["The quick brown fox jumps in a fox like way.",
        ...                         "Word word WORD, word!!!word"])

        # Run trim_rare_words
        >>> turicreate.text_analytics.trim_rare_words(sa)
        dtype: str
        Rows: 2
        ['fox fox', 'word word']

        # Run trim_rare_words with Penn treebank style tokenization to handle
        # punctuation
        >>> turicreate.text_analytics.trim_rare_words(sa, delimiters=None)
        dtype: str
        Rows: 2
        ['fox fox', 'word word word']

        # Run trim_rare_words with dictionary input
        >>> sa = turicreate.SArray([{'alice bob': 1, 'Bob alice': 2},
        ...                         {'a dog': 0, 'a dog cat': 5}])
        >>> turicreate.text_analytics.trim_rare_words(sa)
        dtype: dict
        Rows: 2
        [{'bob alice': 2}, {'a dog cat': 5}]

        # Run trim_rare_words with list input
        >>> sa = turicreate.SArray([['one', 'bar bah', 'One'],
        ...                         ['a dog', 'a dog cat', 'A DOG']])
        >>> turicreate.text_analytics.trim_rare_words(sa)
        dtype: list
        Rows: 2
        [['one', 'one'], ['a dog', 'a dog']]
    '''
    _raise_error_if_not_sarray(sa, "sa")

    ## Remove rare words
    sf = _turicreate.SFrame({'docs': sa})
    fe = _feature_engineering.RareWordTrimmer(features='docs',
                                              threshold=threshold,
                                              to_lower=to_lower,
                                              delimiters=delimiters,
                                              stopwords=stopwords,
                                              output_column_prefix=None)
    tokens = fe.fit_transform(sf)
    return tokens['docs']

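# A pure-Python sketch of the string case, for intuition only: tokenize,
# count, then keep tokens whose count meets the threshold. Unlike the
# library, which counts across the entire column, this sketch counts within
# a single string; the two coincide for the one-row example below.
def trim_rare_words_by_hand(text, threshold=2, to_lower=True):
    tokens = (text.lower() if to_lower else text).split()
    counts = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1
    return " ".join(tok for tok in tokens if counts[tok] >= threshold)

print(trim_rare_words_by_hand("Word word WORD, word!!!word"))
# 'word word' -- 'word,' and 'word!!!word' each occur only once
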
def count_words(sa, to_lower=True,
                delimiters=["\r", "\v", "\n", "\f", "\t", " "]):
    """
    count_words(sa, to_lower=True, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "])

    Convert the content of string/dict/list type SArrays to a dictionary of
    (word, count) pairs. Dictionary keys and list elements must be strings.

    The strings are first tokenized into words according to the specified
    `to_lower` and `delimiters` options. Then, word counts are accumulated.
    In each output dictionary, the keys are the words in the corresponding
    input data entry, and the values are the number of times each word
    appears. By default, words are split on all whitespace and newline
    characters.

    The output is commonly known as the "bag-of-words" representation of
    text data.

    This function is implemented using the `WordCounter` transformer from
    the feature engineering module.

    Parameters
    ----------
    sa : SArray[str | dict | list]
        Input data to be tokenized and counted.

    to_lower : bool, optional
        If True, all strings are converted to lower case before counting.

    delimiters : list[str], None, optional
        Input strings are tokenized using delimiter characters in this list.
        Each entry in this list must contain a single character. If set to
        `None`, then a Penn treebank-style tokenization is used, which
        contains smart handling of punctuation.

    Returns
    -------
    out : SArray[dict]
        Each entry contains a dictionary with the frequency count of each
        word in the corresponding input entry.

    See Also
    --------
    count_ngrams, tf_idf, tokenize

    References
    ----------
    - `Bag of words model <http://en.wikipedia.org/wiki/Bag-of-words_model>`_
    - `Penn treebank tokenization <https://web.archive.org/web/19970614072242/http://www.cis.upenn.edu:80/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        # Create input data
        >>> sa = turicreate.SArray(["The quick brown fox jumps.",
        ...                         "Word word WORD, word!!!word"])

        # Run count_words
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'quick': 1, 'brown': 1, 'the': 1, 'fox': 1, 'jumps.': 1},
         {'word,': 1, 'word!!!word': 1, 'word': 2}]

        # Run count_words with Penn treebank style tokenization to handle
        # punctuation
        >>> turicreate.text_analytics.count_words(sa, delimiters=None)
        dtype: dict
        Rows: 2
        [{'brown': 1, 'jumps': 1, 'fox': 1, '.': 1, 'quick': 1, 'the': 1},
         {'word': 3, 'word!!!word': 1, ',': 1}]

        # Run count_words with dictionary input
        >>> sa = turicreate.SArray([{'alice bob': 1, 'Bob alice': 0.5},
        ...                         {'a dog': 0, 'a dog cat': 5}])
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bob': 1.5, 'alice': 1.5}, {'a': 5, 'dog': 5, 'cat': 5}]

        # Run count_words with list input
        >>> sa = turicreate.SArray([['one', 'bar bah'], ['a dog', 'a dog cat']])
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bar': 1, 'bah': 1, 'one': 1}, {'a': 2, 'dog': 2, 'cat': 1}]
    """
    _raise_error_if_not_sarray(sa, "sa")

    ## Compute word counts
    sf = _turicreate.SFrame({'docs': sa})
    fe = _feature_engineering.WordCounter(features='docs',
                                          to_lower=to_lower,
                                          delimiters=delimiters,
                                          output_column_prefix=None)
    output_sf = fe.fit_transform(sf)
    return output_sf['docs']

def count_ngrams(sa, n=2, method="word", to_lower=True,
                 delimiters=["\r", "\v", "\n", "\f", "\t", " ",
                             "!", "#", "$", "%", "&", "'", "(", ")",
                             "*", "+", ",", "-", ".", "/", ":", ";",
                             "<", "=", ">", "?", "@", "[", "\\", "]",
                             "^", "_", "`", "{", "|", "}", "~"],
                 ignore_punct=True, ignore_space=True):
    """
    count_ngrams(sa, n=2, method="word", to_lower=True,
    delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " ", "!", "#", "$",
    "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<", "=",
    ">", "?", "@", "[", "\\\\", "]", "^", "_", "`", "{", "|", "}", "~"],
    ignore_punct=True, ignore_space=True)

    Return an SArray of ``dict`` type where each element contains the count
    for each of the n-grams that appear in the corresponding input element.
    The n-grams can be specified to be either character n-grams or word
    n-grams. The input SArray could contain strings, dicts with string keys
    and numeric values, or lists of strings.

    This function is implemented using the `NGramCounter` transformer from
    the feature engineering module.

    Parameters
    ----------
    sa : SArray[str | dict | list]
        Input text data.

    n : int, optional
        The number of words in each n-gram. An ``n`` value of 1 returns word
        counts.

    method : {'word', 'character'}, optional
        If "word", the function performs a count of word n-grams. If
        "character", does a character n-gram count.

    to_lower : bool, optional
        If True, all words are converted to lower case before counting.

    delimiters : list[str], None, optional
        If method is "word", input strings are tokenized using delimiter
        characters in this list. Each entry in this list must contain a
        single character. If set to `None`, then a Penn treebank-style
        tokenization is used, which contains smart handling of punctuation.
        If method is "character", this option is ignored.

    ignore_punct : bool, optional
        If method is "character", indicates if *punctuation* between words is
        counted as part of the n-gram. For instance, with the input SArray
        element of "fun.games", if this parameter is set to False one
        tri-gram would be 'n.g'. If ``ignore_punct`` is set to True, there
        would be no such tri-gram (there would still be 'nga'). This
        parameter has no effect if the method is set to "word".

    ignore_space : bool, optional
        If method is "character", indicates if *spaces* between words are
        counted as part of the n-gram. For instance, with the input SArray
        element of "fun games", if this parameter is set to False one
        tri-gram would be 'n g'. If ``ignore_space`` is set to True, there
        would be no such tri-gram (there would still be 'nga'). This
        parameter has no effect if the method is set to "word".

    Returns
    -------
    out : SArray[dict]
        An SArray of dictionary type, where each key is the n-gram string
        and each value is its count.

    See Also
    --------
    count_words, tokenize

    Notes
    -----
    - Ignoring case (with ``to_lower``) involves a full string copy of the
      SArray data. To increase speed for large documents, set ``to_lower``
      to False.

    - Punctuation and spaces are both delimiters by default when counting
      word n-grams. When counting character n-grams, one may choose to
      ignore punctuation, spaces, neither, or both.

    References
    ----------
    - `N-gram wikipedia article <http://en.wikipedia.org/wiki/N-gram>`_
    - `Penn treebank tokenization <https://web.archive.org/web/19970614072242/http://www.cis.upenn.edu:80/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        # Counting word n-grams:
        >>> sa = turicreate.SArray(['I like big dogs. I LIKE BIG DOGS.'])
        >>> turicreate.text_analytics.count_ngrams(sa, 3)
        dtype: dict
        Rows: 1
        [{'big dogs i': 1, 'like big dogs': 2, 'dogs i like': 1, 'i like big': 2}]

        # Counting character n-grams:
        >>> sa = turicreate.SArray(['Fun. Is. Fun'])
        >>> turicreate.text_analytics.count_ngrams(sa, 3, "character")
        dtype: dict
        Rows: 1
        [{'fun': 2, 'nis': 1, 'sfu': 1, 'isf': 1, 'uni': 1}]

        # Run count_ngrams with dictionary input
        >>> sa = turicreate.SArray([{'alice bob': 1, 'Bob alice': 0.5},
        ...                         {'a dog': 0, 'a dog cat': 5}])
        >>> turicreate.text_analytics.count_ngrams(sa)
        dtype: dict
        Rows: 2
        [{'bob alice': 0.5, 'alice bob': 1}, {'dog cat': 5, 'a dog': 5}]

        # Run count_ngrams with list input
        >>> sa = turicreate.SArray([['one', 'bar bah'], ['a dog', 'a dog cat']])
        >>> turicreate.text_analytics.count_ngrams(sa)
        dtype: dict
        Rows: 2
        [{'bar bah': 1}, {'dog cat': 1, 'a dog': 2}]
    """
    _raise_error_if_not_sarray(sa, "sa")

    ## Compute n-gram counts
    sf = _turicreate.SFrame({'docs': sa})
    fe = _feature_engineering.NGramCounter(features='docs',
                                           n=n,
                                           method=method,
                                           to_lower=to_lower,
                                           delimiters=delimiters,
                                           ignore_punct=ignore_punct,
                                           ignore_space=ignore_space,
                                           output_column_prefix=None)
    output_sf = fe.fit_transform(sf)
    return output_sf['docs']

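# A pure-Python sketch of word n-gram counting, for intuition only; the
# library does this inside the NGramCounter transformer. Note this sketch
# splits only on whitespace, so punctuation stays attached to tokens,
# unlike the library's default delimiter list.
def count_word_ngrams_by_hand(text, n=2, to_lower=True):
    tokens = (text.lower() if to_lower else text).split()
    counts = {}
    for i in range(len(tokens) - n + 1):
        gram = " ".join(tokens[i:i + n])
        counts[gram] = counts.get(gram, 0) + 1
    return counts

print(count_word_ngrams_by_hand("I like big dogs. I LIKE BIG DOGS.", n=3))
# {'i like big': 2, 'like big dogs.': 2, 'big dogs. i': 1, 'dogs. i like': 1}
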
def count_words(text, to_lower=True, delimiters=DEFAULT_DELIMITERS):
    """
    If `text` is an SArray of strings or an SArray of lists of strings, the
    occurrences of each word are counted for each row in the SArray.

    If `text` is an SArray of dictionaries, the keys are tokenized and the
    values are the counts. Counts for the same word, in the same row, are
    added together.

    This output is commonly known as the "bag-of-words" representation of
    text data.

    Parameters
    ----------
    text : SArray[str | dict | list]
        SArray of type: string, dict or list.

    to_lower : bool, optional
        If True, all strings are converted to lower case before counting.

    delimiters : list[str], None, optional
        Input strings are tokenized using the delimiter characters in this
        list. Each entry in this list must contain a single character. If
        set to `None`, then a Penn treebank-style tokenization is used,
        which contains smart handling of punctuation.

    Returns
    -------
    out : SArray[dict]
        An SArray with the same length as the `text` input. For each row,
        the keys of the dictionary are the words and the values are the
        corresponding counts.

    See Also
    --------
    count_ngrams, tf_idf, tokenize

    References
    ----------
    - `Bag of words model <http://en.wikipedia.org/wiki/Bag-of-words_model>`_
    - `Penn treebank tokenization <https://web.archive.org/web/19970614072242/http://www.cis.upenn.edu:80/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        # Create input data
        >>> sa = turicreate.SArray(["The quick brown fox jumps.",
        ...                         "Word word WORD, word!!!word"])

        # Run count_words
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'quick': 1, 'brown': 1, 'the': 1, 'fox': 1, 'jumps.': 1},
         {'word,': 1, 'word!!!word': 1, 'word': 2}]

        # Run count_words with Penn treebank style tokenization to handle
        # punctuation
        >>> turicreate.text_analytics.count_words(sa, delimiters=None)
        dtype: dict
        Rows: 2
        [{'brown': 1, 'jumps': 1, 'fox': 1, '.': 1, 'quick': 1, 'the': 1},
         {'word': 3, 'word!!!word': 1, ',': 1}]

        # Run count_words with dictionary input
        >>> sa = turicreate.SArray([{'alice bob': 1, 'Bob alice': 0.5},
        ...                         {'a dog': 0, 'a dog cat': 5}])
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bob': 1.5, 'alice': 1.5}, {'a': 5, 'dog': 5, 'cat': 5}]

        # Run count_words with list input
        >>> sa = turicreate.SArray([['one', 'bar bah'], ['a dog', 'a dog cat']])
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bar': 1, 'bah': 1, 'one': 1}, {'a': 2, 'dog': 2, 'cat': 1}]
    """
    _raise_error_if_not_sarray(text, "text")

    ## Compute word counts
    sf = _turicreate.SFrame({"docs": text})
    fe = _feature_engineering.WordCounter(
        features="docs",
        to_lower=to_lower,
        delimiters=delimiters,
        output_column_prefix=None,
    )
    output_sf = fe.fit_transform(sf)
    return output_sf["docs"]

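# A pure-Python sketch of the bag-of-words computation for the string case,
# for intuition only; the WordCounter transformer also handles dict and list
# inputs, and this sketch splits only on whitespace.
def count_words_by_hand(text, to_lower=True):
    tokens = (text.lower() if to_lower else text).split()
    counts = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1
    return counts

print(count_words_by_hand("Word word WORD, word!!!word"))
# {'word': 2, 'word,': 1, 'word!!!word': 1}
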
def test_predict(self):
    model = self.model
    for output_type in ['class', 'probability_vector']:
        preds = model.predict(self.sf.head(), output_type=output_type)
        _raise_error_if_not_sarray(preds)
        self.assertEqual(len(preds), len(self.sf.head()))