def test_sarray(self):
    model = self.model
    # `data` is assumed to be a module-level test fixture (an SFrame built during setup).
    sa = data[self.feature]
    predictions = model.predict(sa)
    _raise_error_if_not_sarray(predictions)
    predictions = model.predict_topk(sa, k=2)
    _raise_error_if_not_sframe(predictions)
    predictions = model.classify(sa)
    _raise_error_if_not_sframe(predictions)

def test_sarray(self):
    model = self.model
    data = self.sf[self.feature]
    predictions = model.predict(data)
    _raise_error_if_not_sarray(predictions)
    predictions = model.predict_topk(data)
    _raise_error_if_not_sframe(predictions)
    predictions = model.classify(data)
    _raise_error_if_not_sframe(predictions)

def test_predict(self): model = self.model for output_type in ["class", "probability_vector"]: preds = model.predict(data.head(), output_type=output_type) _raise_error_if_not_sarray(preds) self.assertEqual(len(preds), len(data.head())) if output_type == "class": self.assertTrue(all(preds[:5] == "white")) self.assertTrue(all(preds[5:] == "black"))
def test_predict(self):
    model = self.model
    for output_type in ['class', 'probability_vector']:
        preds = model.predict(data.head(), output_type=output_type)
        _raise_error_if_not_sarray(preds)
        self.assertEqual(len(preds), len(data.head()))
        if output_type == 'class':
            self.assertTrue(all(preds[:5] == 'white'))
            self.assertTrue(all(preds[5:] == 'black'))

def _supervised_evaluation_error_checking(targets, predictions):
    """
    Perform basic error checking for the evaluation metrics. Check types and
    sizes of the inputs.
    """
    _raise_error_if_not_sarray(targets, "targets")
    _raise_error_if_not_sarray(predictions, "predictions")
    if len(targets) != len(predictions):
        raise _ToolkitError(
            "Input SArrays 'targets' and 'predictions' must be of the same length.")

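# A minimal usage sketch of the guard above inside a metric. The `accuracy`
# helper below is an illustrative assumption, not the library's actual
# evaluation entry point.
import turicreate as tc

def accuracy(targets, predictions):
    _supervised_evaluation_error_checking(targets, predictions)
    # Fraction of positions where the prediction matches the target.
    return (targets == predictions).sum() / float(len(targets))

targets = tc.SArray(["black", "white", "white"])
predictions = tc.SArray(["black", "white", "black"])
print(accuracy(targets, predictions))  # 0.666...
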
def test_predict(self):
    # default ('class') output_type
    predictions = self.model.predict(self.data['audio'])
    _raise_error_if_not_sarray(predictions)
    self.assertEqual(len(predictions), len(self.data))
    for a, b in zip(predictions, self.data['labels']):
        self.assertEqual(a, b)

    # 'probability' output_type
    if self.is_binary_classification:
        predictions = self.model.predict(self.data['audio'],
                                         output_type='probability')
        _raise_error_if_not_sarray(predictions)
        self.assertEqual(len(predictions), len(self.data))
        for probabilities, correct_label in zip(predictions, self.data['labels']):
            # correct value has highest probability?
            correct_index = self.model.classes.index(correct_label)
            self.assertEqual(np.argmax(probabilities), correct_index)
            # all probabilities sum close to 1?
            self.assertTrue(abs(np.sum(probabilities) - 1) < 0.00001)
    else:
        # 'probability' output type only supported for binary classification
        with self.assertRaises(ToolkitError):
            self.model.predict(self.data['audio'], output_type='probability')

    # 'probability_vector' output_type
    predictions = self.model.predict(self.data['audio'],
                                     output_type='probability_vector')
    _raise_error_if_not_sarray(predictions)
    self.assertEqual(len(predictions), len(self.data))
    for prob_vector, correct_label in zip(predictions, self.data['labels']):
        # correct value has highest probability?
        correct_index = self.model.classes.index(correct_label)
        self.assertEqual(np.argmax(prob_vector), correct_index)
        # all probabilities sum close to 1?
        self.assertTrue(abs(np.sum(prob_vector) - 1) < 0.00001)

    # predict with single (dict) example
    single_prediction = self.model.predict(self.data['audio'][0])
    _raise_error_if_not_sarray(single_prediction)
    self.assertEqual(len(single_prediction), 1)
    self.assertEqual(single_prediction[0], self.data['labels'][0])

    # predict with SFrame
    data = self.data.copy()
    del data['labels']
    predictions = self.model.predict(data)
    _raise_error_if_not_sarray(predictions)
    self.assertEqual(len(predictions), len(data))
    for a, b in zip(predictions, self.data['labels']):
        self.assertEqual(a, b)

def stack_annotations(annotations_sarray):
    """
    Converts object detection annotations (ground truth or predictions) to
    stacked format (an `SFrame` where each row is one object instance).

    Parameters
    ----------
    annotations_sarray : SArray
        An `SArray` with unstacked predictions, exactly formatted as the
        annotations column when training an object detector or when making
        predictions.

    Returns
    -------
    annotations_sframe : An `SFrame` with stacked annotations.

    See also
    --------
    unstack_annotations

    Examples
    --------
    Predictions are returned by the object detector in unstacked format:

    >>> predictions = detector.predict(images)

    By converting it to stacked format, it is easier to get an overview of
    object instances:

    >>> turicreate.object_detector.util.stack_annotations(predictions)
    Data:
    +--------+------------+-------+-------+-------+-------+--------+
    | row_id | confidence | label |   x   |   y   | width | height |
    +--------+------------+-------+-------+-------+-------+--------+
    |   0    |    0.98    |  dog  | 123.0 | 128.0 |  80.0 | 182.0  |
    |   0    |    0.67    |  cat  | 150.0 | 183.0 | 129.0 | 101.0  |
    |   1    |    0.8     |  dog  |  50.0 | 432.0 |  65.0 |  98.0  |
    +--------+------------+-------+-------+-------+-------+--------+
    [3 rows x 7 columns]
    """
    _raise_error_if_not_sarray(annotations_sarray,
                               variable_name="annotations_sarray")
    sf = _tc.SFrame({"annotations": annotations_sarray}).add_row_number("row_id")
    sf = sf.stack("annotations", new_column_name="annotations", drop_na=True)
    if len(sf) == 0:
        cols = ["row_id", "confidence", "label", "height", "width", "x", "y"]
        return _tc.SFrame({k: [] for k in cols})
    sf = sf.unpack("annotations", column_name_prefix="")
    sf = sf.unpack("coordinates", column_name_prefix="")
    del sf["type"]
    return sf

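# A self-contained usage sketch for stack_annotations. The dictionaries below
# follow the documented object-detector annotation format (one list of
# {'label', 'type', 'confidence', 'coordinates'} dicts per image); the values
# themselves are made up for illustration.
import turicreate as tc

predictions = tc.SArray([
    [{'label': 'dog', 'type': 'rectangle', 'confidence': 0.98,
      'coordinates': {'x': 123.0, 'y': 128.0, 'width': 80.0, 'height': 182.0}}],
    [{'label': 'dog', 'type': 'rectangle', 'confidence': 0.8,
      'coordinates': {'x': 50.0, 'y': 432.0, 'width': 65.0, 'height': 98.0}}],
])
stacked = tc.object_detector.util.stack_annotations(predictions)
# One row per object instance; row_id links each instance back to its image.
print(stacked)
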
def tf_idf(dataset):
    """
    Compute the TF-IDF scores for each word in each document. The collection
    of documents must be in bag-of-words format.

    .. math::
        \mbox{TF-IDF}(w, d) = tf(w, d) * log(N / f(w))

    where :math:`tf(w, d)` is the number of times word :math:`w` appeared
    in document :math:`d`, :math:`f(w)` is the number of documents word
    :math:`w` appeared in, :math:`N` is the number of documents, and we use
    the natural logarithm.

    This function is implemented using the `TFIDF` transformer from the
    feature engineering module.

    Parameters
    ----------
    dataset : SArray[str | dict | list]
        Input text data.

    Returns
    -------
    out : SArray[dict]
        The same document corpus where each score has been replaced by the
        TF-IDF transformation.

    See Also
    --------
    count_words, count_ngrams, tokenize

    References
    ----------
    - `Wikipedia - TF-IDF <https://en.wikipedia.org/wiki/TFIDF>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        >>> docs = turicreate.SArray('https://static.turi.com/datasets/nips-text')
        >>> docs_tfidf = turicreate.text_analytics.tf_idf(docs)
    """
    _raise_error_if_not_sarray(dataset, "dataset")
    if len(dataset) == 0:
        return _turicreate.SArray()

    dataset = _turicreate.SFrame({'docs': dataset})
    scores = _feature_engineering.TFIDF('docs').fit_transform(dataset)
    return scores['docs']

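# A pure-Python sketch of the TF-IDF formula above, for intuition only; the
# library computes the same quantity through the TFIDF transformer.
import math

def tf_idf_by_hand(bags_of_words):
    n = len(bags_of_words)        # N: number of documents
    doc_freq = {}                 # f(w): number of documents containing w
    for bag in bags_of_words:
        for word in bag:
            doc_freq[word] = doc_freq.get(word, 0) + 1
    return [{w: tf * math.log(n / doc_freq[w]) for w, tf in bag.items()}
            for bag in bags_of_words]

docs = [{'dog': 2, 'cat': 1}, {'dog': 1, 'fish': 3}]
print(tf_idf_by_hand(docs))
# 'dog' appears in every document, so log(2/2) = 0 and its score vanishes.
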
def test_predict(self):
    sf = self.test.head()
    pred = self.model.predict(sf.head())

    # Check the structure of the output
    _raise_error_if_not_sarray(pred)
    self.assertEqual(len(pred), len(sf))

    # Make sure SFrame was not altered
    self.assertEqual(
        [col for col in sf.column_names() if col.startswith("_")], [])

    # Predict should work on no input (and produce no predictions)
    pred0 = self.model.predict(sf[self.feature][:0])
    self.assertEqual(len(pred0), 0)

def test_predict(self):
    sf = self.sf.head()

    # Make sure this does not need the annotations column to work
    del sf[self.annotations]

    pred = self.model.predict(sf.head())

    # Check the structure of the output
    _raise_error_if_not_sarray(pred)
    self.assertEqual(len(pred), len(sf))

    # Make sure SFrame was not altered
    self.assertEqual(
        [col for col in sf.column_names() if col.startswith('_')], [])

    # Predict should work on no input (and produce no predictions)
    pred0 = self.model.predict(sf[:0])
    self.assertEqual(len(pred0), 0)

def tokenize(sa, to_lower=False,
             delimiters=["\r", "\v", "\n", "\f", "\t", " "]):
    """
    tokenize(sa, to_lower=False, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "])

    Tokenize the input SArray of text strings and return the list of tokens.

    Parameters
    ----------
    sa : SArray[str]
        Input data of strings representing English text. This tokenizer is
        not intended to process XML, HTML, or other structured text formats.

    to_lower : bool, optional
        If True, all strings are converted to lower case before tokenization.

    delimiters : list[str], None, optional
        Input strings are tokenized using delimiter characters in this list.
        Each entry in this list must contain a single character. If set to
        `None`, then a Penn treebank-style tokenization is used, which
        contains smart handling of punctuation.

    Returns
    -------
    out : SArray[list]
        Each text string in the input is mapped to a list of tokens.

    See Also
    --------
    count_words, count_ngrams, tf_idf

    References
    ----------
    - `Penn treebank tokenization <https://web.archive.org/web/19970614072242/http://www.cis.upenn.edu:80/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        >>> docs = turicreate.SArray(['This is the first sentence.',
        ...                           'This one, it\'s the second sentence.'])

        # Default tokenization by space characters
        >>> turicreate.text_analytics.tokenize(docs)
        dtype: list
        Rows: 2
        [['This', 'is', 'the', 'first', 'sentence.'],
         ['This', 'one,', "it's", 'the', 'second', 'sentence.']]

        # Penn treebank-style tokenization
        >>> turicreate.text_analytics.tokenize(docs, delimiters=None)
        dtype: list
        Rows: 2
        [['This', 'is', 'the', 'first', 'sentence', '.'],
         ['This', 'one', ',', 'it', "'s", 'the', 'second', 'sentence', '.']]
    """
    _raise_error_if_not_sarray(sa, "sa")

    ## Tokenize the documents
    sf = _turicreate.SFrame({'docs': sa})
    fe = _feature_engineering.Tokenizer(features='docs',
                                        to_lower=to_lower,
                                        delimiters=delimiters,
                                        output_column_prefix=None)
    tokens = fe.fit_transform(sf)
    return tokens['docs']

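# A pure-Python sketch of the default delimiter-splitting behavior, for
# intuition only; the library does this inside the Tokenizer transformer.
import re

def tokenize_by_hand(text, to_lower=False,
                     delimiters=("\r", "\v", "\n", "\f", "\t", " ")):
    if to_lower:
        text = text.lower()
    # Split on any single delimiter character and drop empty tokens.
    pattern = "[" + re.escape("".join(delimiters)) + "]"
    return [tok for tok in re.split(pattern, text) if tok]

print(tokenize_by_hand("This is the first sentence."))
# ['This', 'is', 'the', 'first', 'sentence.']
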
def trim_rare_words(sa, threshold=2, to_lower=True,
                    delimiters=["\r", "\v", "\n", "\f", "\t", " "],
                    stopwords=None):
    '''
    Remove words that occur below a certain number of times in an SArray.
    This is a common method of cleaning text before it is used, and can
    increase the quality and explainability of the models learned on the
    transformed data.

    RareWordTrimmer can be applied to string-, dictionary-, and list-typed
    data in an SArray.

    * **string** : The string is first tokenized. By default, all letters are
      first converted to lower case, then tokenized by space characters. Each
      token is taken to be a word, and the words occurring below a threshold
      number of times across the entire column are removed, then the
      remaining tokens are concatenated back into a string.

    * **list** : Each element of the list must be a string, where each
      element is assumed to be a token. The tokens are then filtered by their
      number of occurrences against the threshold value.

    * **dict** : The method first obtains the list of keys in the dictionary.
      This list is then processed as a standard list, except the value of
      each key must be of integer type and is considered to be the count of
      that key.

    Parameters
    ----------
    sa : SArray[str | dict | list]
        The input text data.

    threshold : int, optional
        The count below which words are removed from the input.

    stopwords : list[str], optional
        A manually specified list of stopwords, which are removed regardless
        of count.

    to_lower : bool, optional
        Indicates whether to map the input strings to lower case before
        counting.

    delimiters : list[string], optional
        A list of delimiter characters for tokenization. By default, the list
        is defined to be the list of space characters. The user can define
        any custom list of single-character delimiters. Alternatively,
        setting `delimiters=None` will use a Penn treebank type tokenization,
        which is better at handling punctuation. (See reference below for
        details.)

    Returns
    -------
    out : SArray
        An SArray with words below the threshold removed.

    See Also
    --------
    count_ngrams, tf_idf, tokenize

    References
    ----------
    - `Penn treebank tokenization <https://web.archive.org/web/19970614072242/http://www.cis.upenn.edu:80/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        # Create input data
        >>> sa = turicreate.SArray(["The quick brown fox jumps in a fox like way.",
        ...                         "Word word WORD, word!!!word"])

        # Run trim_rare_words
        >>> turicreate.text_analytics.trim_rare_words(sa)
        dtype: str
        Rows: 2
        ['fox fox', 'word word']

        # Run trim_rare_words with Penn treebank style tokenization to handle
        # punctuation
        >>> turicreate.text_analytics.trim_rare_words(sa, delimiters=None)
        dtype: str
        Rows: 2
        ['fox fox', 'word word word']

        # Run trim_rare_words with dictionary input
        >>> sa = turicreate.SArray([{'alice bob': 1, 'Bob alice': 2},
        ...                         {'a dog': 0, 'a dog cat': 5}])
        >>> turicreate.text_analytics.trim_rare_words(sa)
        dtype: dict
        Rows: 2
        [{'bob alice': 2}, {'a dog cat': 5}]

        # Run trim_rare_words with list input
        >>> sa = turicreate.SArray([['one', 'bar bah', 'One'],
        ...                         ['a dog', 'a dog cat', 'A DOG']])
        >>> turicreate.text_analytics.trim_rare_words(sa)
        dtype: list
        Rows: 2
        [['one', 'one'], ['a dog', 'a dog']]
    '''
    _raise_error_if_not_sarray(sa, "sa")

    ## Remove rare words
    sf = _turicreate.SFrame({'docs': sa})
    fe = _feature_engineering.RareWordTrimmer(features='docs',
                                              threshold=threshold,
                                              to_lower=to_lower,
                                              delimiters=delimiters,
                                              stopwords=stopwords,
                                              output_column_prefix=None)
    tokens = fe.fit_transform(sf)
    return tokens['docs']

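# A pure-Python sketch of the string case, for intuition only: tokenize,
# count, then keep tokens whose count meets the threshold. Unlike the
# library, which counts across the entire column, this sketch counts within
# a single string; the two coincide for the one-row example below.
def trim_rare_words_by_hand(text, threshold=2, to_lower=True):
    tokens = (text.lower() if to_lower else text).split()
    counts = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1
    return " ".join(tok for tok in tokens if counts[tok] >= threshold)

print(trim_rare_words_by_hand("Word word WORD, word!!!word"))
# 'word word' -- 'word,' and 'word!!!word' each occur only once
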
def count_words(sa, to_lower=True,
                delimiters=["\r", "\v", "\n", "\f", "\t", " "]):
    """
    count_words(sa, to_lower=True, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "])

    Convert the content of string/dict/list type SArrays to a dictionary of
    (word, count) pairs. Dictionary keys and list elements must be strings.

    The strings are first tokenized into words according to the specified
    `to_lower` and `delimiters` options. Then, word counts are accumulated.
    In each output dictionary, the keys are the words in the corresponding
    input data entry, and the values are the number of times each word
    appears. By default, words are split on all whitespace and newline
    characters.

    The output is commonly known as the "bag-of-words" representation of
    text data.

    This function is implemented using the `WordCounter` transformer from
    the feature engineering module.

    Parameters
    ----------
    sa : SArray[str | dict | list]
        Input data to be tokenized and counted.

    to_lower : bool, optional
        If True, all strings are converted to lower case before counting.

    delimiters : list[str], None, optional
        Input strings are tokenized using delimiter characters in this list.
        Each entry in this list must contain a single character. If set to
        `None`, then a Penn treebank-style tokenization is used, which
        contains smart handling of punctuation.

    Returns
    -------
    out : SArray[dict]
        Each entry contains a dictionary with the frequency count of each
        word in the corresponding input entry.

    See Also
    --------
    count_ngrams, tf_idf, tokenize

    References
    ----------
    - `Bag of words model <http://en.wikipedia.org/wiki/Bag-of-words_model>`_
    - `Penn treebank tokenization <https://web.archive.org/web/19970614072242/http://www.cis.upenn.edu:80/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        # Create input data
        >>> sa = turicreate.SArray(["The quick brown fox jumps.",
        ...                         "Word word WORD, word!!!word"])

        # Run count_words
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'quick': 1, 'brown': 1, 'the': 1, 'fox': 1, 'jumps.': 1},
         {'word,': 1, 'word!!!word': 1, 'word': 2}]

        # Run count_words with Penn treebank style tokenization to handle
        # punctuation
        >>> turicreate.text_analytics.count_words(sa, delimiters=None)
        dtype: dict
        Rows: 2
        [{'brown': 1, 'jumps': 1, 'fox': 1, '.': 1, 'quick': 1, 'the': 1},
         {'word': 3, 'word!!!word': 1, ',': 1}]

        # Run count_words with dictionary input
        >>> sa = turicreate.SArray([{'alice bob': 1, 'Bob alice': 0.5},
        ...                         {'a dog': 0, 'a dog cat': 5}])
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bob': 1.5, 'alice': 1.5}, {'a': 5, 'dog': 5, 'cat': 5}]

        # Run count_words with list input
        >>> sa = turicreate.SArray([['one', 'bar bah'], ['a dog', 'a dog cat']])
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bar': 1, 'bah': 1, 'one': 1}, {'a': 2, 'dog': 2, 'cat': 1}]
    """
    _raise_error_if_not_sarray(sa, "sa")

    ## Compute word counts
    sf = _turicreate.SFrame({'docs': sa})
    fe = _feature_engineering.WordCounter(features='docs',
                                          to_lower=to_lower,
                                          delimiters=delimiters,
                                          output_column_prefix=None)
    output_sf = fe.fit_transform(sf)
    return output_sf['docs']

def count_ngrams(sa, n=2, method="word", to_lower=True,
                 delimiters=["\r", "\v", "\n", "\f", "\t", " ",
                             "!", "#", "$", "%", "&", "'", "(", ")",
                             "*", "+", ",", "-", ".", "/", ":", ";",
                             "<", "=", ">", "?", "@", "[", "\\", "]",
                             "^", "_", "`", "{", "|", "}", "~"],
                 ignore_punct=True, ignore_space=True):
    """
    count_ngrams(sa, n=2, method="word", to_lower=True,
    delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " ", "!", "#", "$",
    "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<", "=",
    ">", "?", "@", "[", "\\\\", "]", "^", "_", "`", "{", "|", "}", "~"],
    ignore_punct=True, ignore_space=True)

    Return an SArray of ``dict`` type where each element contains the count
    for each of the n-grams that appear in the corresponding input element.
    The n-grams can be specified to be either character n-grams or word
    n-grams. The input SArray could contain strings, dicts with string keys
    and numeric values, or lists of strings.

    This function is implemented using the `NGramCounter` transformer from
    the feature engineering module.

    Parameters
    ----------
    sa : SArray[str | dict | list]
        Input text data.

    n : int, optional
        The number of words in each n-gram. An ``n`` value of 1 returns word
        counts.

    method : {'word', 'character'}, optional
        If "word", the function performs a count of word n-grams. If
        "character", does a character n-gram count.

    to_lower : bool, optional
        If True, all words are converted to lower case before counting.

    delimiters : list[str], None, optional
        If method is "word", input strings are tokenized using delimiter
        characters in this list. Each entry in this list must contain a
        single character. If set to `None`, then a Penn treebank-style
        tokenization is used, which contains smart handling of punctuation.
        If method is "character", this option is ignored.

    ignore_punct : bool, optional
        If method is "character", indicates if *punctuation* between words is
        counted as part of the n-gram. For instance, with the input SArray
        element of "fun.games", if this parameter is set to False one
        tri-gram would be 'n.g'. If ``ignore_punct`` is set to True, there
        would be no such tri-gram (there would still be 'nga'). This
        parameter has no effect if the method is set to "word".

    ignore_space : bool, optional
        If method is "character", indicates if *spaces* between words are
        counted as part of the n-gram. For instance, with the input SArray
        element of "fun games", if this parameter is set to False one
        tri-gram would be 'n g'. If ``ignore_space`` is set to True, there
        would be no such tri-gram (there would still be 'nga'). This
        parameter has no effect if the method is set to "word".

    Returns
    -------
    out : SArray[dict]
        An SArray of dictionary type, where each key is the n-gram string
        and each value is its count.

    See Also
    --------
    count_words, tokenize

    Notes
    -----
    - Ignoring case (with ``to_lower``) involves a full string copy of the
      SArray data. To increase speed for large documents, set ``to_lower``
      to False.

    - Punctuation and spaces are both delimiters by default when counting
      word n-grams. When counting character n-grams, one may choose to
      ignore punctuation, spaces, neither, or both.

    References
    ----------
    - `N-gram wikipedia article <http://en.wikipedia.org/wiki/N-gram>`_
    - `Penn treebank tokenization <https://web.archive.org/web/19970614072242/http://www.cis.upenn.edu:80/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        # Counting word n-grams:
        >>> sa = turicreate.SArray(['I like big dogs. I LIKE BIG DOGS.'])
        >>> turicreate.text_analytics.count_ngrams(sa, 3)
        dtype: dict
        Rows: 1
        [{'big dogs i': 1, 'like big dogs': 2, 'dogs i like': 1, 'i like big': 2}]

        # Counting character n-grams:
        >>> sa = turicreate.SArray(['Fun. Is. Fun'])
        >>> turicreate.text_analytics.count_ngrams(sa, 3, "character")
        dtype: dict
        Rows: 1
        [{'fun': 2, 'nis': 1, 'sfu': 1, 'isf': 1, 'uni': 1}]

        # Run count_ngrams with dictionary input
        >>> sa = turicreate.SArray([{'alice bob': 1, 'Bob alice': 0.5},
        ...                         {'a dog': 0, 'a dog cat': 5}])
        >>> turicreate.text_analytics.count_ngrams(sa)
        dtype: dict
        Rows: 2
        [{'bob alice': 0.5, 'alice bob': 1}, {'dog cat': 5, 'a dog': 5}]

        # Run count_ngrams with list input
        >>> sa = turicreate.SArray([['one', 'bar bah'], ['a dog', 'a dog cat']])
        >>> turicreate.text_analytics.count_ngrams(sa)
        dtype: dict
        Rows: 2
        [{'bar bah': 1}, {'dog cat': 1, 'a dog': 2}]
    """
    _raise_error_if_not_sarray(sa, "sa")

    ## Compute n-gram counts
    sf = _turicreate.SFrame({'docs': sa})
    fe = _feature_engineering.NGramCounter(features='docs',
                                           n=n,
                                           method=method,
                                           to_lower=to_lower,
                                           delimiters=delimiters,
                                           ignore_punct=ignore_punct,
                                           ignore_space=ignore_space,
                                           output_column_prefix=None)
    output_sf = fe.fit_transform(sf)
    return output_sf['docs']

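# A pure-Python sketch of word n-gram counting, for intuition only; the
# library does this inside the NGramCounter transformer. Note this sketch
# splits only on whitespace, so punctuation stays attached to tokens,
# unlike the library's default delimiter list.
def count_word_ngrams_by_hand(text, n=2, to_lower=True):
    tokens = (text.lower() if to_lower else text).split()
    counts = {}
    for i in range(len(tokens) - n + 1):
        gram = " ".join(tokens[i:i + n])
        counts[gram] = counts.get(gram, 0) + 1
    return counts

print(count_word_ngrams_by_hand("I like big dogs. I LIKE BIG DOGS.", n=3))
# {'i like big': 2, 'like big dogs.': 2, 'big dogs. i': 1, 'dogs. i like': 1}
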
def count_words(text, to_lower=True, delimiters=DEFAULT_DELIMITERS):
    """
    If `text` is an SArray of strings or an SArray of lists of strings, the
    occurrences of each word are counted for each row in the SArray.

    If `text` is an SArray of dictionaries, the keys are tokenized and the
    values are the counts. Counts for the same word, in the same row, are
    added together.

    This output is commonly known as the "bag-of-words" representation of
    text data.

    Parameters
    ----------
    text : SArray[str | dict | list]
        SArray of type: string, dict or list.

    to_lower : bool, optional
        If True, all strings are converted to lower case before counting.

    delimiters : list[str], None, optional
        Input strings are tokenized using the delimiter characters in this
        list. Each entry in this list must contain a single character. If
        set to `None`, then a Penn treebank-style tokenization is used,
        which contains smart handling of punctuation.

    Returns
    -------
    out : SArray[dict]
        An SArray with the same length as the `text` input. For each row,
        the keys of the dictionary are the words and the values are the
        corresponding counts.

    See Also
    --------
    count_ngrams, tf_idf, tokenize

    References
    ----------
    - `Bag of words model <http://en.wikipedia.org/wiki/Bag-of-words_model>`_
    - `Penn treebank tokenization <https://web.archive.org/web/19970614072242/http://www.cis.upenn.edu:80/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        # Create input data
        >>> sa = turicreate.SArray(["The quick brown fox jumps.",
        ...                         "Word word WORD, word!!!word"])

        # Run count_words
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'quick': 1, 'brown': 1, 'the': 1, 'fox': 1, 'jumps.': 1},
         {'word,': 1, 'word!!!word': 1, 'word': 2}]

        # Run count_words with Penn treebank style tokenization to handle
        # punctuation
        >>> turicreate.text_analytics.count_words(sa, delimiters=None)
        dtype: dict
        Rows: 2
        [{'brown': 1, 'jumps': 1, 'fox': 1, '.': 1, 'quick': 1, 'the': 1},
         {'word': 3, 'word!!!word': 1, ',': 1}]

        # Run count_words with dictionary input
        >>> sa = turicreate.SArray([{'alice bob': 1, 'Bob alice': 0.5},
        ...                         {'a dog': 0, 'a dog cat': 5}])
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bob': 1.5, 'alice': 1.5}, {'a': 5, 'dog': 5, 'cat': 5}]

        # Run count_words with list input
        >>> sa = turicreate.SArray([['one', 'bar bah'], ['a dog', 'a dog cat']])
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bar': 1, 'bah': 1, 'one': 1}, {'a': 2, 'dog': 2, 'cat': 1}]
    """
    _raise_error_if_not_sarray(text, "text")

    ## Compute word counts
    sf = _turicreate.SFrame({"docs": text})
    fe = _feature_engineering.WordCounter(
        features="docs",
        to_lower=to_lower,
        delimiters=delimiters,
        output_column_prefix=None,
    )
    output_sf = fe.fit_transform(sf)
    return output_sf["docs"]

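# A pure-Python sketch of the bag-of-words computation for the string case,
# for intuition only; the WordCounter transformer also handles dict and list
# inputs, and this sketch splits only on whitespace.
def count_words_by_hand(text, to_lower=True):
    tokens = (text.lower() if to_lower else text).split()
    counts = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1
    return counts

print(count_words_by_hand("Word word WORD, word!!!word"))
# {'word': 2, 'word,': 1, 'word!!!word': 1}
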
def test_predict(self):
    model = self.model
    for output_type in ['class', 'probability_vector']:
        preds = model.predict(self.sf.head(), output_type=output_type)
        _raise_error_if_not_sarray(preds)
        self.assertEqual(len(preds), len(self.sf.head()))