def tf_idf(dataset):
    r"""
    Compute the TF-IDF scores for each word in each document. The collection
    of documents must be in bag-of-words format.

    .. math::

        \mbox{TF-IDF}(w, d) = tf(w, d) * log(N / f(w))

    where :math:`tf(w, d)` is the number of times word :math:`w` appeared in
    document :math:`d`, :math:`f(w)` is the number of documents word :math:`w`
    appeared in, :math:`N` is the number of documents, and we use the
    natural logarithm.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._tf_idf.TFIDF`.

    Parameters
    ----------
    dataset : SArray[str | dict | list]
        Input text data. See
        :py:class:`~graphlab.toolkits.feature_engineering._tf_idf.TFIDF`
        documentation for details on how string, dict, and list inputs are
        handled.

    Returns
    -------
    out : SArray[dict]
        The same document corpus where each score has been replaced by the
        TF-IDF transformation. If ``dataset`` has length 0, a new empty
        SArray is returned instead.

    See Also
    --------
    count_words, count_ngrams, tokenize,
    graphlab.toolkits.feature_engineering._tfidf.TFIDF

    References
    ----------
    - `Wikipedia - TF-IDF <https://en.wikipedia.org/wiki/TFIDF>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab
        >>> docs = graphlab.SArray('https://static.turi.com/datasets/nips-text')
        >>> docs_tfidf = graphlab.text_analytics.tf_idf(docs)
    """
    # NOTE: the docstring is a raw string because it contains the LaTeX
    # command ``\mbox``; in a normal string literal ``\m`` is an invalid
    # escape sequence and triggers a warning at compile time.
    _mt._get_metric_tracker().track('toolkit.text_analytics.tf_idf')

    _raise_error_if_not_sarray(dataset, "dataset")

    # Short-circuit on empty input: return an empty SArray without running
    # the TFIDF transformer.
    if len(dataset) == 0:
        return _graphlab.SArray()

    # The transformer operates on SFrame columns, so wrap the SArray in a
    # single-column SFrame and pull that column back out afterwards.
    dataset = _graphlab.SFrame({'docs': dataset})
    scores = _graphlab.feature_engineering.TFIDF('docs').fit_transform(dataset)

    return scores['docs']
def tf_idf(dataset):
    r"""
    Compute the TF-IDF scores for each word in each document. The collection
    of documents must be in bag-of-words format.

    .. math::

        \mbox{TF-IDF}(w, d) = tf(w, d) * log(N / f(w))

    where :math:`tf(w, d)` is the number of times word :math:`w` appeared in
    document :math:`d`, :math:`f(w)` is the number of documents word :math:`w`
    appeared in, :math:`N` is the number of documents, and we use the
    natural logarithm.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._tf_idf.TFIDF`.

    Parameters
    ----------
    dataset : SArray[str | dict | list]
        Input text data. See
        :py:class:`~graphlab.toolkits.feature_engineering._tf_idf.TFIDF`
        documentation for details on how string, dict, and list inputs are
        handled.

    Returns
    -------
    out : SArray[dict]
        The same document corpus where each score has been replaced by the
        TF-IDF transformation. If ``dataset`` has length 0, a new empty
        SArray is returned instead.

    See Also
    --------
    count_words, count_ngrams, tokenize,
    graphlab.toolkits.feature_engineering._tfidf.TFIDF

    References
    ----------
    - `Wikipedia - TF-IDF <https://en.wikipedia.org/wiki/TFIDF>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab
        >>> docs = graphlab.SArray('http://s3.amazonaws.com/dato-datasets/nips-text')
        >>> docs_tfidf = graphlab.text_analytics.tf_idf(docs)
    """
    # NOTE: the docstring is a raw string because it contains the LaTeX
    # command ``\mbox``; in a normal string literal ``\m`` is an invalid
    # escape sequence and triggers a warning at compile time.
    _mt._get_metric_tracker().track('toolkit.text_analytics.tf_idf')

    _raise_error_if_not_sarray(dataset, "dataset")

    # Short-circuit on empty input: return an empty SArray without running
    # the TFIDF transformer.
    if len(dataset) == 0:
        return _graphlab.SArray()

    # The transformer operates on SFrame columns, so wrap the SArray in a
    # single-column SFrame and pull that column back out afterwards.
    dataset = _graphlab.SFrame({'docs': dataset})
    scores = _graphlab.feature_engineering.TFIDF('docs').fit_transform(dataset)

    return scores['docs']
def _supervised_evaluation_error_checking(targets, predictions):
    """
    Validate the inputs handed to the evaluation metrics.

    Each argument is first passed through ``_raise_error_if_not_sarray``
    (``targets`` before ``predictions``); afterwards their sizes are compared.

    Parameters
    ----------
    targets : SArray
        The target values.

    predictions : SArray
        The predicted values. Must contain as many elements as ``targets``.

    Raises
    ------
    _ToolkitError
        If ``targets`` and ``predictions`` do not have the same size.
    """
    for candidate, label in ((targets, "targets"),
                             (predictions, "predictions")):
        _raise_error_if_not_sarray(candidate, label)

    # Matching lengths: nothing more to verify.
    if targets.size() == predictions.size():
        return

    raise _ToolkitError(
        "Input SArrays 'targets' and 'predictions' must be of the same length.")
def _supervised_evaluation_error_checking(targets, predictions):
    """
    Validate the inputs handed to the evaluation metrics.

    Each argument is first passed through ``_raise_error_if_not_sarray``
    (``targets`` before ``predictions``); afterwards their sizes are compared.

    Parameters
    ----------
    targets : SArray
        The target values.

    predictions : SArray
        The predicted values. Must contain as many elements as ``targets``.

    Raises
    ------
    _ToolkitError
        If ``targets`` and ``predictions`` do not have the same size.
    """
    for candidate, label in ((targets, "targets"),
                             (predictions, "predictions")):
        _raise_error_if_not_sarray(candidate, label)

    # Matching lengths: nothing more to verify.
    if targets.size() == predictions.size():
        return

    raise _ToolkitError(
        "Input SArrays 'targets' and 'predictions' must be of the same length.")
def split_by_sentence(sa):
    """
    Split every element of an SArray into its sentences.

    The SentenceSplitter takes SArrays of type string or list, and returns an
    SArray of type list of strings, where each element is a single sentence.
    If the input SArray is of type list, each element is either a list or a
    string. The list is flattened and concatenated, and then is split by
    sentence.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._sentence_splitter.SentenceSplitter`.

    Parameters
    ----------
    sa : SArray[str | list]
        Input data to be split by sentence.

    Returns
    -------
    out : SArray[list]
        Each element of the list is a sentence.

    See Also
    --------
    count_ngrams, tf_idf, tokenize,
    graphlab.toolkits.feature_engineering._sentence_splitter.SentenceSplitter

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        # Create input data
        >>> sa = graphlab.SArray(["The quick brown fox jumps.The slow brown fox"
        ...                       + " crawls"])

        # Run split_by_sentence
        >>> graphlab.text_analytics.split_by_sentence(sa)
        dtype: list
        Rows: 1
        [['The quick brown fox jumps.', 'The slow brown fox crawls']]

        # Input SArray of type list
        >>> sa = graphlab.SArray([["The quick brown fox jumps.",
        ...                        "The slow brown fox" + " crawls"]])

        # Run split_by_sentence
        >>> graphlab.text_analytics.split_by_sentence(sa)
        dtype: list
        Rows: 1
        [['The quick brown fox jumps.', 'The slow brown fox crawls']]
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.split_by_sentence')
    _raise_error_if_not_sarray(sa, "sa")

    # Wrap the SArray in a one-column SFrame, run the transformer on that
    # column, and return the transformed column.
    frame = _graphlab.SFrame({'docs': sa})
    splitter_options = dict(features='docs',
                            output_column_prefix=None,
                            verbose=False)
    splitter = _graphlab.feature_engineering.SentenceSplitter(**splitter_options)
    return splitter.fit_transform(frame)['docs']
def tokenize(sa, to_lower=False,
             delimiters=["\r", "\v", "\n", "\f", "\t", " "]):
    """
    tokenize(sa, to_lower=False, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "])

    Tokenize the input SArray of text strings and return the list of tokens.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._tokenizer.Tokenizer`.
    Please refer to the Tokenizer documentation for details about how
    tokenization is done.

    Parameters
    ----------
    sa : SArray[str]
        Input data of strings representing English text. This tokenizer is not
        intended to process XML, HTML, or other structured text formats.

    to_lower : bool, optional
        If True, all strings are converted to lower case before tokenization.

    delimiters : list[str], None, optional
        Input strings are tokenized using delimiter characters in this list.
        Each entry in this list must contain a single character. If set to
        `None`, then a Penn treebank-style tokenization is used, which contains
        smart handling of punctuations.

    Returns
    -------
    out : SArray[list]
        Each text string in the input is mapped to a list of tokens.

    See Also
    --------
    count_words, count_ngrams, tf_idf,
    graphlab.toolkits.feature_engineering._tokenizer.Tokenizer

    References
    ----------
    - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        >>> docs = graphlab.SArray(['This is the first sentence.',
        ...                         "This one, it's the second sentence."])

        # Default tokenization by space characters
        >>> graphlab.text_analytics.tokenize(docs)
        dtype: list
        Rows: 2
        [['This', 'is', 'the', 'first', 'sentence.'],
         ['This', 'one,', "it's", 'the', 'second', 'sentence.']]

        # Penn treebank-style tokenization
        >>> graphlab.text_analytics.tokenize(docs, delimiters=None)
        dtype: list
        Rows: 2
        [['This', 'is', 'the', 'first', 'sentence', '.'],
         ['This', 'one', ',', 'it', "'s", 'the', 'second', 'sentence', '.']]
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.tokenize')
    _raise_error_if_not_sarray(sa, "sa")

    # Wrap the SArray in a one-column SFrame, tokenize that column, and
    # return the transformed column.
    frame = _graphlab.SFrame({'docs': sa})
    tokenizer_options = dict(features='docs',
                             to_lower=to_lower,
                             delimiters=delimiters,
                             output_column_prefix=None)
    tokenizer = _graphlab.feature_engineering.Tokenizer(**tokenizer_options)
    return tokenizer.fit_transform(frame)['docs']
def trim_rare_words(sa, threshold=2, to_lower=True,
                    delimiters=["\r", "\v", "\n", "\f", "\t", " "],
                    stopwords=None):
    """
    Remove words that occur below a certain number of times in an SArray.
    This is a common method of cleaning text before it is used, and can
    increase the quality and explainability of the models learned on the
    transformed data.

    RareWordTrimmer can be applied to all the string-, dictionary-, and
    list-typed columns in an SArray.

    * **string** : The string is first tokenized. By default, all letters are
      first converted to lower case, then tokenized by space characters. Each
      token is taken to be a word, and the words occurring below a threshold
      number of times across the entire column are removed, then the
      remaining tokens are concatenated back into a string.

    * **list** : Each element of the list must be a string, where each element
      is assumed to be a token. The remaining tokens are then filtered by
      count occurrences and a threshold value.

    * **dict** : The method first obtains the list of keys in the dictionary.
      This list is then processed as a standard list, except the value of each
      key must be of integer type and is considered to be the count of that
      key.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._word_trimmer.RareWordTrimmer`.

    Parameters
    ----------
    sa : SArray[str | dict | list]
        The input text data.

    threshold : int, optional
        The count below which words are removed from the input.

    stopwords : list[str], optional
        A manually specified list of stopwords, which are removed regardless
        of count.

    to_lower : bool, optional
        Indicates whether to map the input strings to lower case before
        counting.

    delimiters : list[string], optional
        A list of delimiter characters for tokenization. By default, the list
        is defined to be the list of space characters. The user can define
        any custom list of single-character delimiters. Alternatively, setting
        `delimiters=None` will use a Penn treebank type tokenization, which
        is better at handling punctuations. (See reference below for details.)

    Returns
    -------
    out : SArray.
        An SArray with words below a threshold removed.

    See Also
    --------
    count_ngrams, tf_idf, tokenize,
    graphlab.toolkits.feature_engineering._word_trimmer.RareWordTrimmer

    References
    ----------
    - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        # Create input data
        >>> sa = graphlab.SArray(["The quick brown fox jumps in a fox like way.",
        ...                       "Word word WORD, word!!!word"])

        # Run trim_rare_words
        >>> graphlab.text_analytics.trim_rare_words(sa)
        dtype: str
        Rows: 2
        ['fox fox', 'word word']

        # Run trim_rare_words with Penn treebank style tokenization to handle
        # punctuations
        >>> graphlab.text_analytics.trim_rare_words(sa, delimiters=None)
        dtype: str
        Rows: 2
        ['fox fox', 'word word word']

        # Run trim_rare_words with dictionary input
        >>> sa = graphlab.SArray([{'alice bob': 1, 'Bob alice': 2},
        ...                       {'a dog': 0, 'a dog cat': 5}])
        >>> graphlab.text_analytics.trim_rare_words(sa)
        dtype: dict
        Rows: 2
        [{'bob alice': 2}, {'a dog cat': 5}]

        # Run trim_rare_words with list input
        >>> sa = graphlab.SArray([['one', 'bar bah', 'One'],
        ...                       ['a dog', 'a dog cat', 'A DOG']])
        >>> graphlab.text_analytics.trim_rare_words(sa)
        dtype: list
        Rows: 2
        [['one', 'one'], ['a dog', 'a dog']]
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.trim_rare_words')
    _raise_error_if_not_sarray(sa, "sa")

    # Wrap the SArray in a one-column SFrame, trim that column, and return
    # the transformed column.
    frame = _graphlab.SFrame({'docs': sa})
    trimmer_options = dict(features='docs',
                           threshold=threshold,
                           to_lower=to_lower,
                           delimiters=delimiters,
                           stopwords=stopwords,
                           output_column_prefix=None)
    trimmer = _graphlab.feature_engineering.RareWordTrimmer(**trimmer_options)
    return trimmer.fit_transform(frame)['docs']
def count_ngrams(sa, n=2, method="word", to_lower=True,
                 delimiters=["\r", "\v", "\n", "\f", "\t", " ",
                             "!", "#", "$", "%", "&", "'", "(", ")",
                             "*", "+", ",", "-", ".", "/", ":", ";",
                             "<", "=", ">", "?", "@", "[", "\\", "]",
                             "^", "_", "`", "{", "|", "}", "~"],
                 ignore_punct=True, ignore_space=True):
    """
    count_ngrams(sa, n=2, method="word", to_lower=True, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " ", "!", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "\\\\", "]", "^", "_", "`", "{", "|", "}", "~"], ignore_punct=True, ignore_space=True)

    Return an SArray of ``dict`` type where each element contains the count
    for each of the n-grams that appear in the corresponding input element.
    The n-grams can be specified to be either character n-grams or word
    n-grams. The input SArray could contain strings, dicts with string keys
    and numeric values, or lists of strings.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._ngram_counter.NGramCounter`.

    Parameters
    ----------
    sa : SArray[str | dict | list]
        Input text data. See
        :py:class:`~graphlab.toolkits.feature_engineering._ngram_counter.NGramCounter`
        documentation for details on how string, dict, and list inputs are
        handled.

    n : int, optional
        The number of words in each n-gram. An ``n`` value of 1 returns word
        counts.

    method : {'word', 'character'}, optional
        If "word", the function performs a count of word n-grams. If
        "character", does a character n-gram count.

    to_lower : bool, optional
        If True, all words are converted to lower case before counting.

    delimiters : list[str], None, optional
        If method is "word", input strings are tokenized using delimiter
        characters in this list. Each entry in this list must contain a single
        character. If set to `None`, then a Penn treebank-style tokenization
        is used, which contains smart handling of punctuations. If method is
        "character," this option is ignored.

    ignore_punct : bool, optional
        If method is "character", indicates if *punctuations* between words
        are counted as part of the n-gram. For instance, with the input SArray
        element of "fun.games", if this parameter is set to False one tri-gram
        would be 'n.g'. If ``ignore_punct`` is set to True, there would be no
        such tri-gram (there would still be 'nga'). This parameter has no
        effect if the method is set to "word".

    ignore_space : bool, optional
        If method is "character", indicates if *spaces* between words are
        counted as part of the n-gram. For instance, with the input SArray
        element of "fun games", if this parameter is set to False one tri-gram
        would be 'n g'. If ``ignore_space`` is set to True, there would be no
        such tri-gram (there would still be 'nga'). This parameter has no
        effect if the method is set to "word".

    Returns
    -------
    out : SArray[dict]
        An SArray of dictionary type, where each key is the n-gram string and
        each value is its count.

    See Also
    --------
    count_words, tokenize,
    graphlab.toolkits.feature_engineering._ngram_counter.NGramCounter

    Notes
    -----
    - Ignoring case (with ``to_lower``) involves a full string copy of the
      SArray data. To increase speed for large documents, set ``to_lower`` to
      False.

    - Punctuation and spaces are both delimiters by default when counting
      word n-grams. When counting character n-grams, one may choose to ignore
      punctuations, spaces, neither, or both.

    References
    ----------
    - `N-gram wikipedia article <http://en.wikipedia.org/wiki/N-gram>`_
    - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        # Counting word n-grams:
        >>> sa = graphlab.SArray(['I like big dogs. I LIKE BIG DOGS.'])
        >>> graphlab.text_analytics.count_ngrams(sa, 3)
        dtype: dict
        Rows: 1
        [{'big dogs i': 1, 'like big dogs': 2, 'dogs i like': 1, 'i like big': 2}]

        # Counting character n-grams:
        >>> sa = graphlab.SArray(['Fun. Is. Fun'])
        >>> graphlab.text_analytics.count_ngrams(sa, 3, "character")
        dtype: dict
        Rows: 1
        [{'fun': 2, 'nis': 1, 'sfu': 1, 'isf': 1, 'uni': 1}]

        # Run count_ngrams with dictionary input
        >>> sa = graphlab.SArray([{'alice bob': 1, 'Bob alice': 0.5},
        ...                       {'a dog': 0, 'a dog cat': 5}])
        >>> graphlab.text_analytics.count_ngrams(sa)
        dtype: dict
        Rows: 2
        [{'bob alice': 0.5, 'alice bob': 1}, {'dog cat': 5, 'a dog': 5}]

        # Run count_ngrams with list input
        >>> sa = graphlab.SArray([['one', 'bar bah'], ['a dog', 'a dog cat']])
        >>> graphlab.text_analytics.count_ngrams(sa)
        dtype: dict
        Rows: 2
        [{'bar bah': 1}, {'dog cat': 1, 'a dog': 2}]
    """
    # NOTE(review): the default ``delimiters`` list has no double-quote
    # character, unlike the other ASCII punctuation -- confirm intentional.
    _mt._get_metric_tracker().track('toolkit.text_analytics.count_ngrams')
    _raise_error_if_not_sarray(sa, "sa")

    # Wrap the SArray in a one-column SFrame, count n-grams on that column,
    # and return the transformed column.
    frame = _graphlab.SFrame({'docs': sa})
    counter_options = dict(features='docs',
                           n=n,
                           method=method,
                           to_lower=to_lower,
                           delimiters=delimiters,
                           ignore_punct=ignore_punct,
                           ignore_space=ignore_space,
                           output_column_prefix=None)
    counter = _graphlab.feature_engineering.NGramCounter(**counter_options)
    return counter.fit_transform(frame)['docs']
def count_words(sa, to_lower=True,
                delimiters=["\r", "\v", "\n", "\f", "\t", " "]):
    """
    count_words(sa, to_lower=True, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "])

    Convert the content of string/dict/list type SArrays to a dictionary of
    (word, count) pairs. Dictionary keys and list elements must be strings.
    The strings are first tokenized into words according to the specified
    `to_lower` and `delimiters` options. Then, word counts are accumulated.
    In each output dictionary, the keys are the words in the corresponding
    input data entry, and the values are the number of times the words
    appears. By default, words are split on all whitespace and newline
    characters. The output is commonly known as the "bag-of-words"
    representation of text data.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._word_counter.WordCounter`.

    Parameters
    ----------
    sa : SArray[str | dict | list]
        Input data to be tokenized and counted. See
        :py:class:`~graphlab.toolkits.feature_engineering._word_counter.WordCounter`
        documentation for details on how string, dict, and list inputs are
        handled.

    to_lower : bool, optional
        If True, all strings are converted to lower case before counting.

    delimiters : list[str], None, optional
        Input strings are tokenized using delimiter characters in this list.
        Each entry in this list must contain a single character. If set to
        `None`, then a Penn treebank-style tokenization is used, which contains
        smart handling of punctuations.

    Returns
    -------
    out : SArray[dict]
        Each entry contains a dictionary with the frequency count of each
        word in the corresponding input entry.

    See Also
    --------
    count_ngrams, tf_idf, tokenize,
    graphlab.toolkits.feature_engineering._word_counter.WordCounter

    References
    ----------
    - `Bag of words model <http://en.wikipedia.org/wiki/Bag-of-words_model>`_
    - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        # Create input data
        >>> sa = graphlab.SArray(["The quick brown fox jumps.",
        ...                       "Word word WORD, word!!!word"])

        # Run count_words
        >>> graphlab.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'quick': 1, 'brown': 1, 'the': 1, 'fox': 1, 'jumps.': 1},
         {'word,': 1, 'word!!!word': 1, 'word': 2}]

        # Run count_words with Penn treebank style tokenization to handle
        # punctuations
        >>> graphlab.text_analytics.count_words(sa, delimiters=None)
        dtype: dict
        Rows: 2
        [{'brown': 1, 'jumps': 1, 'fox': 1, '.': 1, 'quick': 1, 'the': 1},
         {'word': 3, 'word!!!word': 1, ',': 1}]

        # Run count_words with dictionary input
        >>> sa = graphlab.SArray([{'alice bob': 1, 'Bob alice': 0.5},
        ...                       {'a dog': 0, 'a dog cat': 5}])
        >>> graphlab.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bob': 1.5, 'alice': 1.5}, {'a': 5, 'dog': 5, 'cat': 5}]

        # Run count_words with list input
        >>> sa = graphlab.SArray([['one', 'bar bah'], ['a dog', 'a dog cat']])
        >>> graphlab.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bar': 1, 'bah': 1, 'one': 1}, {'a': 2, 'dog': 2, 'cat': 1}]
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.count_words')
    _raise_error_if_not_sarray(sa, "sa")

    # Wrap the SArray in a one-column SFrame, count words on that column,
    # and return the transformed column.
    frame = _graphlab.SFrame({'docs': sa})
    counter_options = dict(features='docs',
                           to_lower=to_lower,
                           delimiters=delimiters,
                           output_column_prefix=None)
    counter = _graphlab.feature_engineering.WordCounter(**counter_options)
    return counter.fit_transform(frame)['docs']
def extract_parts_of_speech(sa, chosen_pos=[PartOfSpeech.ADJ]):
    """
    Extract the words that belong to the chosen parts of speech.

    This function takes SArrays of type string or list, along with a list of
    parts of speech. If the input SArray is of type list, each element must
    be of type list or string. The output is of type dict, where each key is
    a part of speech, and the values are bags-of-words of that part of
    speech.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._part_of_speech_extractor.PartOfSpeechExtractor`.

    Parameters
    ----------
    sa : SArray[str | list]
        Input data to extract certain parts of speech from.

    chosen_pos : list[graphlab.text_analytics.PartOfSpeech], optional
        List of parts of speech enumerations as found in the
        graphlab.text_analytics.parts_of_speech namespace. The transformer
        will only select words of this part of speech. By default it selects
        adjectives.

    Returns
    -------
    out : SArray[dict]
        Each element maps a part of speech from ``chosen_pos`` to a
        bag-of-words of the words belonging to it (see the examples below).

    See Also
    --------
    count_ngrams, tf_idf, tokenize,
    graphlab.toolkits.feature_engineering._part_of_speech_extractor.PartOfSpeechExtractor

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        # Create input data
        >>> sa = graphlab.SArray(["The quick brown fox jumps.The slow brown fox"
        ...                       + " crawls"])

        # Run extract_parts_of_speech
        >>> graphlab.text_analytics.extract_parts_of_speech(sa)
        dtype: dict
        Rows: 1
        [{'ADJ': {'quick': 1, 'brown': 1, 'slow': 1}}]

        # List type input
        # Create input data
        >>> sa = graphlab.SArray([["The quick brown fox jumps.",
        ...                        "The slow brown fox" + " crawls"]])

        # Run extract_parts_of_speech
        >>> graphlab.text_analytics.extract_parts_of_speech(sa)
        dtype: dict
        Rows: 1
        [{'ADJ': {'quick': 1, 'brown': 1, 'slow': 1}}]
    """
    _mt._get_metric_tracker().track(
        'toolkit.text_analytics.extract_parts_of_speech')
    _raise_error_if_not_sarray(sa, "sa")

    # Wrap the SArray in a one-column SFrame, run the extractor on that
    # column, and return the transformed column.
    frame = _graphlab.SFrame({'docs': sa})
    extractor_options = dict(features='docs',
                             chosen_pos=chosen_pos,
                             output_column_prefix=None,
                             verbose=False)
    extractor = _graphlab.feature_engineering.PartOfSpeechExtractor(
        **extractor_options)
    return extractor.fit_transform(frame)['docs']
def count_ngrams(sa, n=2, method="word", to_lower=True, ignore_space=True):
    """
    Return an SArray of ``dict`` type where each element contains the count
    for each of the n-grams that appear in the corresponding input element.
    The n-grams can be specified to be either character n-grams or word
    n-grams.

    Parameters
    ----------
    sa : SArray[str]
        Input text data.

    n : int, optional
        The number of words in each n-gram. An ``n`` value of 1 returns word
        counts.

    method : {'word', 'character'}, optional
        If "word", the function performs a count of word n-grams. If
        "character", does a character n-gram count.

    to_lower : bool, optional
        If True, all words are converted to lower case before counting.

    ignore_space : bool, optional
        If method is "character", indicates if *spaces* between words are
        counted as part of the n-gram. For instance, with the input SArray
        element of "fun games", if this parameter is set to False one tri-gram
        would be 'n g'. If ``ignore_space`` is set to True, there would be no
        such tri-gram (there would still be 'nga'). This parameter has no
        effect if the method is set to "word".

    Returns
    -------
    out : SArray[dict]
        An SArray of dictionary type, where each key is the n-gram string and
        each value is its count.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1, or if ``method`` is neither "word" nor
        "character".

    See Also
    --------
    count_words, tokenize,
    graphlab.toolkits.feature_engineering._ngram_counter.NGramCounter

    Notes
    -----
    - Ignoring case (with ``to_lower``) involves a full string copy of the
      SArray data. To increase speed for large documents, set ``to_lower`` to
      False.

    - Punctuation and spaces are both delimiters by default when counting
      word n-grams. When counting character n-grams, spaces can be kept or
      ignored through ``ignore_space``.

    - A warning is issued when ``method`` is "word" and ``n`` exceeds 5.

    References
    ----------
    - `N-gram wikipedia article <http://en.wikipedia.org/wiki/N-gram>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        # Counting word n-grams:
        >>> sa = graphlab.SArray(['I like big dogs. I LIKE BIG DOGS.'])
        >>> graphlab.text_analytics.count_ngrams(sa, 3)
        dtype: dict
        Rows: 1
        [{'big dogs i': 1, 'like big dogs': 2, 'dogs i like': 1, 'i like big': 2}]

        # Counting character n-grams:
        >>> sa = graphlab.SArray(['Fun. Is. Fun'])
        >>> graphlab.text_analytics.count_ngrams(sa, 3, "character")
        dtype: dict
        Rows: 1
        [{'fun': 2, 'nis': 1, 'sfu': 1, 'isf': 1, 'uni': 1}]
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.count_ngrams')

    _raise_error_if_not_sarray(sa, "sa")

    # Type-check the scalar options, in the same order as before.
    typed_options = (
        (to_lower, [bool]),
        (n, [int]),
        (method, [str]),
        (ignore_space, [bool]),
    )
    for value, accepted_types in typed_options:
        _raise_error_if_not_of_type(value, accepted_types)

    if n < 1:
        raise ValueError("Input 'n' must be greater than 0")

    if method not in ("word", "character"):
        raise ValueError("Invalid 'method' input value. Please input "
                         "either 'word' or 'character' ")

    if method == 'word' and n > 5:
        warnings.warn("It is unusual for n-grams to be of size larger than 5.")

    ngram_counter = _graphlab.extensions._count_ngrams
    return ngram_counter(sa, n, method, to_lower, ignore_space)
def tokenize(sa, to_lower=False,
             delimiters=["\r", "\v", "\n", "\f", "\t", " "]):
    """
    tokenize(sa, to_lower=False, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "])

    Tokenize the input SArray of text strings and return the list of tokens.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._tokenizer.Tokenizer`.
    Please refer to the Tokenizer documentation for details about how
    tokenization is done.

    Parameters
    ----------
    sa : SArray[str]
        Input data of strings representing English text. This tokenizer is not
        intended to process XML, HTML, or other structured text formats.

    to_lower : bool, optional
        If True, all strings are converted to lower case before tokenization.

    delimiters : list[str], None, optional
        Input strings are tokenized using delimiter characters in this list.
        Each entry in this list must contain a single character. If set to
        `None`, then a Penn treebank-style tokenization is used, which contains
        smart handling of punctuations.

    Returns
    -------
    out : SArray[list]
        Each text string in the input is mapped to a list of tokens.

    See Also
    --------
    count_words, count_ngrams, tf_idf,
    graphlab.toolkits.feature_engineering._tokenizer.Tokenizer

    References
    ----------
    - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        >>> docs = graphlab.SArray(['This is the first sentence.',
        ...                         "This one, it's the second sentence."])

        # Default tokenization by space characters
        >>> graphlab.text_analytics.tokenize(docs)
        dtype: list
        Rows: 2
        [['This', 'is', 'the', 'first', 'sentence.'],
         ['This', 'one,', "it's", 'the', 'second', 'sentence.']]

        # Penn treebank-style tokenization
        >>> graphlab.text_analytics.tokenize(docs, delimiters=None)
        dtype: list
        Rows: 2
        [['This', 'is', 'the', 'first', 'sentence', '.'],
         ['This', 'one', ',', 'it', "'s", 'the', 'second', 'sentence', '.']]
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.tokenize')
    _raise_error_if_not_sarray(sa, "sa")

    # Wrap the SArray in a one-column SFrame, tokenize that column, and
    # return the transformed column.
    frame = _graphlab.SFrame({'docs': sa})
    tokenizer_options = dict(features='docs',
                             to_lower=to_lower,
                             delimiters=delimiters,
                             output_column_prefix=None)
    tokenizer = _graphlab.feature_engineering.Tokenizer(**tokenizer_options)
    return tokenizer.fit_transform(frame)['docs']
def count_words(sa, to_lower=True,
                delimiters=["\r", "\v", "\n", "\f", "\t", " "]):
    """
    Convert the content of string type SArrays to a dictionary of
    (word, count) pairs. Dictionary keys and list elements must be strings.
    The strings are first tokenized into words according to the specified
    `to_lower` and `delimiters` options. Then, word counts are accumulated.
    In each output dictionary, the keys are the words in the corresponding
    input data entry, and the values are the number of times the words
    appears. By default, words are split on all whitespace and newline
    characters. The output is commonly known as the "bag-of-words"
    representation of text data.

    Parameters
    ----------
    sa : SArray[str]
        Input data to be tokenized and counted. See
        :py:class:`~graphlab.toolkits.feature_engineering._word_counter.WordCounter`
        documentation for details on how string, dict, and list inputs are
        handled.

    to_lower : bool, optional
        If True, all strings are converted to lower case before counting.

    delimiters : list[str], None, optional
        Input strings are tokenized using delimiter characters in this list.
        Each entry in this list must contain a single character. If set to
        `None`, then a Penn treebank-style tokenization is used, which contains
        smart handling of punctuations.

    Returns
    -------
    out : SArray[dict]
        Each entry contains a dictionary with the frequency count of each
        word in the corresponding input entry.

    See Also
    --------
    count_ngrams, tf_idf, tokenize,
    graphlab.toolkits.feature_engineering._word_counter.WordCounter

    References
    ----------
    - `Bag of words model <http://en.wikipedia.org/wiki/Bag-of-words_model>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        # Create input data
        >>> sa = graphlab.SArray(["The quick brown fox jumps.",
        ...                       "Word word WORD, word!!!word"])

        # Run count_words
        >>> graphlab.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'quick': 1, 'brown': 1, 'the': 1, 'fox': 1, 'jumps.': 1},
         {'word,': 1, 'word!!!word': 1, 'word': 2}]
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.count_words')

    _raise_error_if_not_sarray(sa, "sa")

    # Delegate the counting to the extension function.
    word_counter = _graphlab.extensions._count_words
    return word_counter(sa, to_lower, delimiters)