def _combine_mps_tasks(**tasks):
    """Merge the outputs of all finished tasks into one result dict.

    A task value of None means that task raised an exception; such tasks
    are counted as failed and otherwise skipped. Returns an OrderedDict
    with keys 'models' (list of models, or None if every model is None),
    'summary' (SFrame with 'metric', 'metadata' and 'parameters' dict
    columns, one row per completed task) and 'status' (completed/failed
    counts).
    """
    models, evaluations, parameters, metadatas = [], [], [], []
    status = {'Failed': 0, 'Completed': 0}
    for result in tasks.values():
        # A None result means an exception occurred inside the task.
        if result is None:
            status['Failed'] += 1
            continue
        models.append(result['model'])
        evaluations.append(result['evaluation'])
        parameters.append(result['parameters'])
        metadatas.append(result['metadata'])
        status['Completed'] += 1

    if not any(m is not None for m in models):
        models = None
    if not any(p is not None and len(p) > 0 for p in parameters):
        parameters = _SArray([None] * len(parameters), dtype=dict)

    evaluations = _SArray(evaluations, dtype=dict)
    parameters = _SArray(parameters, dtype=dict)
    metadatas = _SArray(metadatas, dtype=dict)
    summary = _SFrame({
        'metric': evaluations,
        'metadata': metadatas,
        'parameters': parameters,
    })
    return _OrderedDict([('models', models),
                         ('summary', summary),
                         ('status', status)])
def _combine_mps_tasks(**tasks):
    """Concatenate the output of every completed task.

    Tasks whose value is None (an exception occurred) count as failed.
    Returns an OrderedDict of 'models', a 'summary' SFrame holding the
    metric/metadata/parameters dicts, and a 'status' dict of counts.
    """
    completed = [t for t in tasks.values() if t is not None]
    status = {
        'Failed': len(tasks) - len(completed),
        'Completed': len(completed),
    }
    models = [t['model'] for t in completed]
    evaluations = [t['evaluation'] for t in completed]
    parameters = [t['parameters'] for t in completed]
    metadatas = [t['metadata'] for t in completed]

    if all(m is None for m in models):
        models = None
    if all(p is None or len(p) == 0 for p in parameters):
        parameters = _SArray([None] * len(parameters), dtype=dict)

    summary = _SFrame({
        'metric': _SArray(evaluations, dtype=dict),
        'metadata': _SArray(metadatas, dtype=dict),
        'parameters': _SArray(parameters, dtype=dict),
    })
    return _OrderedDict(
        [('models', models), ('summary', summary), ('status', status)])
def _pack_dicts_to_sframe(values, column_name):
    """Pack per-task result values into an SFrame, one row per task.

    If every value is a simple type it is kept as a single column named
    *column_name*; otherwise the values (expected dicts) are unpacked
    into one column per key.
    """
    if all(type(v) in (int, float, str, list, type(None)) for v in values):
        return _SFrame({column_name: values})
    return _SArray(values).unpack(column_name_prefix=None)


def _combiner(**tasks):
    """
    Take the return values from each task, and return the combined result.

    The combined result is an OrderedDict: 'models' maps to the list of
    trained models (or None if every model is None), and 'summary' maps to
    an SFrame containing the searched parameters, evaluation metrics and
    metadata, one row per model. Tasks whose value is None (an exception
    occurred) are skipped.
    """
    # Concatenate output from all the tasks.
    models = []
    evaluations = []
    parameters = []
    metadatas = []
    for t in tasks.values():
        if t is not None:  # If an exception occurred, t is None
            models.append(t['model'])
            evaluations.append(t['evaluation'])
            parameters.append(t['parameters'])
            metadatas.append(t['metadata'])
    if all(m is None for m in models):
        models = None

    # SFrame containing all the evaluation results, one row per model.
    evaluation_sframe = _pack_dicts_to_sframe(evaluations, 'metric')
    # SFrame containing all metadata, one row per model.
    metadata_sframe = _pack_dicts_to_sframe(metadatas, 'metadata')

    # SFrame containing all the tuning parameters, one row per model.
    # A run with no parameters at all gets a single all-None column.
    if all(x is None or len(x) == 0 for x in parameters):
        parameter_sframe = _SFrame({'parameters': [None] * len(parameters)})
    else:
        parameter_sframe = _SArray(parameters).unpack(column_name_prefix=None)

    # Make a summary sframe concatenating horizontally the evaluation,
    # parameter and metadata columns (each group sorted by column name).
    summary_sframe = _SFrame()
    param_columns = sorted(parameter_sframe.column_names())
    metric_columns = sorted(evaluation_sframe.column_names())
    metadata_columns = sorted(metadata_sframe.column_names())
    summary_sframe[param_columns] = parameter_sframe[param_columns]
    summary_sframe[metric_columns] = evaluation_sframe[metric_columns]
    summary_sframe[metadata_columns] = metadata_sframe[metadata_columns]
    return _OrderedDict([('models', models), ('summary', summary_sframe)])
def _combine_sframes(summaries):
    """Append a sequence of summary SFrames into one.

    The result always has 'metadata', 'metric' and 'parameters' columns of
    dict type (created in that order), even when *summaries* is empty.
    """
    combined = _SFrame()
    for column in ('metadata', 'metric', 'parameters'):
        combined[column] = _SArray(dtype=dict)
    for frame in summaries:
        combined = combined.append(frame)
    return combined
def _raise_if_evaluator_return_is_not_packable(eval_result): if type(eval_result) in (int, float, str, list, _array): return try: _SArray([eval_result]).unpack(column_name_prefix=None) except: raise ValueError('Return of the evaluator must be a dict ' 'with simple types.')
def _combiner(**tasks):
    """
    Combine the per-task return values into a single result.

    Builds an OrderedDict whose 'models' entry is the list of trained
    models (None if every model is None) and whose 'summary' entry is an
    SFrame with the searched parameters, evaluation metrics and metadata,
    one row per model. Tasks whose value is None (an exception occurred)
    are skipped.
    """
    _SIMPLE = (int, float, str, list, type(None))
    succeeded = [t for t in tasks.values() if t is not None]
    models = [t['model'] for t in succeeded]
    evaluations = [t['evaluation'] for t in succeeded]
    parameters = [t['parameters'] for t in succeeded]
    metadatas = [t['metadata'] for t in succeeded]

    if all(m is None for m in models):
        models = None

    # Evaluation results: one row per model; dict results are unpacked
    # into one column per key.
    if all(type(e) in _SIMPLE for e in evaluations):
        evaluation_sframe = _SFrame({'metric': evaluations})
    else:
        evaluation_sframe = _SArray(evaluations).unpack(
            column_name_prefix=None)

    # Metadata: same packing rule as the evaluations.
    if all(type(m) in _SIMPLE for m in metadatas):
        metadata_sframe = _SFrame({'metadata': metadatas})
    else:
        metadata_sframe = _SArray(metadatas).unpack(column_name_prefix=None)

    # Tuning parameters: an all-empty search gets a single None column.
    if all(p is None or len(p) == 0 for p in parameters):
        parameter_sframe = _SFrame({'parameters': [None] * len(parameters)})
    else:
        parameter_sframe = _SArray(parameters).unpack(
            column_name_prefix=None)

    # Concatenate the three SFrames horizontally, parameters first, each
    # group of columns sorted by name.
    summary_sframe = _SFrame()
    for frame in (parameter_sframe, evaluation_sframe, metadata_sframe):
        columns = sorted(frame.column_names())
        summary_sframe[columns] = frame[columns]
    return _OrderedDict([('models', models), ('summary', summary_sframe)])
def count_words(sa, to_lower=True, delimiters=["\r", "\v", "\n", "\f", "\t", " "]):
    """
    count_words(sa, to_lower=True, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "])

    Count the words in each element of an SArray of strings, returning an
    SArray of dictionary type. Each output dictionary maps the words found
    in the corresponding input entry to the number of times they appear.
    By default, words are split on all whitespace and newline characters.
    The output is commonly known as the "bag-of-words" representation of
    text data.

    Parameters
    ----------
    sa : SArray[str]
        Input data.

    to_lower : bool, optional
        If True, all words are converted to lower case before counting.

    delimiters : list[string], optional
        Input strings are delimited using characters in this list. Each
        entry in this list must contain a single character.

    Returns
    -------
    out : SArray[dict]
        Each entry contains a dictionary with the frequency count of each
        word in the corresponding input entry.

    See Also
    --------
    count_ngrams

    References
    ----------
    - `Bag of words model <http://en.wikipedia.org/wiki/Bag-of-words_model>`_

    Examples
    --------
    >>> sa = graphlab.SArray(["The quick brown fox jumps.",
    ...                       "Word word WORD, word!!!word"])
    >>> graphlab.text_analytics.count_words(sa)
    dtype: dict
    Rows: 2
    [{'quick': 1, 'brown': 1, 'jumps': 1, 'fox': 1, 'the': 1},
     {'word': 2, 'word,': 1, 'word!!!word': 1}]
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.count_words')

    # Validate inputs.
    if sa.dtype() != str:
        raise TypeError("Only string type SArrays are supported for counting words.")
    if not all(len(delim) == 1 for delim in delimiters):
        raise ValueError("Delimiters must be single-character strings.")

    # Delegate the actual counting to the C++ proxy.
    options = {'to_lower': to_lower, 'delimiters': delimiters}
    return _SArray(_proxy=sa.__proxy__.count_bag_of_words(options))
def count_ngrams(sa, n=2, method="word", to_lower=True, ignore_space=True):
    """
    Return an SArray of ``dict`` type where each element contains the count
    for each of the n-grams that appear in the corresponding input element.
    The n-grams can be specified to be either character n-grams or word
    n-grams. The input SArray must contain strings.

    Parameters
    ----------
    sa : SArray[str]
        Input text data.

    n : int, optional
        The number of words in each n-gram. An ``n`` value of 1 returns
        word counts.

    method : {'word', 'character'}, optional
        If "word", the function performs a count of word n-grams. If
        "character", does a character n-gram count.

    to_lower : bool, optional
        If True, all words are converted to lower case before counting.

    ignore_space : bool, optional
        If method is "character", indicates if spaces between words are
        counted as part of the n-gram. For instance, with the input SArray
        element of "fun games", if this parameter is set to False one
        tri-gram would be 'n g'. If ``ignore_space`` is set to True, there
        would be no such tri-gram (there would still be 'nga'). This
        parameter has no effect if the method is set to "word".

    Returns
    -------
    out : SArray
        An SArray of dictionary type, where each key is the n-gram string
        and each value is its count.

    See Also
    --------
    count_words

    Notes
    -----
    - Ignoring case (with ``to_lower``) involves a full string copy of the
      SArray data. To increase speed for large documents, set ``to_lower``
      to False.
    - Punctuation and spaces are both delimiters when counting word
      n-grams. When counting character n-grams, punctuation is always
      ignored.

    References
    ----------
    - `N-gram wikipedia article <http://en.wikipedia.org/wiki/N-gram>`_

    Examples
    --------
    Counting word n-grams:

    >>> sa = graphlab.SArray(['I like big dogs. I LIKE BIG DOGS.'])
    >>> graphlab.text_analytics.count_ngrams(sa, 3)
    dtype: dict
    Rows: 1
    [{'big dogs i': 1, 'like big dogs': 2, 'dogs i like': 1,
      'i like big': 2}]

    Counting character n-grams:

    >>> sa = graphlab.SArray(['Fun. Is. Fun'])
    >>> graphlab.text_analytics.count_ngrams(sa, 3, "character")
    dtype: dict
    Rows: 1
    [{'fun': 2, 'nis': 1, 'sfu': 1, 'isf': 1, 'uni': 1}]
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.count_ngrams')

    # Validate inputs.
    if sa.dtype() != str:
        raise TypeError("Only string type SArrays are supported for counting n-grams.")
    if not isinstance(n, int):
        raise TypeError("Input 'n' must be of type int.")
    if n < 1:
        raise ValueError("Input 'n' must be greater than 0")
    if n > 5 and method == 'word':
        warnings.warn("It is unusual for n-grams to be of size larger than 5.")

    # Dispatch to the appropriate proxy call for the chosen method.
    options = {'to_lower': to_lower, 'ignore_space': ignore_space}
    if method == 'word':
        return _SArray(_proxy=sa.__proxy__.count_ngrams(n, options))
    if method == 'character':
        return _SArray(_proxy=sa.__proxy__.count_character_ngrams(n, options))
    raise ValueError("Invalid 'method' input value. Please input " +
                     "either 'word' or 'character' ")