示例#1
0
def _combine_mps_tasks(**tasks):
    """
    Merge the outputs of completed MPS tasks into a single result.

    Parameters
    ----------
    tasks : dict
        Each value is either None (the task raised an exception) or a dict
        with keys 'model', 'evaluation', 'parameters' and 'metadata'.

    Returns
    -------
    out : OrderedDict
        Keys 'models' (list, or None when every model is None), 'summary'
        (an SFrame with 'metric', 'metadata' and 'parameters' columns, one
        row per completed task) and 'status' (Completed/Failed counts).
    """
    models = []
    evaluations = []
    parameters = []
    metadatas = []
    status = {'Failed': 0, 'Completed': 0}
    for t in tasks.values():
        if t is not None:  # If an exception occurred, t is None
            models.append(t['model'])
            evaluations.append(t['evaluation'])
            parameters.append(t['parameters'])
            metadatas.append(t['metadata'])
            status['Completed'] += 1
        else:
            status['Failed'] += 1

    if all(m is None for m in models):
        models = None

    # If every parameters entry is missing/empty, substitute None rows.
    # Keep this as a plain list so the _SArray wrap below is applied exactly
    # once (the original wrapped the all-empty case in _SArray twice).
    if all(x is None or len(x) == 0 for x in parameters):
        parameters = [None] * len(parameters)

    evaluations = _SArray(evaluations, dtype=dict)
    parameters = _SArray(parameters, dtype=dict)
    metadatas = _SArray(metadatas, dtype=dict)

    summary = _SFrame({
        'metric': evaluations,
        'metadata': metadatas,
        'parameters': parameters
    })

    return _OrderedDict([('models', models), ('summary', summary),
                         ('status', status)])
def _combine_mps_tasks(**tasks):
    """Concatenate the outputs of all completed tasks into one result."""
    status = {'Failed': 0, 'Completed': 0}
    completed = []
    for task in tasks.values():
        # A value of None means the task raised an exception.
        if task is None:
            status['Failed'] += 1
        else:
            status['Completed'] += 1
            completed.append(task)

    models = [t['model'] for t in completed]
    evaluations = [t['evaluation'] for t in completed]
    parameters = [t['parameters'] for t in completed]
    metadatas = [t['metadata'] for t in completed]

    # Collapse an all-None model list down to a single None.
    if not any(m is not None for m in models):
        models = None
    # If no task produced parameters, use one None row per task.
    if all(p is None or len(p) == 0 for p in parameters):
        parameters = _SArray([None] * len(parameters), dtype=dict)
    evaluations = _SArray(evaluations, dtype=dict)
    parameters = _SArray(parameters, dtype=dict)
    metadatas = _SArray(metadatas, dtype=dict)

    summary = _SFrame({'metric': evaluations,
                       'metadata': metadatas,
                       'parameters': parameters})

    return _OrderedDict([('models', models),
                         ('summary', summary),
                         ('status', status)])
def _combiner(**tasks):
    """
    Take the return values from each task, and return
    the combined result.

    The combined result is a tuple, where the first
    element is a list of models, and the second
    sframe is a summary sframe containing
    the searched parameters and the evaluation result.
    """
    simple_types = (int, float, str, list, type(None))

    # Gather the fields of every task that finished; None marks a task
    # that raised an exception and is skipped.
    models, evaluations, parameters, metadatas = [], [], [], []
    for task in tasks.values():
        if task is None:
            continue
        models.append(task['model'])
        evaluations.append(task['evaluation'])
        parameters.append(task['parameters'])
        metadatas.append(task['metadata'])

    if all(m is None for m in models):
        models = None

    def _to_sframe(values, column_name):
        # Scalar/list values become a single named column; dict-like values
        # are unpacked into one column per key.
        if all(type(v) in simple_types for v in values):
            return _SFrame({column_name: values})
        return _SArray(values).unpack(column_name_prefix=None)

    # One row per model for evaluation results and metadata.
    evaluation_sframe = _to_sframe(evaluations, 'metric')
    metadata_sframe = _to_sframe(metadatas, 'metadata')

    # One row per model for the tuning parameters; all-empty parameter
    # lists collapse to a column of Nones.
    if all(p is None or len(p) == 0 for p in parameters):
        parameter_sframe = _SFrame({'parameters': [None] * len(parameters)})
    else:
        parameter_sframe = _SArray(parameters).unpack(column_name_prefix=None)

    # Horizontally concatenate parameters, metrics and metadata; within
    # each group the columns are added in sorted order.
    summary_sframe = _SFrame()
    for frame in (parameter_sframe, evaluation_sframe, metadata_sframe):
        columns = sorted(frame.column_names())
        summary_sframe[columns] = frame[columns]
    return _OrderedDict([('models', models), ('summary', summary_sframe)])
 def _combine_sframes(summaries):
     """Append a sequence of summary SFrames into a single SFrame."""
     # Seed an empty frame with the expected dict-typed columns so the
     # appends below have a schema to line up against.
     combined = _SFrame()
     for column in ('metadata', 'metric', 'parameters'):
         combined[column] = _SArray(dtype=dict)
     for piece in summaries:
         combined = combined.append(piece)
     return combined
def _raise_if_evaluator_return_is_not_packable(eval_result):
    if type(eval_result) in (int, float, str, list, _array):
        return
    try:
        _SArray([eval_result]).unpack(column_name_prefix=None)
    except:
        raise ValueError('Return of the evaluator must be a dict '
                         'with simple types.')
示例#6
0
 def _combine_sframes(summaries):
     """Concatenate summary SFrames while fixing the column schema."""
     # Start from an empty frame whose three columns are dict-typed; each
     # incoming summary is appended onto it in order.
     result = _SFrame()
     result['metadata'] = _SArray(dtype=dict)
     result['metric'] = _SArray(dtype=dict)
     result['parameters'] = _SArray(dtype=dict)
     for summary_piece in summaries:
         result = result.append(summary_piece)
     return result
示例#7
0
def _raise_if_evaluator_return_is_not_packable(eval_result):
    if type(eval_result) in (int, float, str, list, _array):
        return
    try:
        _SArray([eval_result]).unpack(column_name_prefix=None)
    except:
        raise ValueError('Return of the evaluator must be a dict '
                         'with simple types.')
示例#8
0
def _combiner(**tasks):
    """
    Take the return values from each task, and return
    the combined result.

    The combined result is a tuple, where the first
    element is a list of models, and the second
    sframe is a summary sframe containing
    the searched parameters and the evaluation result.
    """
    # Concatenate output from all the tasks.
    # Each task value is either None (the task raised an exception) or a
    # dict with 'model', 'evaluation', 'parameters' and 'metadata' keys.
    models = []
    evaluations = []
    parameters = []
    metadatas = []
    for t in tasks.values():
        if t is not None:  # If an exception occurred, t is None
            models.append(t['model'])
            evaluations.append(t['evaluation'])
            parameters.append(t['parameters'])
            metadatas.append(t['metadata'])

    # Collapse an all-None (or empty) model list down to a single None.
    if all(m is None for m in models):
        models = None

    # SFrame contains all the evaluation results, one row per model.
    # Simple scalar/list results become a single 'metric' column; dict-like
    # results are unpacked into one column per key.
    if all(
            type(x) in (int, float, str, list, type(None))
            for x in evaluations):
        evaluation_sframe = _SFrame({'metric': evaluations})
    else:
        evaluation_sframe = _SArray(evaluations).unpack(
            column_name_prefix=None)

    # SFrame contains all metadata, one row per model (same unpack rule).
    if all(type(x) in (int, float, str, list, type(None)) for x in metadatas):
        metadata_sframe = _SFrame({'metadata': metadatas})
    else:
        metadata_sframe = _SArray(metadatas).unpack(column_name_prefix=None)

    # SFrame contains all the tuning parameters, one row per model;
    # all-empty parameter lists collapse to a column of Nones.
    if all(x is None or len(x) == 0 for x in parameters):
        parameter_sframe = _SFrame({'parameters': [None] * len(parameters)})
    else:
        parameter_sframe = _SArray(parameters).unpack(column_name_prefix=None)

    # Make a summary sframe concatenating horizontally the evaluation_sframe,
    # parameter_sframe and metadata_sframe; within each group the columns
    # are added in sorted order.
    summary_sframe = _SFrame()
    param_columns = sorted(parameter_sframe.column_names())
    metric_columns = sorted(evaluation_sframe.column_names())
    metadata_columns = sorted(metadata_sframe.column_names())
    summary_sframe[param_columns] = parameter_sframe[param_columns]
    summary_sframe[metric_columns] = evaluation_sframe[metric_columns]
    summary_sframe[metadata_columns] = metadata_sframe[metadata_columns]
    return _OrderedDict([('models', models), ('summary', summary_sframe)])
示例#9
0
def count_words(sa, to_lower=True, delimiters=None):
    """
    count_words(sa, to_lower=True, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "])

    Count words in the SArray. Return an SArray of dictionary type. The keys in
    each output dictionary are the words in the corresponding input data entry,
    and the values are the number of times the words appears. By default, words
    are split on all whitespace and newline characters. The output is commonly
    known as the "bag-of-words" representation of text data.

    Parameters
    ----------
    sa : SArray[str]
        Input data.

    to_lower : bool, optional
        If True, all words are converted to lower case before counting.

    delimiters : list[string], optional
        Input strings are delimited using characters in this list. Each entry in
        this list must contain a single character. Defaults to all whitespace
        and newline characters.

    Returns
    -------
    out : SArray[dict]
        Each entry contains a dictionary with the frequency count of each word
        in the corresponding input entry.

    See Also
    --------
    count_ngrams

    References
    ----------
    - `Bag of words model <http://en.wikipedia.org/wiki/Bag-of-words_model>`_

    Examples
    --------
    >>> sa = graphlab.SArray(["The quick brown fox jumps.",
    ...                       "Word word WORD, word!!!word"])
    >>> graphlab.text_analytics.count_words(sa)
    dtype: dict
    Rows: 2
    [{'quick': 1, 'brown': 1, 'jumps': 1, 'fox': 1, 'the': 1},
     {'word': 2, 'word,': 1, 'word!!!word': 1}]
    """
    # Use a None sentinel instead of a mutable list default: default
    # argument objects are shared across calls.
    if delimiters is None:
        delimiters = ["\r", "\v", "\n", "\f", "\t", " "]

    _mt._get_metric_tracker().track('toolkit.text_analytics.count_words')

    ## Validate inputs
    if sa.dtype() != str:
        raise TypeError("Only string type SArrays are supported for counting words.")

    if not all(len(delim) == 1 for delim in delimiters):
        raise ValueError("Delimiters must be single-character strings.")

    ## Compute word counts
    options = {'to_lower': to_lower,
               'delimiters': delimiters}

    return _SArray(_proxy=sa.__proxy__.count_bag_of_words(options))
示例#10
0
def count_ngrams(sa, n=2, method="word", to_lower=True, ignore_space=True):
    """
    Return an SArray of ``dict`` type where each element contains the count
    for each of the n-grams that appear in the corresponding input element.
    The n-grams can be specified to be either character n-grams or word
    n-grams.  The input SArray must contain strings.

    Parameters
    ----------
    sa : SArray[str]
        Input text data.

    n : int, optional
      The number of words in each n-gram. An ``n`` value of 1 returns word
      counts.

    method : {'word', 'character'}, optional
      If "word", the function performs a count of word n-grams. If
      "character", does a character n-gram count.

    to_lower : bool, optional
      If True, all words are converted to lower case before counting.

    ignore_space : bool, optional
      If method is "character", indicates if spaces between words are
      counted as part of the n-gram. For instance, with the input SArray
      element of "fun games", if this parameter is set to False one
      tri-gram would be 'n g'. If ``ignore_space`` is set to True, there
      would be no such tri-gram (there would still be 'nga'). This
      parameter has no effect if the method is set to "word".

    Returns
    -------
    out : SArray
      An SArray of dictionary type, where each key is the n-gram string
      and each value is its count.

    See Also
    --------
    count_words

    Notes
    -----
    - Ignoring case (with ``to_lower``) involves a full string copy of the
      SArray data. To increase speed for large documents, set ``to_lower`` to
      False.

    - Punctuation and spaces are both delimiters when counting word n-grams.
      When counting character n-grams, punctuation is always ignored.

    References
    ----------
    - `N-gram wikipedia article <http://en.wikipedia.org/wiki/N-gram>`_

    Examples
    --------
    Counting word n-grams:

    >>> sa = graphlab.SArray(['I like big dogs. I LIKE BIG DOGS.'])
    >>> graphlab.text_analytics.count_ngrams(sa, 3)
    dtype: dict
    Rows: 1
    [{'big dogs i': 1, 'like big dogs': 2, 'dogs i like': 1, 'i like big': 2}]

    Counting character n-grams:

    >>> sa = graphlab.SArray(['Fun. Is. Fun'])
    >>> graphlab.text_analytics.count_ngrams(sa, 3, "character")
    dtype: dict
    Rows: 1
    [{'fun': 2, 'nis': 1, 'sfu': 1, 'isf': 1, 'uni': 1}]
    """

    _mt._get_metric_tracker().track('toolkit.text_analytics.count_ngrams')

    ## Validate inputs
    if sa.dtype() != str:
        raise TypeError("Only string type SArrays are supported for counting n-grams.")

    if not isinstance(n, int):
        raise TypeError("Input 'n' must be of type int.")

    if n < 1:
        raise ValueError("Input 'n' must be greater than 0")

    # Large word n-grams are rarely useful; warn but do not fail.
    if n > 5 and method == 'word':
        warnings.warn("It is unusual for n-grams to be of size larger than 5.")

    ## Compute n-gram counts
    options = {'to_lower': to_lower,
               'ignore_space': ignore_space}

    # Dispatch to the appropriate native counter based on the method.
    if method == 'word':
        result = _SArray(_proxy=sa.__proxy__.count_ngrams(n, options))
    elif method == 'character':
        result = _SArray(_proxy=sa.__proxy__.count_character_ngrams(n, options))
    else:
        raise ValueError("Invalid 'method' input  value. Please input " +
                         "either 'word' or 'character' ")

    return result