Example #1
0
def evaluate_topic_models(data,
                          varying_parameters,
                          constant_parameters=None,
                          n_max_processes=None,
                          return_models=False,
                          metric=None,
                          **metric_kwargs):
    """
    Evaluate several topic models in parallel via the "gensim" package.

    Fit one model per parameter set in `varying_parameters`, all on the single
    Document-Term-Matrix `data` (NumPy array/matrix or SciPy sparse matrix),
    merging the `constant_parameters` dict into every model calculation. At
    most `n_max_processes` worker processes are spawned; when None is passed,
    all available processors are used.

    Returns a list of size `len(varying_parameters)` containing tuples
    `(parameter_set, eval_results)` where `parameter_set` is a dict of the used
    parameters and `eval_results` is a dict of metric names -> metric results.
    """
    # Fall back to the default metric selection when no metric was requested.
    selected_metrics = metric if metric else DEFAULT_METRICS

    runner = MultiprocEvaluationRunner(
        MultiprocEvaluationWorkerGensim,
        AVAILABLE_METRICS,
        data,
        varying_parameters,
        constant_parameters,
        metric=selected_metrics,
        metric_options=metric_kwargs,
        n_max_processes=n_max_processes,
        return_models=return_models,
    )

    return runner.run()
Example #2
0
def evaluate_topic_models(data,
                          varying_parameters,
                          constant_parameters=None,
                          n_max_processes=None,
                          return_models=False,
                          metric=None,
                          **metric_kwargs):
    """
    Compute several topic models in parallel using the "gensim" package.

    One model is calculated per parameter set in `varying_parameters`, each on
    the same Document-Term-Matrix `data`; the `constant_parameters` dict is
    passed along to every model calculation. At most `n_max_processes`
    processors are used, or all available processors if None is passed.

    `data` must be a Document-Term-Matrix (NumPy array/matrix, SciPy sparse
    matrix).

    The result is a list of size `len(varying_parameters)` containing tuples
    `(parameter_set, eval_results)` where `parameter_set` is a dict of the used
    parameters and `eval_results` is a dict of metric names -> metric results:

    .. code-block:: text

        [(parameter_set_1, {'<metric_name>': result_1, ...}),
         ...,
         (parameter_set_n, {'<metric_name>': result_n, ...})])

    .. seealso:: Results can be simplified using
                 :func:`tmtoolkit.topicmod.evaluate.results_by_parameter`.

    :param data: a (sparse) 2D array/matrix
    :param varying_parameters: list of dicts with parameters; each parameter
                               set will be used in a separate evaluation
    :param constant_parameters: dict with parameters that are the same for all
                                parallel computations
    :param n_max_processes: maximum number of worker processes to spawn
    :param return_models: if True, also return the computed models in the
                          evaluation results
    :param metric: string or list of strings; if given, use only this metric(s)
                   for evaluation; must be subset of `available_metrics`
    :param metric_kwargs: dict of options for used metric(s)
    :return: list of evaluation results for each varying parameter set as
             described above
    """
    # Collect the keyword options for the runner; an empty/None `metric`
    # selects the package's default metric set.
    runner_opts = dict(
        metric=metric or DEFAULT_METRICS,
        metric_options=metric_kwargs,
        n_max_processes=n_max_processes,
        return_models=return_models,
    )

    return MultiprocEvaluationRunner(MultiprocEvaluationWorkerGensim,
                                     AVAILABLE_METRICS,
                                     data,
                                     varying_parameters,
                                     constant_parameters,
                                     **runner_opts).run()