예제 #1
0
    def __init__(self,
                 factory,
                 parameter_sets,
                 name,
                 strategy=None,
                 environment=None,
                 return_model=True):
        """
        Constructor for a ModelSearchJob.

        Stores the search configuration on the instance, records the largest
        model id found in the parameter sets, splits the parameter sets into
        batches and creates one map job per batch (kept in ``self.jobs``).

        Parameters
        ----------
        factory : object
            Passed unchanged as the first argument of ``_map_job.create`` for
            every batch. Presumably the callable that builds one model;
            confirm against the map-job API.

        parameter_sets : list
            Sequence supporting ``len`` and slicing. Every element must
            provide ``element['metadata']['model_id']``.

        name : str
            Prefix of each job name; the zero-padded five-digit batch index
            is appended directly to it (no separator).

        strategy : object, optional
            Only stored on the instance by this constructor.

        environment : object, optional
            Stored on the instance and forwarded to ``_map_job.create``.

        return_model : bool, optional
            Only stored on the instance by this constructor.

        Raises
        ------
        KeyError
            If a parameter set is a dict lacking ``'metadata'`` or
            ``'model_id'``.
        """

        self.factory = factory
        self.parameter_sets = parameter_sets
        self.name = name
        self.strategy = strategy
        self.return_model = return_model
        self.environment = environment

        # Largest model id among the parameter sets. Starts at 0, so the
        # result is 0 for an empty input or when no id exceeds 0.
        max_model_id = 0
        for ps in parameter_sets:
            max_model_id = max(ps['metadata']['model_id'], max_model_id)
        self.max_model_id = max_model_id

        # Tuning parameter for dividing jobs into batches: about a third of
        # the parameter sets per batch, but never fewer than 10.
        batch_size = max(10, int(math.ceil(len(parameter_sets) / 3.0)))

        # Successive batch_size-sized slices of parameter_sets. ``range`` is
        # used instead of ``xrange`` so this also runs on Python 3; only a
        # handful of start offsets are produced, so the list that ``range``
        # builds on Python 2 is negligible.
        parameter_batches = [
            parameter_sets[start:start + batch_size]
            for start in range(0, len(parameter_sets), batch_size)
        ]

        # Construct one map job per batch.
        self.jobs = []
        for i, parameter_batch in enumerate(parameter_batches):
            job_name = name + '%05d' % i
            job = _map_job.create(factory,
                                  parameter_batch,
                                  name=job_name,
                                  environment=environment,
                                  combiner_function=_combine_mps_tasks)
            self.jobs.append(job)
    def __init__(self, factory,
                 parameter_sets,
                 name,
                 strategy=None,
                 environment=None,
                 return_model=True):
        """
        Constructor for a ModelSearchJob.

        Keeps the arguments on the instance, computes ``self.max_model_id``
        from the parameter sets, partitions the parameter sets into batches
        and creates one map job per batch, collected in ``self.jobs``.

        Parameters
        ----------
        factory : object
            First positional argument of every ``_map_job.create`` call.
            Presumably the model-building callable; verify against the
            map-job API.

        parameter_sets : list
            Must support ``len`` and slicing; each item is indexed as
            ``item['metadata']['model_id']``.

        name : str
            Job-name prefix. The batch index, formatted with ``'%05d'``, is
            concatenated to it without a separator.

        strategy : object, optional
            Stored as ``self.strategy``; otherwise unused here.

        environment : object, optional
            Stored as ``self.environment`` and passed to each
            ``_map_job.create`` call.

        return_model : bool, optional
            Stored as ``self.return_model``; otherwise unused here.

        Raises
        ------
        KeyError
            If a dict parameter set has no ``'metadata'`` entry or its
            metadata has no ``'model_id'`` entry.
        """

        self.factory = factory
        self.parameter_sets = parameter_sets
        self.name = name
        self.strategy = strategy
        self.return_model = return_model
        self.environment = environment

        # Running maximum of the model ids, seeded with 0 so that an empty
        # list of parameter sets yields 0.
        highest_id = 0
        for ps in parameter_sets:
            highest_id = max(ps['metadata']['model_id'], highest_id)
        self.max_model_id = highest_id

        # Tuning parameter for dividing jobs into batches: a third of the
        # input (rounded up) with a floor of 10 parameter sets per batch.
        batch_size = max(10, int(math.ceil(len(parameter_sets) / 3.0)))

        # Cut parameter_sets into consecutive slices of batch_size items.
        # ``range`` replaces the Python 2-only ``xrange`` so the constructor
        # does not fail with a NameError on Python 3; the number of offsets
        # is tiny, so materialising them on Python 2 costs nothing.
        offsets = range(0, len(parameter_sets), batch_size)
        parameter_batches = [parameter_sets[lo:lo + batch_size]
                             for lo in offsets]

        # Construct jobs, one per batch, named <name><5-digit batch index>.
        self.jobs = []
        for i, parameter_batch in enumerate(parameter_batches):
            job_name = name + '%05d' % i
            job = _map_job.create(factory, parameter_batch,
                                  name=job_name,
                                  environment=environment,
                                  combiner_function=_combine_mps_tasks)
            self.jobs.append(job)
def cross_val_score(datasets,
                    model_factory,
                    model_parameters,
                    evaluator=_default_evaluator,
                    environment=None,
                    return_model=True):
    """
    Evaluate model performance via cross validation for a given set of
    parameters.

    Parameters
    ----------
    {param_data}
    {param_model_factory}

    model_parameters : dict
        The params argument takes a dictionary containing parameters that will
        be passed to the provided model factory.

    {param_evaluator}
    {param_environment}
    {param_return_model}
    {param_returns}

    See Also
    --------
    graphlab.toolkits.model_parameter_search.create

    Examples
    --------
    >>> url = 'http://s3.amazonaws.com/gl-testdata/xgboost/mushroom.csv'
    >>> data = gl.SFrame.read_csv(url)
    >>> data['label'] = (data['label'] == 'p')
    >>> folds = gl.cross_validation.KFold(data, 5)
    >>> params = dict([('target', 'label'), ('max_depth', 5)])
    >>> job = gl.cross_validation.cross_val_score(folds,
                                                  gl.boosted_trees_classifier.create,
                                                  params)
    >>> print job.get_results()
    """
    # NOTE(review): the docstring above looks like a format template (the
    # brace placeholders are presumably filled in elsewhere), so it is kept
    # verbatim and must not gain literal braces -- confirm before editing.
    _get_metric_tracker().track('cross_validation.cross_val_score')

    # Normalise the input: a single SFrame becomes one (train, None) pair, a
    # 2-tuple becomes a single train/test pair, anything else is used as is.
    folds = datasets
    if isinstance(datasets, _graphlab.SFrame):
        folds = [(datasets, None)]
    elif isinstance(datasets, tuple):
        if len(datasets) != 2:
            raise ValueError("Provided dataset tuple must be train/test pair.")
        folds = [datasets]

    # Everything that is not already a KFold is wrapped into one.
    if not isinstance(folds, KFold):
        folds = KFold.from_list(folds)

    num_folds = folds.num_folds
    # The fold id is only recorded when there is more than one fold.
    tag_with_fold = num_folds > 1

    model_factory = _check_if_sklearn_factory(model_factory, model_parameters)

    # One task description per fold. Model ids are assigned sequentially from
    # zero, so they coincide with the fold index.
    params = []
    for fold_id in range(num_folds):
        metadata = {'model_id': fold_id}
        if tag_with_fold:
            metadata['fold_id'] = fold_id

        params.append(dict(model_factory=model_factory,
                           model_parameters=model_parameters,
                           folds=folds,
                           evaluator=evaluator,
                           return_model=return_model,
                           metadata=metadata))

    # Job name: timestamp plus the first eight characters of a hash over the
    # folds object's id and the current time.
    timestamp = _datetime.now().strftime('%b-%d-%Y-%H-%M-%S-%f')
    time_token = "%.21f" % _time()
    random_hash = str(hash((id(folds), time_token)))[:8]
    job_name = "Cross-Validation-%s-%s" % (timestamp, random_hash)

    return _map_job.create(_train_test_model,
                           parameter_set=params,
                           name=job_name,
                           environment=environment,
                           combiner_function=_combiner)
def cross_val_score(datasets,
                    model_factory,
                    model_parameters,
                    evaluator=_default_evaluator,
                    environment=None,
                    return_model=True):
    """
    Evaluate model performance via cross validation for a given set of
    parameters.

    Parameters
    ----------
    {param_data}
    {param_model_factory}

    model_parameters : dict
        The params argument takes a dictionary containing parameters that will
        be passed to the provided model factory.

    {param_evaluator}
    {param_environment}
    {param_return_model}
    {param_returns}

    See Also
    --------
    graphlab.toolkits.model_parameter_search.create

    Examples
    --------
    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = gl.SFrame.read_csv(url)
    >>> data['label'] = (data['label'] == 'p')
    >>> folds = gl.cross_validation.KFold(data, 5)
    >>> params = dict([('target', 'label'), ('max_depth', 5)])
    >>> job = gl.cross_validation.cross_val_score(folds,
                                                  gl.boosted_trees_classifier.create,
                                                  params)
    >>> print job.get_results()
    """
    # NOTE(review): the docstring is kept verbatim; its brace placeholders
    # suggest it is run through string formatting elsewhere -- confirm before
    # adding any literal braces to it.
    _get_metric_tracker().track('cross_validation.cross_val_score')

    # Classify the input once. The SFrame check takes precedence over the
    # tuple check, and a tuple must be exactly a train/test pair.
    is_sframe = isinstance(datasets, _graphlab.SFrame)
    is_pair = (not is_sframe) and isinstance(datasets, tuple)
    if is_pair and len(datasets) != 2:
        raise ValueError("Provided dataset tuple must be train/test pair.")

    if is_sframe:
        folds = [(datasets, None)]    # training data only, no test set
    elif is_pair:
        folds = [datasets]            # a single train/test split
    else:
        folds = datasets              # already a collection of folds

    # Wrap plain collections into a KFold object.
    if not isinstance(folds, KFold):
        folds = KFold.from_list(folds)

    num_folds = folds.num_folds
    include_fold_id = num_folds > 1

    def _task_for(fold_id):
        # Model ids run 0, 1, 2, ... in step with the fold index; the fold id
        # is only attached when several folds exist.
        metadata = {'model_id': fold_id}
        if include_fold_id:
            metadata['fold_id'] = fold_id

        # Called once per fold, exactly as before; not hoisted because it is
        # not visible here whether each call returns a fresh object.
        factory = _check_if_sklearn_factory(model_factory, model_parameters)

        return {
            'model_factory': factory,
            'model_parameters': model_parameters,
            'folds': folds,
            'evaluator': evaluator,
            'return_model': return_model,
            'metadata': metadata,
        }

    params = [_task_for(fold_id) for fold_id in range(num_folds)]

    # Name the job after the current timestamp plus an eight-character slice
    # of a hash over the folds object's id and the current time.
    now = _datetime.now().strftime('%b-%d-%Y-%H-%M-%S-%f')
    hash_input = (id(folds), "%.21f" % _time())
    random_hash = str(hash(hash_input))[:8]
    job_name = "Cross-Validation-%s-%s" % (now, random_hash)

    return _map_job.create(_train_test_model,
                           parameter_set=params,
                           name=job_name,
                           environment=environment,
                           combiner_function=_combiner)