def cross_val_score(datasets, model_factory, model_parameters,
                    evaluator=_default_evaluator, environment=None,
                    return_model=True):
    """
    Evaluate model performance via cross validation for a given set of
    parameters.

    Parameters
    ----------
    {param_data}
    {param_model_factory}
    model_parameters : dict
        The params argument takes a dictionary containing parameters that will
        be passed to the provided model factory.
    {param_evaluator}
    {param_environment}
    {param_return_model}
    {param_returns}

    See Also
    --------
    graphlab.toolkits.model_parameter_search.create

    Examples
    --------
    >>> url = 'http://s3.amazonaws.com/gl-testdata/xgboost/mushroom.csv'
    >>> data = gl.SFrame.read_csv(url)
    >>> data['label'] = (data['label'] == 'p')
    >>> folds = gl.cross_validation.KFold(data, 5)
    >>> params = dict([('target', 'label'), ('max_depth', 5)])
    >>> job = gl.cross_validation.cross_val_score(folds, gl.boosted_trees_classifier.create, params)
    >>> print job.get_results()
    """
    _get_metric_tracker().track('cross_validation.cross_val_score')

    # Normalize the accepted input forms (single SFrame, train/test tuple,
    # list of pairs, or an existing KFold) down to a KFold instance.
    if isinstance(datasets, _graphlab.SFrame):
        pairs = [(datasets, None)]
    elif isinstance(datasets, tuple):
        if len(datasets) != 2:
            raise ValueError("Provided dataset tuple must be train/test pair.")
        pairs = [datasets]
    else:
        pairs = datasets
    folds = pairs if isinstance(pairs, KFold) else KFold.from_list(pairs)

    num_folds = folds.num_folds
    include_fold_id = num_folds > 1

    # Wrap sklearn estimators with a compatible factory interface (no-op for
    # native factories); this is loop-invariant so it is done once up front.
    model_factory = _check_if_sklearn_factory(model_factory, model_parameters)

    # One task description per fold. There is a single parameter setting, so
    # the model id advances in lockstep with the fold id.
    params = []
    for fold_id in range(num_folds):
        metadata = {'model_id': fold_id}
        if include_fold_id:
            metadata['fold_id'] = fold_id
        params.append({'model_factory': model_factory,
                       'model_parameters': model_parameters,
                       'folds': folds,
                       'evaluator': evaluator,
                       'return_model': return_model,
                       'metadata': metadata})

    # Build a (mostly) unique, human-readable job name.
    now = _datetime.now().strftime('%b-%d-%Y-%H-%M-%S-%f')
    random_hash = str(hash((id(folds), ("%.21f" % _time()))))[:8]
    job_name = "Cross-Validation-%s-%s" % (now, random_hash)

    return _map_job.create(_train_test_model,
                           parameter_set=params,
                           name=job_name,
                           environment=environment,
                           combiner_function=_combiner)
def create(datasets, model_factory, model_parameters,
           evaluator=_default_evaluator, environment=None,
           return_model=True, perform_trial_run=True, max_models=10):
    """
    Evaluate model performance, in parallel, over a set of parameters, where
    the parameters are chosen randomly.

    Parameters
    ----------
    {param_data}
    {param_model_factory}
    {param_model_params}
        A user can also specify a random variable as the value for an
        argument. For each model, the parameter value will be sampled from
        this distribution. For a given scipy.distribution, v, each model will
        first call v.rvs(1) to sample a single value from the distribution.
        For example, 'step_size': scipy.stats.distribution.expon(.1) would
        choose step_size to be the result of calling the `rvs` method on the
        exponential distribution.
    {param_evaluator}
    {param_environment}
    {param_return_model}
    {param_perform_trial_run}
    {param_max_models}
    {param_returns}

    See Also
    --------
    graphlab.toolkits.model_parameter_search.create,
    graphlab.toolkits.model_parameter_search.manual_search.create

    Examples
    --------
    Perform a random search on a single train/test split.

    .. sourcecode:: python

        >>> import scipy.stats
        >>> sf = gl.SFrame()
        >>> sf['x'] = range(100)
        >>> sf['y'] = [0, 1]* 50
        >>> train, valid = sf.random_split(.5)
        >>> params = dict([('target', 'y'),
        ...                ('step_size', scipy.stats.distributions.expon(.1)),
        ...                ('max_depth', [5, 7])])
        >>> job = gl.random_search.create((train, valid),
        ...                               gl.boosted_trees_regression.create,
        ...                               params)
        >>> job.get_results()

    Perform a random search on a k-fold split.

    .. sourcecode:: python

        >>> folds = gl.cross_validation.KFold(sf, 5)
        >>> params = dict([('target', 'y'),
        ...                ('step_size', scipy.stats.distributions.expon(.1)),
        ...                ('max_depth', [5, 7])])
        >>> job = gl.random_search.create(folds,
        ...                               gl.boosted_trees_classifier.create,
        ...                               params)
        >>> job.get_results()
    """
    # Wrap sklearn estimators with a compatible factory interface
    # (no-op for native GLC factories).
    model_factory = _check_if_sklearn_factory(model_factory, model_parameters)

    # Draw `max_models` independent parameter settings; each call samples any
    # distribution-valued entries and picks from any list-valued entries.
    model_param_list = [_random_choice(model_parameters)
                        for _ in range(max_models)]

    return _create_model_search(datasets,
                                model_factory,
                                model_param_list,
                                strategy='random',
                                evaluator=evaluator,
                                environment=environment,
                                return_model=return_model,
                                perform_trial_run=perform_trial_run)
def cross_val_score(datasets, model_factory, model_parameters,
                    evaluator=_default_evaluator, environment=None,
                    return_model=True):
    """
    Evaluate model performance via cross validation for a given set of
    parameters.

    Parameters
    ----------
    {param_data}
    {param_model_factory}
    model_parameters : dict
        The params argument takes a dictionary containing parameters that will
        be passed to the provided model factory.
    {param_evaluator}
    {param_environment}
    {param_return_model}
    {param_returns}

    See Also
    --------
    graphlab.toolkits.model_parameter_search.create

    Examples
    --------
    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = gl.SFrame.read_csv(url)
    >>> data['label'] = (data['label'] == 'p')
    >>> folds = gl.cross_validation.KFold(data, 5)
    >>> params = dict([('target', 'label'), ('max_depth', 5)])
    >>> job = gl.cross_validation.cross_val_score(folds, gl.boosted_trees_classifier.create, params)
    >>> print job.get_results()
    """
    _get_metric_tracker().track('cross_validation.cross_val_score')

    # Normalize the accepted input forms (single SFrame, train/test tuple,
    # list of pairs, or an existing KFold) down to a KFold instance.
    if isinstance(datasets, _graphlab.SFrame):
        folds = [(datasets, None)]
    elif isinstance(datasets, tuple):
        if len(datasets) != 2:
            raise ValueError("Provided dataset tuple must be train/test pair.")
        folds = [datasets]
    else:
        folds = datasets
    if not isinstance(folds, KFold):
        folds = KFold.from_list(folds)

    num_folds = folds.num_folds
    include_fold_id = num_folds > 1

    # FIX: the sklearn-factory check is loop-invariant, so wrap the factory
    # once up front instead of re-wrapping it for every fold (this also
    # matches the sibling cross_val_score implementation in this module).
    model_factory = _check_if_sklearn_factory(model_factory, model_parameters)

    # One task description per fold; model_id advances with each task.
    params = []
    model_id = 0
    for fold_id in range(num_folds):
        metadata = {'model_id': model_id}
        if include_fold_id:
            metadata['fold_id'] = fold_id
        model_id += 1
        params.append({'model_factory': model_factory,
                       'model_parameters': model_parameters,
                       'folds': folds,
                       'evaluator': evaluator,
                       'return_model': return_model,
                       'metadata': metadata})

    # Build a (mostly) unique, human-readable job name.
    now = _datetime.now().strftime('%b-%d-%Y-%H-%M-%S-%f')
    random_hash = str(hash((id(folds), ("%.21f" % _time()))))[:8]
    job_name = "Cross-Validation-%s-%s" % (now, random_hash)

    return _map_job.create(_train_test_model,
                           parameter_set=params,
                           name=job_name,
                           environment=environment,
                           combiner_function=_combiner)
def create(datasets, model_factory, model_parameters,
           evaluator=_default_evaluator, environment=None,
           return_model=True, perform_trial_run=True):
    """
    Evaluate model performance, in parallel, over a grid of parameters.

    Parameters
    ----------
    {param_data}
    {param_model_factory}
    {param_model_params}
        The collection of all combinations of valid parameter values defines
        a grid of model parameters that will be considered.
    {param_evaluator}
    {param_environment}
    {param_return_model}
    {param_perform_trial_run}
    {param_returns}

    See Also
    --------
    graphlab.toolkits.model_parameter_search.create,
    graphlab.toolkits.model_parameter_search.random_search.create,
    graphlab.toolkits.cross_validation.cross_val_score

    Examples
    --------
    Perform a grid search on a single train/test split.

    >>> train, valid = sf.random_split()
    >>> params = dict([('target', 'Y'),
    ...                ('step_size', [0.01, 0.1]),
    ...                ('max_depth', [5, 7])])
    >>> job = gl.grid_search.create((train, valid),
    ...                             gl.boosted_trees_classifier.create,
    ...                             params)
    >>> job.get_results()

    Perform a grid search on a k-fold split.

    >>> folds = gl.cross_validation.KFold(sf, 5)
    >>> params = dict([('target', 'Y'),
    ...                ('step_size', [0.01, 0.1]),
    ...                ('max_depth', [5, 7])])
    >>> job = gl.grid_search.create(folds,
    ...                             gl.boosted_trees_classifier.create,
    ...                             params)
    >>> job.get_results()
    """
    # Expand list-valued parameters into the full cartesian grid of settings.
    grid = _get_all_parameters_combinations(model_parameters)

    # Wrap sklearn estimators with a compatible factory interface
    # (no-op for native GLC factories).
    factory = _check_if_sklearn_factory(model_factory, model_parameters)

    return _create_model_search(datasets,
                                factory,
                                grid,
                                strategy='grid',
                                evaluator=evaluator,
                                environment=environment,
                                return_model=return_model,
                                perform_trial_run=perform_trial_run)