def __init__(self, factory, parameter_sets, name,
             strategy=None, environment=None, return_model=True):
    """
    Constructor for a ModelSearchJob.
    """
    self.factory = factory
    self.parameter_sets = parameter_sets
    self.name = name
    self.strategy = strategy
    self.return_model = return_model
    self.environment = environment

    def get_max_model_id(parameter_sets):
        max_model_id = 0
        for ps in parameter_sets:
            model_id = ps['metadata']['model_id']
            max_model_id = max(model_id, max_model_id)
        return max_model_id

    self.max_model_id = get_max_model_id(parameter_sets)

    # Create batches of parameter sets.
    def chunks(l, n):
        """
        Yield successive n-sized chunks from l.
        """
        for i in xrange(0, len(l), n):
            yield l[i:i + n]

    # Tuning parameter for dividing the work into batches.
    batch_size = max(10, int(math.ceil(len(parameter_sets) / 3.0)))
    parameter_batches = [c for c in chunks(parameter_sets, batch_size)]

    # Construct one map job per batch of parameter sets.
    self.jobs = []
    for i, parameter_batch in enumerate(parameter_batches):
        job_name = name + '%05d' % i
        job = _map_job.create(factory, parameter_batch,
                              name=job_name,
                              environment=environment,
                              combiner_function=_combine_mps_tasks)
        self.jobs.append(job)
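# The batch size above is a heuristic: at least 10 parameter sets per batch,
# otherwise roughly a third of the total, so the work splits into about three
# map jobs. A minimal standalone sketch of that logic follows; the sample
# parameter sets are made-up placeholders for illustration.
import math

def _sketch_chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

_sample_sets = [{'metadata': {'model_id': i}} for i in range(25)]

# At least 10 per batch; otherwise split into roughly three batches.
_batch_size = max(10, int(math.ceil(len(_sample_sets) / 3.0)))
_batches = list(_sketch_chunks(_sample_sets, _batch_size))

print([len(b) for b in _batches])  # [10, 10, 5]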
def cross_val_score(datasets, model_factory, model_parameters,
                    evaluator=_default_evaluator,
                    environment=None, return_model=True):
    """
    Evaluate model performance via cross validation for a given set of
    parameters.

    Parameters
    ----------
    {param_data}
    {param_model_factory}
    model_parameters : dict
        A dictionary of parameters that will be passed to the provided
        model factory.
    {param_evaluator}
    {param_environment}
    {param_return_model}
    {param_returns}

    See Also
    --------
    graphlab.toolkits.model_parameter_search.create

    Examples
    --------
    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = gl.SFrame.read_csv(url)
    >>> data['label'] = (data['label'] == 'p')
    >>> folds = gl.cross_validation.KFold(data, 5)
    >>> params = dict([('target', 'label'), ('max_depth', 5)])
    >>> job = gl.cross_validation.cross_val_score(folds,
    ...                                           gl.boosted_trees_classifier.create,
    ...                                           params)
    >>> print job.get_results()
    """
    _get_metric_tracker().track('cross_validation.cross_val_score')

    # Normalize the input into a KFold object.
    if isinstance(datasets, _graphlab.SFrame):
        folds = [(datasets, None)]
    elif isinstance(datasets, tuple):
        if len(datasets) != 2:
            raise ValueError("Provided dataset tuple must be a train/test pair.")
        folds = [datasets]
    else:
        folds = datasets
    if not isinstance(folds, KFold):
        folds = KFold.from_list(folds)

    num_folds = folds.num_folds
    include_fold_id = num_folds > 1

    # Wrap scikit-learn factories, if needed, once for all folds.
    model_factory = _check_if_sklearn_factory(model_factory, model_parameters)

    # Build one parameter dictionary per fold; model_id uniquely identifies
    # each trained model.
    params = []
    model_id = 0
    for fold_id in range(num_folds):
        metadata = {'model_id': model_id}
        if include_fold_id:
            metadata['fold_id'] = fold_id
        model_id += 1
        params.append({
            'model_factory': model_factory,
            'model_parameters': model_parameters,
            'folds': folds,
            'evaluator': evaluator,
            'return_model': return_model,
            'metadata': metadata})

    # Generate a unique job name from a timestamp and a short hash.
    now = _datetime.now().strftime('%b-%d-%Y-%H-%M-%S-%f')
    random_hash = str(hash((id(folds), "%.21f" % _time())))[:8]
    job_name = "Cross-Validation-%s-%s" % (now, random_hash)

    return _map_job.create(_train_test_model,
                           parameter_set=params,
                           name=job_name,
                           environment=environment,
                           combiner_function=_combiner)
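# For illustration, a standalone sketch of the per-fold parameter layout that
# cross_val_score builds before dispatching to _map_job.create. The factory and
# parameter values below are hypothetical placeholders, not part of the real
# API; the point is the model_id / fold_id bookkeeping in the metadata.
def _sketch_fold_params(num_folds):
    include_fold_id = num_folds > 1
    params = []
    for model_id in range(num_folds):
        metadata = {'model_id': model_id}
        if include_fold_id:
            # One model per fold here, so fold_id tracks model_id.
            metadata['fold_id'] = model_id
        params.append({'model_factory': '<factory>',          # placeholder
                       'model_parameters': {'max_depth': 5},  # placeholder
                       'metadata': metadata})
    return params

# _sketch_fold_params(1) -> metadata has no 'fold_id' (single fold)
# _sketch_fold_params(3) -> metadata carries both 'model_id' and 'fold_id'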