def _build_pipeline(self):
    self._pipeline = MLPipeline(self.template)

    if self._hyperparameters:
        self._pipeline.set_hyperparameters(self._hyperparameters)

    self.fitted = False
@classmethod
def _get_tunables(cls, template_dicts):
    tunables = {}
    for name, template in template_dicts.items():
        pipeline = MLPipeline(template)
        pipeline_tunables = pipeline.get_tunable_hyperparameters(flat=True)
        tunables[name] = Tunable.from_dict(pipeline_tunables)

    return tunables
def _load_pipeline(pipeline, hyperparams=None):
    if isinstance(pipeline, str) and os.path.isfile(pipeline):
        pipeline = MLPipeline.load(pipeline)
    else:
        pipeline = MLPipeline(pipeline)

    if hyperparams is not None:
        pipeline.set_hyperparameters(hyperparams)

    return pipeline
def _get_mlpipeline(self):
    pipeline = self._pipeline
    if isinstance(pipeline, str) and os.path.isfile(pipeline):
        with open(pipeline) as json_file:
            pipeline = json.load(json_file)

    mlpipeline = MLPipeline(pipeline)
    if self._hyperparameters:
        mlpipeline.set_hyperparameters(self._hyperparameters)

    return mlpipeline
def _load_pipeline(pipeline):
    if isinstance(pipeline, MLPipeline):
        return pipeline

    if isinstance(pipeline, str):
        return MLPipeline.load(pipeline)

    if isinstance(pipeline, dict):
        return MLPipeline.from_dict(pipeline)

    raise ValueError('Invalid pipeline: {}'.format(pipeline))
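A minimal usage sketch of the dispatcher above; the ``pipeline.json`` path is hypothetical, and ``to_dict`` is the standard MLBlocks serialization method:

# Hypothetical inputs: a saved MLPipeline JSON file and an in-memory instance.
pipeline = _load_pipeline('pipeline.json')     # str: loaded from disk
same = _load_pipeline(pipeline)                # MLPipeline: returned as-is
clone = _load_pipeline(pipeline.to_dict())     # dict: rebuilt with from_dict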
def build_pipeline(pipeline_spec):
    pipeline = MLPipeline(
        pipeline_spec['primitives'],
        pipeline_spec.get('init_params', dict()),
        pipeline_spec.get('input_names', dict()),
        pipeline_spec.get('output_names', dict()),
    )

    hyperparameters = pipeline_spec.get('hyperparameters')
    if hyperparameters:
        pipeline.set_hyperparameters(hyperparameters)

    return pipeline
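For reference, a minimal ``pipeline_spec`` exercising the keys that ``build_pipeline`` reads; the primitives are borrowed from the training snippet further below and the values are purely illustrative:

# Illustrative spec; every key except 'primitives' is optional.
spec = {
    'primitives': ['sklearn.impute.SimpleImputer', 'xgboost.XGBClassifier'],
    'init_params': {'sklearn.impute.SimpleImputer#1': {'strategy': 'median'}},
    'hyperparameters': {'xgboost.XGBClassifier#1': {'learning_rate': 0.1}},
}
pipeline = build_pipeline(spec)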
def _load_mlpipeline(self, template):
    if not isinstance(template, dict):
        template = self._load_template(template)

    self.template = template
    return MLPipeline.from_dict(template)
def cv_score(self, X, y, context, metric=None, cv=None):
    """Cross-validate this pipeline."""
    scorer = METRICS_DICT[metric or self.metric]

    LOGGER.debug('CV Scoring pipeline %s', self)

    self.cv_scores = list()
    for fold, (train_index, test_index) in enumerate(cv.split(X, y)):
        LOGGER.debug('Scoring fold: %s', fold)

        X_train, y_train = self._get_split(X, y, train_index)
        pipeline = MLPipeline.from_dict(self._tunable)
        pipeline.fit(X_train, y_train, **context)

        X_test, y_test = self._get_split(X, y, test_index)
        pred = pipeline.predict(X_test, **context)

        score = scorer(pred, y_test)
        self.cv_scores.append(score)

        LOGGER.debug('Fold %s score: %s', fold, score)

    score, std, rank = self._get_score()
    LOGGER.debug('CV score: %s +/- %s; rank: %s', score, std, rank)

    self.score = score
    self.std = std
    self.rank = rank + random.random() * 1.e-12    # to avoid rank collisions
def test_jsons():
    """Validate MLBlocks primitive jsons."""
    primitives = (f for f in os.listdir(MLPRIMITIVES_JSONS_PATH) if f.endswith('.json'))
    for primitive_filename in primitives:
        try:
            primitive_path = os.path.join(MLPRIMITIVES_JSONS_PATH, primitive_filename)
            with open(primitive_path, 'r') as f:
                primitive = json.load(f)

            primitive_name = primitive['name']
            fixed_hyperparameters = primitive.get('hyperparameters', dict()).get('fixed', dict())

            init_hyperparameters = dict()
            for name, hyperparameter in fixed_hyperparameters.items():
                if 'default' not in hyperparameter:
                    type_ = hyperparameter.get('type')
                    init_hyperparameters[name] = HYPERPARAMETER_DEFAULTS.get(type_)

            block_name = primitive_name + '#1'
            mlpipeline = MLPipeline([primitive_name], {block_name: init_hyperparameters})

            # Validate methods
            mlblock = mlpipeline.blocks[block_name]
            if mlblock._class:
                fit = primitive.get('fit')
                if fit:
                    assert hasattr(mlblock.instance, fit['method'])

                produce = primitive['produce']
                assert hasattr(mlblock.instance, produce['method'])

        except Exception:
            raise ValueError("Invalid JSON primitive: {}".format(primitive_filename))
def load(self):
    """Load this Template as an MLPipeline.

    Returns:
        MLPipeline
    """
    return MLPipeline(self.json)
def pipeline_score(pipeline_dict, X, y, scorer, context=None,
                   n_splits=5, cv=None, random_state=0):
    context = context or dict()

    LOGGER.debug('CV Scoring pipeline %s', pipeline_dict.get('name'))

    cv_scores = list()
    if not cv:
        metadata = pipeline_dict.get('metadata', pipeline_dict.get('loader', dict()))
        if metadata.get('task_type') == 'classification':
            cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        else:
            cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for fold, (train_index, test_index) in enumerate(cv.split(X, y)):
        LOGGER.debug('Scoring fold: %s', fold)

        X_train, y_train = get_split(X, y, train_index)
        pipeline = MLPipeline.from_dict(pipeline_dict)
        pipeline.fit(X_train, y_train, **context)

        X_test, y_test = get_split(X, y, test_index)
        pred = pipeline.predict(X_test, **context)

        score = scorer(pred, y_test)
        cv_scores.append(score)

        LOGGER.debug('Fold %s score: %s', fold, score)

    return np.mean(cv_scores), np.std(cv_scores)
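A hedged example call; note that ``pipeline_score`` invokes the scorer as ``scorer(pred, y_test)``, so a standard sklearn metric needs its arguments swapped:

from sklearn.metrics import f1_score

# pipeline_dict, X and y are assumed to exist; the pipeline metadata's
# 'task_type' decides between StratifiedKFold and KFold.
mean_score, std_score = pipeline_score(
    pipeline_dict, X, y,
    scorer=lambda pred, true: f1_score(true, pred, average='macro'),
)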
def fit(self, data_params):
    """Fit the pipeline on the given params."""
    X, y = data_params.X, data_params.y
    self.pipeline = MLPipeline.from_dict(self.pipeline_dict)
    self.pipeline.fit(X, y, **data_params.context)
    self.fitted = True
def pipeline(self):
    """Pipeline.

    Returns:
        MLPipeline:
            The pipeline in the modeler.
    """
    return MLPipeline(self._pipeline)
def _generate_splits(self, X, y, readings):
    if self._preprocessing:
        pipeline = MLPipeline(self.template)
        LOGGER.debug('Running %s preprocessing steps', self._preprocessing)
        context = pipeline.fit(X=X, y=y, readings=readings,
                               output_=self._preprocessing - 1)
        del context['X']
        del context['y']
    else:
        context = {'readings': readings}

    splits = list()
    for fold, (train_index, test_index) in enumerate(self._cv.split(X, y)):
        LOGGER.debug('Running static steps for fold %s', fold)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        pipeline = MLPipeline(self.template)
        fit = pipeline.fit(X_train, y_train, output_=self._static - 1,
                           start_=self._preprocessing, **context)
        predict = pipeline.predict(X_test, output_=self._static - 1,
                                   start_=self._preprocessing, **context)

        splits.append((fold, pipeline, fit, predict, y_test))

    return splits
def test_load_orion_mlpipeline(self, tmpdir):
    pipeline = MLPipeline('dummy')
    orion = functional._load_orion(pipeline)

    assert isinstance(orion, Orion)
    assert orion._pipeline == pipeline
    assert not orion._fitted
    assert orion._hyperparameters is None
def test_mlpipeline(self):
    primitives = ['sklearn.ensemble.RandomForestClassifier']
    init_params = {
        'sklearn.ensemble.RandomForestClassifier#1': {
            'n_estimators': 500
        }
    }
    pipeline = MLPipeline(primitives=primitives, init_params=init_params)

    pipeline2 = MLPipeline(pipeline)

    assert pipeline2.primitives == ['sklearn.ensemble.RandomForestClassifier']
    assert pipeline2.init_params == {
        'sklearn.ensemble.RandomForestClassifier#1': {
            'n_estimators': 500
        }
    }
def preprocess(self, X, y, context):
    """Execute the preprocessing steps of the pipeline."""
    if self._preprocessing:
        LOGGER.info("Executing preprocessing pipeline")
        pipeline = MLPipeline.from_dict(self._preprocessing)
        pipeline.fit(X, y, **context)
        return pipeline.predict(X, **context)
    else:
        LOGGER.info("No preprocessing steps found")
        return X
def create_pipeline(self, primitives, hyperparameters=None):
    """Create a pipeline of primitives.

    Args:
        primitives: A list of primitives.
        hyperparameters: A dictionary of hyperparameters for each primitive.

    Returns:
        An MLPipeline instance.
    """
    self.primitive = self.check_path(primitives)

    if hyperparameters is not None:
        hyperparameters = self.check_path_hyperparameters(hyperparameters)
        pipeline = MLPipeline(self.primitive, hyperparameters)
    else:
        pipeline = MLPipeline(self.primitive)

    return pipeline
def _generate_splits(self, template_name, target_times, readings, turbines=None):
    template = self._template_dicts.get(template_name)
    pipeline = MLPipeline(template)
    preprocessing = self._preprocessing.get(template_name)
    static = self._count_static_steps(pipeline)
    X = target_times[['turbine_id', 'cutoff_time']]
    y = target_times['target']

    if preprocessing:
        if preprocessing > static:
            raise ValueError('Preprocessing cannot be bigger than static')

        LOGGER.debug('Running %s preprocessing steps', preprocessing)
        context = pipeline.fit(X=X, y=y, readings=readings,
                               turbines=turbines, output_=preprocessing - 1)
        del context['X']
        del context['y']
        gc.collect()
    else:
        context = {
            'readings': readings,
            'turbines': turbines,
        }

    splits = list()
    for fold, (train_index, test_index) in enumerate(self._cv.split(X, y)):
        LOGGER.debug('Running static steps for fold %s', fold)
        gc.collect()
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        pipeline = MLPipeline(template)
        fit = pipeline.fit(X_train, y_train, output_=static - 1,
                           start_=preprocessing, **context)
        predict = pipeline.predict(X_test, output_=static - 1,
                                   start_=preprocessing, **context)

        split = (fold, pipeline, fit, predict, y_test, static)

        if self._cache_path:
            split_name = '{}_{}.pkl'.format(template_name, fold)
            split_path = os.path.join(self._cache_path, split_name)
            os.makedirs(os.path.dirname(split_path), exist_ok=True)
            with open(split_path, 'wb') as split_file:
                pickle.dump(split, split_file)

            split = split_path

        splits.append(split)
        gc.collect()

    return splits
def add_template(self, name, template=None):
    """Add a new Template object to the database.

    The template can be passed as the name of a registered MLPipeline,
    as a path to an MLPipeline JSON specification, as a full dictionary
    specification of an MLPipeline, or directly as an MLPipeline instance.

    If the ``template`` argument is not passed, the given ``name`` will be
    used to load an MLPipeline.

    During this step, apart from the Template object, a new Pipeline object
    using the default hyperparameters and with the same name as the Template
    will also be created.

    Args:
        name (str):
            Name of the Template.
        template (str, dict or MLPipeline):
            Name of the MLBlocks template to load, path to its JSON file,
            dictionary specification, or MLPipeline instance. If not given,
            the ``name`` of the template is used.

    Raises:
        NotUniqueError:
            If a Template with the same name already exists.

    Returns:
        Template
    """
    template = template or name

    if isinstance(template, str) and os.path.isfile(template):
        with open(template, 'r') as f:
            template = json.load(f)

    pipeline_dict = MLPipeline(template).to_dict()

    template = schema.Template.insert(
        name=name,
        json=pipeline_dict,
        created_by=self.user
    )
    schema.Pipeline.insert(
        name=name,
        template=template,
        json=pipeline_dict,
        created_by=self.user
    )

    return template
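A usage sketch; ``explorer`` stands in for an instance of the class that defines ``add_template``, and the JSON path is hypothetical:

# Register a template from a JSON specification on disk. A Pipeline object
# with the same name and the default hyperparameters is created alongside it.
template = explorer.add_template('my_template', 'templates/my_template.json')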
def _load_mlpipeline(self, template):
    if not isinstance(template, dict):
        template_name = template
        if os.path.isfile(template_name):
            with open(template_name, 'r') as template_file:
                template = json.load(template_file)

        elif self._db:
            template = self._db.load_template(template_name)

        if not template:
            raise ValueError('Unknown template {}'.format(template_name))

    self.template = template
    return MLPipeline.from_dict(template)
def train_mlblocks(X_train, X_test, y_train, y_test, mtype, common_name_model,
                   problemtype, classes, default_featurenames, transform_model,
                   settings, model_session):

    # name the model
    model_name = common_name_model + '.pickle'
    files = list()

    if mtype == 'c':
        # set up primitives and train a classification model
        primitives = ['sklearn.impute.SimpleImputer', 'xgboost.XGBClassifier']
        init_params = {
            'sklearn.impute.SimpleImputer': {
                'strategy': 'median'
            },
            'xgboost.XGBClassifier': {
                'learning_rate': 0.1
            }
        }
        pipeline = MLPipeline(primitives, init_params=init_params)
        pipeline.fit(X_train, y_train)

    if mtype == 'r':
        # set up primitives and train a regression model
        primitives = ['sklearn.impute.SimpleImputer', 'xgboost.XGBRegressor']
        pipeline = MLPipeline(primitives)
        pipeline.fit(X_train, y_train)

    # save the model
    print('saving model')
    with open(model_name, 'wb') as modelfile:
        pickle.dump(pipeline, modelfile)

    # get the model directory
    files.append(model_name)
    model_dir = os.getcwd()

    return model_name, model_dir, files
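Loading the pickled pipeline back is the mirror image of the save step above; the ``'model.pickle'`` path is hypothetical and stands for the ``model_name`` returned by the function:

import pickle

# Restore the trained MLPipeline and predict on new data (X_test assumed).
with open('model.pickle', 'rb') as modelfile:
    pipeline = pickle.load(modelfile)

predictions = pipeline.predict(X_test)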
def test_dict(self):
    pipeline_dict = {
        'primitives': ['sklearn.ensemble.RandomForestClassifier'],
        'init_params': {
            'sklearn.ensemble.RandomForestClassifier#1': {
                'n_estimators': 500
            }
        },
        'input_names': {
            'sklearn.ensemble.RandomForestClassifier#1': {
                'X': 'X1'
            }
        },
        'output_names': {
            'sklearn.ensemble.RandomForestClassifier#1': {
                'y': 'y1'
            }
        }
    }

    pipeline = MLPipeline(pipeline_dict)

    assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier']
    assert pipeline.init_params == {
        'sklearn.ensemble.RandomForestClassifier#1': {
            'n_estimators': 500
        }
    }
    assert pipeline.input_names == {
        'sklearn.ensemble.RandomForestClassifier#1': {
            'X': 'X1'
        }
    }
    assert pipeline.output_names == {
        'sklearn.ensemble.RandomForestClassifier#1': {
            'y': 'y1'
        }
    }
def __init__(self, pipeline_dict, loader, metric, problem_doc):
    self.pipeline_dict = pipeline_dict
    self.name = pipeline_dict['name']
    self.template = pipeline_dict.get('template')
    self.loader = loader
    self.metric = metric
    self.problem_doc = problem_doc

    preprocessing_blocks = self.pipeline_dict.get('preprocessing_blocks')
    if preprocessing_blocks:
        preprocessing = pipeline_dict.copy()
        preprocessing_primitives = preprocessing['primitives'][:preprocessing_blocks]
        preprocessing['primitives'] = preprocessing_primitives
        self._preprocessing = preprocessing

        tunable = pipeline_dict.copy()
        tunable_primitives = tunable['primitives'][preprocessing_blocks:]
        tunable['primitives'] = tunable_primitives
        self._tunable = tunable

        pre_params, tun_params = self._extract_hyperparameters(preprocessing_primitives)
        self._preprocessing['hyperparameters'] = pre_params
        self._tunable['hyperparameters'] = tun_params
    else:
        self._preprocessing = None
        self._tunable = pipeline_dict

    self.id = str(uuid.uuid4())
    self.cv_scores = list()
    self.rank = None
    self.score = None
    self.dumped = False
    self.fitted = False
    self.pipeline = MLPipeline.from_dict(pipeline_dict)
def score_pipeline(pipeline_metadata, n_splits=5, random_state=0, dataset=None):
    if isinstance(pipeline_metadata, str):
        LOGGER.info('Loading pipeline %s', pipeline_metadata)
        with open(pipeline_metadata, 'r') as pipeline_file:
            pipeline_metadata = json.load(pipeline_file)

    validation = pipeline_metadata['validation']
    if dataset is None:
        dataset = validation['dataset']

    LOGGER.info('Loading dataset %s', dataset)
    dataset = load_dataset(dataset)

    metric = validation.get('metric')
    metric_args = validation.get('metric_args', dict())
    if metric:
        scorer = get_scorer(metric, metric_args)
    else:
        scorer = dataset.score
        metric = dataset.metric

    scores = list()
    splits = dataset.get_splits(n_splits, random_state)
    if n_splits == 1:
        splits = [splits]

    for split, (X_train, X_test, y_train, y_test) in enumerate(splits):
        LOGGER.info('Scoring split %s', split + 1)
        context = get_context(dataset, validation.get('context', dict()))
        pipeline = MLPipeline.from_dict(pipeline_metadata)
        pipeline.fit(X_train, y_train, **context)
        predictions = pipeline.predict(X_test, **context)

        score = scorer(y_test, predictions)
        LOGGER.info('Split %s %s: %s', split + 1, metric, score)
        scores.append(score)

    return np.mean(scores), np.std(scores)
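An example invocation, assuming a pipeline JSON whose ``validation`` block names the dataset and metric as read above; the path is hypothetical:

# When n_splits=1, the single (X_train, X_test, y_train, y_test) tuple
# returned by dataset.get_splits is wrapped in a list before iterating.
mean_score, std_score = score_pipeline('pipelines/my_pipeline.json', n_splits=5)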
def k_fold_validation(self, hyperparameters, X, y, scoring=None):
    """Score the pipeline through k-fold validation with the given scoring function.

    Args:
        hyperparameters (dict or None):
            A dictionary of hyperparameters for each primitive in the
            target pipeline.
        X (pandas.DataFrame or ndarray):
            Inputs to the pipeline.
        y (pandas.Series or ndarray):
            Target values.
        scoring (str):
            The name of the scoring function.

    Returns:
        np.float64:
            The average score across the k folds.
    """
    model_instance = MLPipeline(self._pipeline)
    X = pd.DataFrame(X)
    y = pd.Series(y)

    if hyperparameters:
        model_instance.set_hyperparameters(hyperparameters)

    if self._problem_type == 'regression':
        scorer = self.regression_metrics[scoring or 'R2 Score']
    else:
        scorer = self.classification_metrics[scoring or 'F1 Macro']

    scores = []
    kf = KFold(n_splits=10, random_state=None, shuffle=True)
    for train_index, test_index in kf.split(X):
        model_instance.fit(X.iloc[train_index], y.iloc[train_index])
        y_pred = model_instance.predict(X.iloc[test_index])
        scores.append(scorer(y.iloc[test_index], y_pred))

    return np.mean(scores)
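A sketch of how this method might be called; ``modeler`` stands for an instance of the class defining it, and the hyperparameter block name follows the MLBlocks ``primitive#N`` convention and is illustrative:

# Assumes a classification pipeline built around a random forest.
score = modeler.k_fold_validation(
    {'sklearn.ensemble.RandomForestClassifier#1': {'n_estimators': 300}},
    X, y,
    scoring='F1 Macro',
)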
def load_pipeline(self, pipeline):
    LOGGER.info("Loading pipeline %s", pipeline.name)
    return MLPipeline.from_dict(pipeline.mlpipeline)
class GreenGuardPipeline(object):
    """Main Machine Learning component in the GreenGuard project.

    The ``GreenGuardPipeline`` represents the abstraction of a Machine
    Learning pipeline architecture specialized on the GreenGuard data format.

    In order to use it, an MLBlocks pipeline template needs to be given,
    alongside information about how to evaluate its performance using
    cross validation.

    Attributes:
        template (MLPipeline):
            MLPipeline instance used as the template for tuning.
        template_name:
            Name of the template being used.
        fitted (bool):
            Whether this GreenGuardPipeline has already been fitted or not.
        steps (list):
            List of primitives that compose this template.
        preprocessing (list):
            List of preprocessing steps. These steps have no learning stage
            and are executed only once on the complete training dataset,
            before partitioning it for cross validation.
        static (list):
            List of static steps. These are all the steps in the pipeline
            that come after the preprocessing ones but have no hyperparameters.
            These are executed on each cross validation split only once, when
            the data is partitioned, and their output is cached to be reused
            later on at every tuning iteration.
        tunable (list):
            List of steps that have hyperparameters and will be tuned during
            the tuning loop.

    Args:
        templates (str, MLPipeline or list):
            Template to use. If a ``str`` is given, the corresponding
            ``MLPipeline`` is loaded. A ``list`` combining both is also
            accepted.
        metric (str or function):
            Metric to use. If a ``str`` is given, it must be one of the
            metrics defined in the ``greenguard.metrics.METRICS`` dictionary.
        cost (bool):
            Whether the metric is a cost function (the lower the better) or
            not. Defaults to ``False``.
        init_params (dict or list):
            There are three possible values for init_params:

                * Init params ``dict``: it will be used for all templates.
                * ``dict`` with the name of the template as a key and a
                  dictionary with its init params as the value.
                * ``list``: each value will be assigned to the corresponding
                  position of self.templates.

            Defaults to ``None``.
        stratify (bool):
            Whether to stratify the data when partitioning for cross
            validation. Defaults to ``True``.
        cv_splits (int):
            Number of cross validation folds to use. Defaults to ``5``.
        shuffle (bool):
            Whether to shuffle the data when partitioning for cross
            validation. Defaults to ``True``.
        random_state (int or RandomState):
            Random state to use for the cross validation partitioning.
            Defaults to ``0``.
        preprocessing (int, dict or list):
            There are three possible values for preprocessing:

                * ``int``: the value will be used for all templates.
                * ``dict`` with the template name as a key and a number as a
                  value, will be used for that template.
                * ``list``: each value will be assigned to the corresponding
                  position of self.templates.

            Defaults to ``0``.
        cache_path (str):
            If given, cache the generated cross validation splits in this
            folder. Defaults to ``None``.
    """

    template = None
    template_name = None
    fitted = False
    cv_score = None

    _cv_class = None
    _metric = None
    _cost = False
    _tuner = None
    _pipeline = None
    _static = None
    _init_params = None
    _preprocessing = None

    def _get_cv(self, stratify, cv_splits, shuffle, random_state):
        if stratify:
            cv_class = StratifiedKFold
        else:
            cv_class = KFold

        return cv_class(n_splits=cv_splits, shuffle=shuffle, random_state=random_state)

    def _set_hyperparameters(self, new_hyperparameters):
        self._hyperparameters = deepcopy(new_hyperparameters)

    def _set_template(self, template_name):
        self.template_name = template_name
        self.template = self._template_dicts[self.template_name]

    @staticmethod
    def _update_params(old, new):
        for name, params in new.items():
            if '#' not in name:
                name = name + '#1'

            block_params = old.setdefault(name, dict())
            for param, value in params.items():
                block_params[param] = value

    def _count_static_steps(self, pipeline):
        tunable_hyperparams = pipeline.get_tunable_hyperparameters()
        for index, block_name in enumerate(pipeline.blocks.keys()):
            if tunable_hyperparams[block_name]:
                return index

        return 0

    def _get_templates(self, templates):
        template_dicts = dict()
        template_names = list()
        for template in templates:
            if isinstance(template, str):
                template_name = template
                template = load_pipeline(template_name)
            else:
                # md5 needs bytes; hexdigest yields a usable string name
                template_name = md5(json.dumps(template).encode()).hexdigest()

            template_dicts[template_name] = template
            template_names.append(template_name)

        return template_names, template_dicts

    def _generate_init_params(self, init_params):
        if not init_params:
            self._init_params = {}
        elif isinstance(init_params, list):
            self._init_params = dict(zip(self._template_names, init_params))
        elif any(name in init_params for name in self._template_names):
            self._init_params = init_params
        else:
            # a plain init_params dict is applied to all the templates
            self._init_params = {
                name: deepcopy(init_params)
                for name in self._template_names
            }

    def _generate_preprocessing(self, preprocessing):
        if isinstance(preprocessing, int):
            self._preprocessing = {name: preprocessing for name in self._template_names}
        else:
            if isinstance(preprocessing, list):
                preprocessing = dict(zip(self._template_names, preprocessing))

            self._preprocessing = {
                name: preprocessing.get(name, 0)
                for name in self._template_names
            }

    def _build_pipeline(self):
        self._pipeline = MLPipeline(self.template)

        if self._hyperparameters:
            self._pipeline.set_hyperparameters(self._hyperparameters)

        self.fitted = False

    def __init__(self, templates, metric='accuracy', cost=False, init_params=None,
                 stratify=True, cv_splits=5, shuffle=True, random_state=0,
                 preprocessing=0, cache_path=None):

        if isinstance(metric, str):
            metric, cost = METRICS[metric]

        self._metric = metric
        self._cost = cost
        self._cv = self._get_cv(stratify, cv_splits, shuffle, random_state)
        self.cv_score = np.inf if cost else -np.inf

        if not isinstance(templates, list):
            templates = [templates]

        self.templates = templates
        self._template_names, self._template_dicts = self._get_templates(templates)
        self._default_init_params = {}
        self._generate_init_params(init_params)

        for name, template in self._template_dicts.items():
            init_params = self._init_params.get(name, self._default_init_params)
            template_params = template.setdefault('init_params', {})
            self._update_params(template_params, init_params)

        self._generate_preprocessing(preprocessing)
        self._set_template(self._template_names[0])
        self._hyperparameters = dict()
        self._build_pipeline()

        self._cache_path = cache_path
        if cache_path:
            os.makedirs(cache_path, exist_ok=True)

    def get_hyperparameters(self):
        """Get the current hyperparameters.

        Returns:
            dict:
                Current hyperparameters.
        """
        return deepcopy(self._hyperparameters)

    def _is_better(self, score):
        if self._cost:
            return score < self.cv_score

        return score > self.cv_score

    def _generate_splits(self, template_name, target_times, readings, turbines=None):
        template = self._template_dicts.get(template_name)
        pipeline = MLPipeline(template)
        preprocessing = self._preprocessing.get(template_name)
        static = self._count_static_steps(pipeline)
        X = target_times[['turbine_id', 'cutoff_time']]
        y = target_times['target']

        if preprocessing:
            if preprocessing > static:
                raise ValueError('Preprocessing cannot be bigger than static')

            LOGGER.debug('Running %s preprocessing steps', preprocessing)
            context = pipeline.fit(X=X, y=y, readings=readings,
                                   turbines=turbines, output_=preprocessing - 1)
            del context['X']
            del context['y']
            gc.collect()
        else:
            context = {
                'readings': readings,
                'turbines': turbines,
            }

        splits = list()
        for fold, (train_index, test_index) in enumerate(self._cv.split(X, y)):
            LOGGER.debug('Running static steps for fold %s', fold)
            gc.collect()
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            pipeline = MLPipeline(template)
            fit = pipeline.fit(X_train, y_train, output_=static - 1,
                               start_=preprocessing, **context)
            predict = pipeline.predict(X_test, output_=static - 1,
                                       start_=preprocessing, **context)

            split = (fold, pipeline, fit, predict, y_test, static)

            if self._cache_path:
                split_name = '{}_{}.pkl'.format(template_name, fold)
                split_path = os.path.join(self._cache_path, split_name)
                with open(split_path, 'wb') as split_file:
                    pickle.dump(split, split_file)

                split = split_path

            splits.append(split)
            gc.collect()

        return splits

    def _cross_validate(self, template_splits, hyperparams):
        scores = []
        for split in template_splits:
            gc.collect()
            if self._cache_path:
                with open(split, 'rb') as split_file:
                    split = pickle.load(split_file)

            fold, pipeline, fit, predict, y_test, static = split

            LOGGER.debug('Scoring fold %s', fold)
            pipeline.set_hyperparameters(hyperparams)
            pipeline.fit(start_=static, **fit)
            predictions = pipeline.predict(start_=static, **predict)

            score = self._metric(y_test, predictions)
            LOGGER.debug('Fold %s score: %s', fold, score)
            scores.append(score)

        return np.mean(scores)

    def _make_btb_scorer(self, target_times, readings, turbines):
        splits = {}

        def scorer(template_name, config):
            template_splits = splits.get(template_name)
            if template_splits is None:
                template_splits = self._generate_splits(
                    template_name, target_times, readings, turbines)

                splits[template_name] = template_splits

            cv_score = self._cross_validate(template_splits, config)
            if self._is_better(cv_score):
                _config = '\n'.join('    {}: {}'.format(n, v)
                                    for n, v in config.items())
                LOGGER.info(('New configuration found:\n'
                             '    Template: %s\n'
                             '    Hyperparameters:\n'
                             '%s'), template_name, _config)

                self.cv_score = cv_score
                self._set_template(template_name)
                self._set_hyperparameters(config)
                self._build_pipeline()

            return cv_score

        return scorer

    def cross_validate(self, target_times, readings, turbines,
                       template_name=None, hyperparams=None):
        """Compute the cross validation score using the given data.

        If the splits have not been previously computed, compute them now.
        During this computation, the data is partitioned using the indicated
        cross validation parameters and later on processed using the
        pipeline static steps.

        The results of the fit and produce executions are cached and reused
        in subsequent calls to this method.

        Args:
            target_times (pandas.DataFrame):
                ``target_times`` table, containing the ``turbine_id``,
                ``cutoff_time`` and ``target`` columns.
                Only needed if the splits have not been previously computed.
            readings (pandas.DataFrame):
                ``readings`` table. Only needed if the splits have not been
                previously computed.
            turbines (pandas.DataFrame):
                ``turbines`` table. Only needed if the splits have not been
                previously computed.
            template_name (str):
                Name of the template to validate. Defaults to the current
                template.
            hyperparams (dict):
                Hyperparameter values to use.

        Returns:
            float:
                Computed cross validation score. This score is the average
                of the scores obtained across all the cross validation folds.
        """
        if not template_name:
            template_name = self.template_name
            if hyperparams is None:
                hyperparams = self.get_hyperparameters()

        elif hyperparams is None:
            hyperparams = {}

        template_splits = self._generate_splits(
            template_name, target_times, readings, turbines)

        return self._cross_validate(template_splits, hyperparams)

    @classmethod
    def _get_tunables(cls, template_dicts):
        tunables = {}
        for name, template in template_dicts.items():
            pipeline = MLPipeline(template)
            pipeline_tunables = pipeline.get_tunable_hyperparameters(flat=True)
            tunables[name] = Tunable.from_dict(pipeline_tunables)

        return tunables

    def tune(self, target_times, readings, turbines=None):
        """Create a tuning session object that tunes and selects the templates.

        Args:
            target_times (pandas.DataFrame):
                ``target_times`` table, containing the ``turbine_id``,
                ``cutoff_time`` and ``target`` columns.
                Only needed if the splits have not been previously computed.
            readings (pandas.DataFrame):
                ``readings`` table. Only needed if the splits have not been
                previously computed.
            turbines (pandas.DataFrame):
                ``turbines`` table. Only needed if the splits have not been
                previously computed.

        Returns:
            BTBSession
        """
        scoring_function = self._make_btb_scorer(target_times, readings, turbines)
        tunables = self._get_tunables(self._template_dicts)
        return BTBSession(tunables, scoring_function, maximize=not self._cost)

    def fit(self, target_times, readings, turbines=None):
        """Fit this pipeline to the given data.

        Args:
            target_times (pandas.DataFrame):
                ``target_times`` table, containing the ``turbine_id``,
                ``cutoff_time`` and ``target`` columns.
            readings (pandas.DataFrame):
                ``readings`` table.
            turbines (pandas.DataFrame):
                ``turbines`` table.
        """
        X = target_times[['turbine_id', 'cutoff_time']]
        y = target_times['target']
        self._pipeline.fit(X, y, readings=readings, turbines=turbines)
        self.fitted = True

    def predict(self, target_times, readings, turbines=None):
        """Make predictions using this pipeline.

        Args:
            target_times (pandas.DataFrame):
                ``target_times`` table, containing the ``turbine_id`` and
                ``cutoff_time`` columns.
            readings (pandas.DataFrame):
                ``readings`` table.
            turbines (pandas.DataFrame):
                ``turbines`` table.

        Returns:
            numpy.ndarray:
                Vector of predictions.
        """
        if not self.fitted:
            raise NotFittedError()

        X = target_times[['turbine_id', 'cutoff_time']]
        return self._pipeline.predict(X, readings=readings, turbines=turbines)

    def save(self, path):
        """Serialize and save this pipeline using cloudpickle.

        Args:
            path (str):
                Path to the file where the pipeline will be saved.
        """
        with open(path, 'wb') as pickle_file:
            cloudpickle.dump(self, pickle_file)

    @classmethod
    def load(cls, path):
        """Load a previously saved pipeline from a file.

        Args:
            path (str):
                Path to the file where the pipeline is saved.

        Returns:
            GreenGuardPipeline:
                Loaded GreenGuardPipeline instance.
        """
        with open(path, 'rb') as pickle_file:
            return cloudpickle.load(pickle_file)
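An end-to-end sketch of the class above, under the assumption that ``target_times`` and ``readings`` follow the GreenGuard data format; ``'my_template'`` is a hypothetical registered template name:

# Build the pipeline around a single template.
pipeline = GreenGuardPipeline('my_template', metric='accuracy', cv_splits=3)

# Tune: the returned BTBSession scores proposals via cross validation,
# and the best template and hyperparameters are kept on the pipeline.
session = pipeline.tune(target_times, readings)
session.run(10)    # run 10 tuning iterations

# Fit on the full data, predict, and persist with cloudpickle.
pipeline.fit(target_times, readings)
predictions = pipeline.predict(target_times, readings)
pipeline.save('pipeline.pkl')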
def test_SigPro_nested_pipeline():
    """Test nested sigpro primitive."""
    # setup
    aggregations = [{
        'primitive': 'sigpro.SigPro',
        'init_params': {
            'keep_columns': True,
            'input_is_dataframe': False,
            'values_column_name': 'amplitude_values',
            'transformations': [{
                'primitive': 'sigpro.transformations.frequency.band.frequency_band',
                'init_params': {
                    'low': 100,
                    'high': 200
                }
            }],
            'aggregations': [{
                'primitive': 'sigpro.aggregations.amplitude.statistical.mean'
            }]
        }
    }, {
        'primitive': 'sigpro.SigPro',
        'init_params': {
            'input_is_dataframe': False,
            'values_column_name': 'amplitude_values',
            'transformations': [{
                'primitive': 'sigpro.transformations.frequency.band.frequency_band',
                'init_params': {
                    'low': 3000,
                    'high': 4000
                }
            }],
            'aggregations': [{
                'name': 'band_3k_4k_mean',
                'primitive': 'sigpro.aggregations.amplitude.statistical.mean'
            }],
        }
    }, {
        'primitive': 'sigpro.aggregations.amplitude.statistical.mean'
    }]

    pipeline = MLPipeline({
        'primitives': ['sigpro.SigPro'],
        'init_params': {
            'sigpro.SigPro#1': {
                'transformations': [{
                    'primitive': 'sigpro.transformations.frequency.fft.fft_real'
                }],
                'aggregations': aggregations
            }
        }
    })

    data = pd.DataFrame({
        'timestamp': pd.to_datetime(['2020-01-01 00:00:00']),
        'values': [[1, 2, 3, 4, 5, 6]],
        'sampling_frequency': [10000],
        'dummy': [1],
    })

    # run
    output = pipeline.predict(readings=data)
    outputs = dict(zip(pipeline.get_output_names(), output))

    # assert
    expected_features = [
        'fft_real.SigPro.frequency_band.mean.mean_value',
        'fft_real.SigPro.frequency_band.band_3k_4k_mean.mean_value',
        'fft_real.mean.mean_value'
    ]
    assert outputs['feature_columns'] == expected_features

    expected_readings = pd.DataFrame({
        'fft_real.SigPro.frequency_band.mean.mean_value': [float('nan')],
        'fft_real.SigPro.frequency_band.band_3k_4k_mean.mean_value': [-3.0],
        'fft_real.mean.mean_value': [1.0],
    })
    pd.testing.assert_frame_equal(expected_readings, outputs['readings'])
def _clone_pipeline(pipeline):
    return MLPipeline.from_dict(pipeline.to_dict())
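Because the clone is rebuilt from the dict specification, it should carry the same primitives and hyperparameters as the original but none of the fitted state:

clone = _clone_pipeline(pipeline)
assert clone.to_dict() == pipeline.to_dict()    # same specification
# the clone still needs clone.fit(...) before it can predict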