# Imports assumed by the fixtures below (kedro 0.16.x-era APIs):
from kedro.pipeline import Pipeline, node

from kedro_mlflow.pipeline import pipeline_ml_factory


def pipeline_ml_with_inputs_artifacts():
    # remove_stopwords, train_fun and predict_fun are assumed to be defined
    # in the enclosing module.
    full_pipeline = Pipeline(
        [
            node(
                func=remove_stopwords,
                inputs=dict(data="data", stopwords="stopwords_from_nltk"),
                outputs="cleaned_data",
                tags=["training", "inference"],
            ),
            node(
                func=train_fun,
                inputs="cleaned_data",
                outputs="model",
                tags=["training"],
            ),
            node(
                func=predict_fun,
                inputs=["model", "cleaned_data"],
                outputs="predictions",
                tags=["inference"],
            ),
        ]
    )
    pipeline_ml_with_inputs_artifacts = pipeline_ml_factory(
        training=full_pipeline.only_nodes_with_tags("training"),
        inference=full_pipeline.only_nodes_with_tags("inference"),
        input_name="data",
    )
    return pipeline_ml_with_inputs_artifacts
def pipeline_ml_obj():
    def preprocess_fun(data):
        return data

    def fit_fun(data):
        return 2

    def predict_fun(model, data):
        return data * model

    full_pipeline = Pipeline(
        [
            node(
                func=preprocess_fun,
                inputs="raw_data",
                outputs="data",
                tags=["inference", "training"],
            ),
            node(func=fit_fun, inputs="data", outputs="model", tags=["training"]),
            node(
                func=predict_fun,
                inputs=["data", "model"],
                outputs="predictions",
                tags=["inference"],
            ),
        ]
    )
    pipeline_ml_obj = pipeline_ml_factory(
        training=full_pipeline.only_nodes_with_tags("training"),
        inference=full_pipeline.only_nodes_with_tags("inference"),
        input_name="raw_data",
    )
    return pipeline_ml_obj
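# A minimal usage sketch for the fixture above (hypothetical; assumes
# kedro 0.16.x-era io/runner APIs). Since the object returned by
# pipeline_ml_factory behaves like a regular kedro Pipeline made of the
# training nodes, it can be run with any runner:
from kedro.io import DataCatalog, MemoryDataSet
from kedro.runner import SequentialRunner


def run_training_example():
    training_pipeline = pipeline_ml_obj()
    catalog = DataCatalog({"raw_data": MemoryDataSet(data=3)})
    # "model" is the only free output of the training nodes, so this
    # should return {"model": 2} (fit_fun always returns 2).
    return SequentialRunner().run(training_pipeline, catalog)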
def pipeline_ml_with_parameters():
    def remove_stopwords(data, stopwords):
        return data

    def train_fun_hyperparam(data, hyperparam):
        return 2

    def predict_fun(model, data):
        return data * model

    def convert_probs_to_pred(data, threshold):
        return (data > threshold) * 1

    full_pipeline = Pipeline(
        [
            # almost the same as previously, but the stopwords are now parameters:
            # this is a parameter shared between inference and training
            node(
                func=remove_stopwords,
                inputs=dict(data="data", stopwords="params:stopwords"),
                outputs="cleaned_data",
                tags=["training", "inference"],
            ),
            # a parameter used only in the training pipeline; it should not be persisted
            node(
                func=train_fun_hyperparam,
                inputs=["cleaned_data", "params:penalty"],
                outputs="model",
                tags=["training"],
            ),
            node(
                func=predict_fun,
                inputs=["model", "cleaned_data"],
                outputs="predicted_probs",
                tags=["inference"],
            ),
            # this time, there is a parameter used only in the inference pipeline
            node(
                func=convert_probs_to_pred,
                inputs=["predicted_probs", "params:threshold"],
                outputs="predictions",
                tags=["inference"],
            ),
        ]
    )
    pipeline_ml_with_parameters = pipeline_ml_factory(
        training=full_pipeline.only_nodes_with_tags("training"),
        inference=full_pipeline.only_nodes_with_tags("inference"),
        input_name="data",
        log_model_kwargs={
            "conda_env": {"python": "3.7.0", "dependencies": ["kedro==0.16.5"]},
        },
    )
    return pipeline_ml_with_parameters
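# Hypothetical inspection sketch: assuming the PipelineML object exposes the
# inference pipeline through its `inference` attribute (as in kedro-mlflow),
# kedro's Pipeline.inputs() shows which free inputs the inference side needs,
# i.e. the input_name, the model, and the shared/inference-only parameters.
def inspect_inference_inputs():
    p = pipeline_ml_with_parameters()
    # Expected to contain "data", "model", "params:stopwords" and
    # "params:threshold" (but not the training-only "params:penalty").
    return p.inference.inputs()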
# Excerpt of a KedroContext method; assumes `from typing import Iterable`
# and the KedroContextError import from the matching kedro version.
def _filter_pipeline(
    self,
    pipeline: Pipeline,
    tags: Iterable[str] = None,
    from_nodes: Iterable[str] = None,
    to_nodes: Iterable[str] = None,
    node_names: Iterable[str] = None,
    from_inputs: Iterable[str] = None,
) -> Pipeline:
    """Filter the pipeline as the intersection of all conditions."""
    new_pipeline = pipeline
    # We need to intersect with the original pipeline because the order of
    # operations matters, so we don't want to apply the filters incrementally.
    # As an example, with a pipeline of nodes 1, 2, 3, think of "from 1" and
    # "only 1 and 3": applied in sequence, the result is either 1 & 3 or
    # just 1, depending on the order.
    if tags:
        new_pipeline &= pipeline.only_nodes_with_tags(*tags)
        if not new_pipeline.nodes:
            raise KedroContextError(
                "Pipeline contains no nodes with tags: {}".format(str(tags))
            )
    if from_nodes:
        new_pipeline &= pipeline.from_nodes(*from_nodes)
    if to_nodes:
        new_pipeline &= pipeline.to_nodes(*to_nodes)
    if node_names:
        new_pipeline &= pipeline.only_nodes(*node_names)
    if from_inputs:
        new_pipeline &= pipeline.from_inputs(*from_inputs)
    if not new_pipeline.nodes:
        raise KedroContextError("Pipeline contains no nodes")
    return new_pipeline
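# A small sketch of the intersection semantics above (assumes kedro's
# Pipeline supports "&", which the method itself relies on): both filters
# are computed against the *original* pipeline and then intersected, so the
# result does not depend on the order in which they are written.
def _identity(x):
    return x


_linear = Pipeline(
    [
        node(_identity, "a", "b", name="node_1"),
        node(_identity, "b", "c", name="node_2"),
        node(_identity, "c", "d", name="node_3"),
    ]
)
# from_nodes("node_1") -> {node_1, node_2, node_3};
# only_nodes("node_1", "node_3") -> {node_1, node_3};
# their intersection keeps node_1 and node_3 in either order.
_filtered = _linear.from_nodes("node_1") & _linear.only_nodes("node_1", "node_3")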
def pipeline_ml_with_intermediary_artifacts():
    # preprocess_fun, fit_encoder_fun, apply_encoder_fun, train_fun and
    # predict_fun are assumed to be defined in the enclosing module.
    full_pipeline = Pipeline(
        [
            node(
                func=preprocess_fun,
                inputs="raw_data",
                outputs="data",
                tags=["training"],
            ),
            node(
                func=fit_encoder_fun,
                inputs="data",
                outputs="encoder",
                tags=["training"],
            ),
            node(
                func=apply_encoder_fun,
                inputs=["encoder", "data"],
                outputs="encoded_data",
                tags=["training", "inference"],
            ),
            node(
                func=train_fun,
                inputs="encoded_data",
                outputs="model",
                tags=["training"],
            ),
            node(
                func=predict_fun,
                inputs=["model", "encoded_data"],
                outputs="predictions",
                tags=["inference"],
            ),
        ]
    )
    pipeline_ml_with_intermediary_artifacts = pipeline_ml_factory(
        training=full_pipeline.only_nodes_with_tags("training"),
        inference=full_pipeline.only_nodes_with_tags("inference"),
        input_name="data",
    )
    return pipeline_ml_with_intermediary_artifacts
def pipeline_ml_with_parameters():
    # Variant of the fixture above, without the local helper definitions or
    # log_model_kwargs; remove_stopwords, train_fun_hyperparam, predict_fun
    # and convert_probs_to_pred are assumed to be defined in the enclosing
    # module.
    full_pipeline = Pipeline(
        [
            # almost the same as previously, but the stopwords are now parameters:
            # this is a parameter shared between inference and training
            node(
                func=remove_stopwords,
                inputs=dict(data="data", stopwords="params:stopwords"),
                outputs="cleaned_data",
                tags=["training", "inference"],
            ),
            # a parameter used only in the training pipeline; it should not be persisted
            node(
                func=train_fun_hyperparam,
                inputs=["cleaned_data", "params:penalty"],
                outputs="model",
                tags=["training"],
            ),
            node(
                func=predict_fun,
                inputs=["model", "cleaned_data"],
                outputs="predicted_probs",
                tags=["inference"],
            ),
            # this time, there is a parameter used only in the inference pipeline
            node(
                func=convert_probs_to_pred,
                inputs=["predicted_probs", "params:threshold"],
                outputs="predictions",
                tags=["inference"],
            ),
        ]
    )
    pipeline_ml_with_parameters = pipeline_ml_factory(
        training=full_pipeline.only_nodes_with_tags("training"),
        inference=full_pipeline.only_nodes_with_tags("inference"),
        input_name="data",
    )
    return pipeline_ml_with_parameters
def create_pipelines(*tags: str):
    # sum_dfs and identity are assumed to be defined in the enclosing module.
    example_pipeline = Pipeline(
        [
            node(
                lambda x: x,
                "A",
                "B",
                name="node_1",
                tags=[
                    "apple",
                    "orange",
                    "banana",
                    "lemon",
                    "grape",
                    "coconut",
                    "fresh strawberries!",
                ],
            ),
            node(
                sum_dfs,
                ["B", "C"],
                "D",
                name="node_2",
                tags=["apple", "orange", "lemon"],
            ),
            node(
                identity,
                "D",
                "E",
                name="node_3",
                tags=["apple", "orange", "banana", "cherry"],
            ),
            node(identity, "D", "F", name="node_4", tags=["apple", "cherry"]),
        ]
    )

    if tags:
        pipeline = Pipeline([])
        for tag in tags:
            pipeline += example_pipeline.only_nodes_with_tags(tag)
        if not pipeline.nodes:
            raise ValueError(
                "No nodes found with any of the following "
                "tags attached: {}".format(", ".join(tags))
            )
    else:
        pipeline = example_pipeline

    return {"__default__": pipeline}
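# Usage sketch for the factory above: selecting by tags unions the matching
# nodes (kedro's Pipeline "+" deduplicates shared nodes), and a tag set that
# matches nothing raises the ValueError defined above.
pipelines = create_pipelines("orange", "cherry")
default = pipelines["__default__"]
# "orange" matches node_1..node_3 and "cherry" matches node_3/node_4,
# so the union should contain all four nodes:
node_names = {n.name for n in default.nodes}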