import copy
from typing import AbstractSet, Dict, List, Union

from kedro.pipeline import Pipeline, node
from kedro.pipeline.node import Node

# Assumed to be available from the surrounding project/Kedro source (their
# exact import paths vary by Kedro version): the project's node functions
# (fit_pca, fit_tsne, plot_pca, plot_tsne, extract, scale, separate, split)
# and Kedro's modular-pipeline internals (ModularPipelineError, _is_parameter,
# _strip_transcoding, _transcode_split, TRANSCODING_SEPARATOR).


def data_science_pipeline() -> Pipeline:
    """Create the data science pipeline."""
    return Pipeline(
        nodes=[
            node(
                func=fit_pca,
                inputs={
                    "x": "primary_classified_x",
                    "kwargs": "params:fit_pca",
                },
                outputs={
                    "x": "model_output_pca_x",
                    "variance": "model_output_pca_variance",
                },
                name="fit-pca",
                tags="pca",
            ),
            node(
                func=fit_tsne,
                inputs={
                    "x": "primary_classified_x",
                    "kwargs": "params:fit_tsne",
                },
                outputs="model_output_tsne_x",
                name="fit-tsne",
                tags="tsne",
            ),
        ]
    )
def data_visualization_pipeline() -> Pipeline:
    """Create the data visualization pipeline."""
    return Pipeline(
        nodes=[
            node(
                func=plot_pca,
                inputs={
                    "x": "model_output_pca_x",
                    "y": "primary_classified_y",
                    "variance": "model_output_pca_variance",
                    "metadata": "params:metadata",
                    "kwargs": "params:plot_pca",
                },
                outputs="reporting_pca",
                name="plot-pca",
                tags="pca",
            ),
            node(
                func=plot_tsne,
                inputs={
                    "x": "model_output_tsne_x",
                    "y": "primary_classified_y",
                    "metadata": "params:metadata",
                    "kwargs": "params:plot_tsne",
                },
                outputs="reporting_tsne",
                name="plot-tsne",
                tags="tsne",
            ),
        ]
    )
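# The shared tags above make partial runs possible: with a recent Kedro CLI
# this would look roughly like `kedro run --tags=pca` to execute only the
# PCA-related nodes (the exact flag name varies between Kedro versions, so
# treat this invocation as an assumption).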
def _validate_datasets_exist(
    inputs: AbstractSet[str],
    outputs: AbstractSet[str],
    parameters: AbstractSet[str],
    pipe: Pipeline,
) -> None:
    """Check that requested inputs/outputs/parameters exist in the pipeline."""
    inputs = {_strip_transcoding(k) for k in inputs}
    outputs = {_strip_transcoding(k) for k in outputs}

    existing = {_strip_transcoding(ds) for ds in pipe.data_sets()}
    non_existent = (inputs | outputs | parameters) - existing
    if non_existent:
        raise ModularPipelineError(
            f"Failed to map datasets and/or parameters: "
            f"{', '.join(sorted(non_existent))}"
        )
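# A hypothetical illustration of the check above: mapping a dataset name
# that does not appear anywhere in the wrapped pipeline fails fast.
#
#   pipeline(data_science_pipeline(), inputs={"no_such_dataset": "x"})
#   # ModularPipelineError: Failed to map datasets and/or parameters:
#   # no_such_dataset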
def _validate_inputs_outputs(
    inputs: AbstractSet[str], outputs: AbstractSet[str], pipe: Pipeline
) -> None:
    """Safeguards to ensure that:
    - parameters are not specified under inputs
    - inputs are only free inputs
    - outputs do not contain free inputs
    """
    inputs = {_strip_transcoding(k) for k in inputs}
    outputs = {_strip_transcoding(k) for k in outputs}

    if any(_is_parameter(i) for i in inputs):
        raise ModularPipelineError(
            "Parameters should be specified in the `parameters` argument"
        )

    free_inputs = {_strip_transcoding(i) for i in pipe.inputs()}

    if not inputs <= free_inputs:
        raise ModularPipelineError("Inputs should be free inputs to the pipeline")

    if outputs & free_inputs:
        raise ModularPipelineError("Outputs can't contain free inputs to the pipeline")
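# Likewise (a hypothetical call), exposing one of the pipeline's free inputs
# under `outputs` is rejected by the last safeguard above:
#
#   pipeline(data_science_pipeline(), outputs={"primary_classified_x": "x"})
#   # ModularPipelineError: Outputs can't contain free inputs to the pipeline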
def pipeline(
    pipe: Pipeline,
    *,
    inputs: Dict[str, str] = None,
    outputs: Dict[str, str] = None,
    parameters: Dict[str, str] = None,
    namespace: str = None,
) -> Pipeline:
    """Create a copy of the pipeline and its nodes, with some dataset names
    and node names modified.

    Args:
        pipe: Original modular pipeline to integrate.
        inputs: A map of the existing input name to the new one. Must only
            refer to the pipeline's free inputs.
        outputs: A map of the existing output name to the new one. Can refer
            to both the pipeline's free outputs and to intermediate results
            that need to be exposed.
        parameters: A map of an existing parameter name to the new one.
        namespace: A prefix to give to all dataset names, except those
            explicitly named with the `inputs`/`outputs` arguments, and
            parameter references (`params:` and `parameters`).

    Raises:
        ModularPipelineError: When inputs, outputs or parameters are
            incorrectly specified, or they do not exist on the original
            pipeline.
        ValueError: When underlying pipeline nodes inputs/outputs are not
            any of the expected types (str, dict, list, or None).

    Returns:
        A new ``Pipeline`` object with the new nodes, modified as requested.
    """
    # pylint: disable=protected-access
    inputs = copy.deepcopy(inputs) or {}
    outputs = copy.deepcopy(outputs) or {}
    parameters = copy.deepcopy(parameters) or {}

    _validate_datasets_exist(inputs.keys(), outputs.keys(), parameters.keys(), pipe)
    _validate_inputs_outputs(inputs.keys(), outputs.keys(), pipe)

    mapping = {**inputs, **outputs, **parameters}

    def _prefix(name: str) -> str:
        return f"{namespace}.{name}" if namespace else name

    def _is_transcode_base_in_mapping(name: str) -> bool:
        base_name, _ = _transcode_split(name)
        return base_name in mapping

    def _map_transcode_base(name: str) -> str:
        base_name, transcode_suffix = _transcode_split(name)
        return TRANSCODING_SEPARATOR.join((mapping[base_name], transcode_suffix))

    def _rename(name: str) -> str:
        rules = [
            # if the name is mapped to a new name, update it with the new name
            (lambda n: n in mapping, lambda n: mapping[n]),
            # if it's a parameter, leave it as is (don't namespace)
            (_is_parameter, lambda n: n),
            # if the transcode base is mapped to a new name, update the base
            (_is_transcode_base_in_mapping, _map_transcode_base),
            # if a namespace is given, prefix the name with it
            (lambda n: bool(namespace), _prefix),
        ]
        for predicate, processor in rules:
            if predicate(name):
                return processor(name)
        # leave the name as is
        return name

    def _process_dataset_names(
        datasets: Union[None, str, List[str], Dict[str, str]]
    ) -> Union[None, str, List[str], Dict[str, str]]:
        if datasets is None:
            return None
        if isinstance(datasets, str):
            return _rename(datasets)
        if isinstance(datasets, list):
            return [_rename(name) for name in datasets]
        if isinstance(datasets, dict):
            return {key: _rename(value) for key, value in datasets.items()}

        raise ValueError(  # pragma: no cover
            f"Unexpected input {datasets} of type {type(datasets)}"
        )

    def _copy_node(node: Node) -> Node:
        new_namespace = node.namespace
        if namespace:
            new_namespace = (
                f"{namespace}.{node.namespace}" if node.namespace else namespace
            )
        return node._copy(
            inputs=_process_dataset_names(node._inputs),
            outputs=_process_dataset_names(node._outputs),
            namespace=new_namespace,
        )

    new_nodes = [_copy_node(n) for n in pipe.nodes]

    return Pipeline(new_nodes)
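# A minimal usage sketch of the helper above, reusing the pipelines defined
# earlier in this listing; the dataset name "features" and the namespace
# "dim_reduction" are hypothetical. Every dataset except the remapped free
# input and the "params:" references gets prefixed, e.g.
# "model_output_pca_x" becomes "dim_reduction.model_output_pca_x".
namespaced_ds = pipeline(
    pipe=data_science_pipeline(),
    inputs={"primary_classified_x": "features"},
    namespace="dim_reduction",
)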
def data_engineering_pipeline() -> Pipeline:
    """Create the data engineering pipeline."""
    return Pipeline(
        nodes=[
            node(
                func=extract,
                inputs="raw_matlab_image",
                outputs="intermediate_image",
                name="extract-image",
                tags=["pca", "tsne", "tcn"],
            ),
            node(
                func=extract,
                inputs="raw_matlab_ground_truth",
                outputs="intermediate_ground_truth",
                name="extract-ground-truth",
                tags=["pca", "tsne", "tcn"],
            ),
            node(
                func=scale,
                inputs={
                    "image": "intermediate_image",
                    "kwargs": "params:scale",
                },
                outputs="scale_image",
                name="scale-image",
                tags=["pca", "tsne", "tcn"],
            ),
            node(
                func=separate,
                inputs={
                    "image": "scale_image",
                    "ground_truth": "intermediate_ground_truth",
                },
                outputs={
                    "classified_x": "primary_classified_x",
                    "unclassified_x": "primary_unclassified_x",
                    "classified_y": "primary_classified_y",
                    "unclassified_y": "primary_unclassified_y",
                },
                name="separate-classified-and-unclassified-samples",
                tags=["pca", "tsne", "tcn"],
            ),
            node(
                func=split,
                inputs={
                    "x": "primary_classified_x",
                    "y": "primary_classified_y",
                    "kwargs": "params:split",
                },
                outputs={
                    "x_train": "model_input_classified_x_train",
                    "x_test": "model_input_classified_x_test",
                    "x_valid": "model_input_classified_x_valid",
                    "y_train": "model_input_classified_y_train",
                    "y_test": "model_input_classified_y_test",
                    "y_valid": "model_input_classified_y_valid",
                },
                name="split-dataset",
                tags="tcn",
            ),
        ]
    )
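# A hedged sketch of how the three pipelines above might be wired into a
# Kedro project's pipeline registry. "__default__" is the conventional key
# for the pipeline run by a bare `kedro run`; the registry keys and the exact
# hook shape depend on the project template and Kedro version in use.
def register_pipelines() -> Dict[str, Pipeline]:
    data_engineering = data_engineering_pipeline()
    data_science = data_science_pipeline()
    data_visualization = data_visualization_pipeline()
    return {
        "de": data_engineering,
        "ds": data_science,
        "viz": data_visualization,
        "__default__": data_engineering + data_science + data_visualization,
    }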